blob: d08a7dc48c3f53e7863cb6e1a45fdc5de8cb5d0c [file] [log] [blame]
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: mdsteele@google.com (Matthew D. Steele)
#include "net/instaweb/rewriter/public/js_inline_filter.h"
#include "base/logging.h"
#include "net/instaweb/rewriter/public/inline_rewrite_context.h"
#include "net/instaweb/rewriter/public/javascript_code_block.h"
#include "net/instaweb/rewriter/public/resource.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/script_tag_scanner.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/util/gzip_inflater.h"
#include "pagespeed/kernel/util/re2.h"
namespace net_instaweb {
const char JsInlineFilter::kNumJsInlined[] = "num_js_inlined";
class JsInlineFilter::Context : public InlineRewriteContext {
public:
Context(JsInlineFilter* filter, HtmlElement* element,
HtmlElement::Attribute* src)
: InlineRewriteContext(filter, element, src), filter_(filter) {}
virtual bool ShouldInline(const ResourcePtr& resource,
GoogleString* reason) const {
return filter_->ShouldInline(resource, reason);
}
virtual void RenderInline(
const ResourcePtr& resource, const StringPiece& text,
HtmlElement* element) {
filter_->RenderInline(resource, text, element);
}
virtual const char* id() const { return RewriteOptions::kJavascriptInlineId; }
private:
JsInlineFilter* filter_;
DISALLOW_COPY_AND_ASSIGN(Context);
};
JsInlineFilter::JsInlineFilter(RewriteDriver* driver)
: CommonFilter(driver),
size_threshold_bytes_(driver->options()->js_inline_max_bytes()),
script_tag_scanner_(driver),
should_inline_(false) {
Statistics* stats = server_context()->statistics();
num_js_inlined_ = stats->GetVariable(kNumJsInlined);
}
JsInlineFilter::~JsInlineFilter() {}
void JsInlineFilter::InitStats(Statistics* statistics) {
statistics->AddVariable(kNumJsInlined);
}
void JsInlineFilter::StartDocumentImpl() {
should_inline_ = false;
}
void JsInlineFilter::EndDocument() {
}
void JsInlineFilter::StartElementImpl(HtmlElement* element) {
DCHECK(!should_inline_);
HtmlElement::Attribute* src;
if (script_tag_scanner_.ParseScriptElement(element, &src) ==
ScriptTagScanner::kJavaScript) {
should_inline_ = (src != NULL) && (src->DecodedValueOrNull() != NULL);
}
}
void JsInlineFilter::EndElementImpl(HtmlElement* element) {
if (should_inline_ && driver()->IsRewritable(element)) {
DCHECK(element->keyword() == HtmlName::kScript);
HtmlElement::Attribute* attr = element->FindAttribute(HtmlName::kSrc);
CHECK(attr != NULL);
const char* src = attr->DecodedValueOrNull();
DCHECK(src != NULL) << "should_inline_ should be false if attr val is null";
// StartInlining() transfers ownership of ctx to RewriteDriver, or deletes
// it on failure.
// TODO(morlovich): Consider async/defer here; it may not be a good
// idea to inline async scripts in particular.
Context* ctx = new Context(this, element, attr);
ctx->StartInlining();
}
should_inline_ = false;
}
bool JsInlineFilter::ShouldInline(const ResourcePtr& resource,
GoogleString* reason) const {
// Don't inline if it's too big.
StringPiece contents(resource->ExtractUncompressedContents());
if (contents.size() > size_threshold_bytes_) {
*reason = StrCat("JS not inlined since it's bigger than ",
Integer64ToString(size_threshold_bytes_),
" bytes");
return false;
}
// Or if it looks like it's gzip encoded.
if (GzipInflater::HasGzipMagicBytes(contents)) {
*reason = "JS not inlined because it appears to be gzip-encoded";
return false;
}
// Or if it looks like it's trying to get at its own url.
if (driver()->options()->avoid_renaming_introspective_javascript() &&
JavascriptCodeBlock::UnsafeToRename(contents)) {
*reason = "JS not inlined since it may be looking for its source";
return false;
}
return true;
}
void JsInlineFilter::RenderInline(
const ResourcePtr& resource, const StringPiece& contents,
HtmlElement* element) {
// If it contains '</script' we need to escape. The standard way to do this
// is to replace </script with <\/script, but escaping / with \ is only valid
// inside strings, and the following is legal javascript:
//
// pathological.js:
// if(2</script>/) {
// alert("foo");
// } else {
// alert("bar");
// }
//
// This checks whether 2 is less than the regexp "/script>/". While I would
// be fine just abandoning this as too unlikely to worry about, we can
// actually support this by encoding 's' as \x73 and using <\x73cript instead.
// The html parser won't read that as </script> but the js parser will.
//
// Unfortunately escaping </script> can expose a different bug where browsers
// treat <script> specially inside inline scripts after <!--. So if we
// currently have:
//
// nested.js:
// <!--
// document.write("<script>...</script>");
//
// and we inline it as:
//
// <script><!--
// document.write("<script>...</\x73cript>");
// </script>
//
// then the browser will treat the </script> tag as closing the <script>
// that's inside the document.write, and will continue parsing the rest of the
// document as javascript. We were already open to this bug with code that
// included <script> without </script> but that's probably less common. So we
// should escape <script> too.
//
// Because there are legitimate uses of "<script" where it's part of an
// identifier we can't use the shorter \xNN notation but need \uNNNN notation
// instead. I don't know why they decided \uNNNN would be good for both
// strings and identifiers but \xNN would be good only for strings, but that's
// the way it is. For clarity (and gzip?) we'll just use \uNNNN everywhere.
GoogleString contents_for_escaping;
StringPiece escaped_contents;
// First quickly scan to see if there's anything we need to fix.
if (FindIgnoreCase(contents, "<script") != StringPiece::npos ||
FindIgnoreCase(contents, "</script") != StringPiece::npos) {
contents.CopyToString(&contents_for_escaping);
// To keep the case of the original 'script' text we need to run twice, once
// for 's' and once for 'S'.
RE2::GlobalReplace(&contents_for_escaping,
"<(/?)s([cC][rR][iI][pP][tT])",
"<\\1\\\\u0073\\2");
RE2::GlobalReplace(&contents_for_escaping,
"<(/?)S([cC][rR][iI][pP][tT])",
"<\\1\\\\u0053\\2");
escaped_contents = contents_for_escaping;
} else {
escaped_contents = contents;
}
// If we're in XHTML, we should wrap the script in a <!CDATA[...]]>
// block to ensure that we don't break well-formedness. Since XHTML is
// sometimes interpreted as HTML (which will ignore CDATA delimiters),
// we have to hide the CDATA delimiters behind Javascript comments.
// See http://lachy.id.au/log/2006/11/xhtml-script
// and http://code.google.com/p/modpagespeed/issues/detail?id=125
if (driver()->MimeTypeXhtmlStatus() != RewriteDriver::kIsNotXhtml) {
// CDATA sections cannot be nested because they end with the first
// occurrence of "]]>", so if the script contains that string
// anywhere (and we're in XHTML) we can't inline.
// TODO(mdsteele): We should consider escaping somehow.
if (escaped_contents.find("]]>") == StringPiece::npos) {
HtmlCharactersNode* node =
driver()->NewCharactersNode(element, "//<![CDATA[\n");
node->Append(escaped_contents);
node->Append("\n//]]>");
driver()->AppendChild(element, node);
element->DeleteAttribute(HtmlName::kSrc);
}
} else {
// If we're not in XHTML, we can simply paste in the external script
// verbatim.
driver()->AppendChild(
element, driver()->NewCharactersNode(element, escaped_contents));
element->DeleteAttribute(HtmlName::kSrc);
}
num_js_inlined_->Add(1);
}
void JsInlineFilter::Characters(HtmlCharactersNode* characters) {
if (should_inline_) {
HtmlElement* script_element = characters->parent();
DCHECK(script_element != NULL);
DCHECK_EQ(HtmlName::kScript, script_element->keyword());
if (driver()->IsRewritable(script_element) &&
OnlyWhitespace(characters->contents())) {
// If it's just whitespace inside the script tag, it's (probably) safe to
// just remove it.
driver()->DeleteNode(characters);
} else {
// This script tag isn't empty, despite having a src field. The contents
// won't be executed by the browser, but will still be in the DOM; some
// external scripts like to use this as a place to store data. So, we'd
// better not try to inline in this case.
should_inline_ = false;
}
}
}
} // namespace net_instaweb