blob: 91f8ee7ca684d80097f6be18f86f0168efb40571 [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: slamm@google.com (Stephen Lamm)
// The Google Analytics rewriter works by scanning all the <script> elements.
// The rewriter follows these steps:
// 1. Match fixed strings that represent a synchronous load
// o Can be either document.write or <script src=...>
// 2. Match calls to _gat._getTracker
// o Only done if step #1 succeeds.
// 3. Match any methods that the rewriter cannot handle such as the
// Google Analytics methods that return values.
// o Only done if step #2 succeeds.
// o If any unhandled methods are found, the rewriter resets to the
// first step.
// 4. At the end of the document, perform the rewrite if steps #1 and #2
// succeeded and the matched script elements are editable (i.e. in
// the current buffer).
//
#include "net/instaweb/rewriter/public/google_analytics_filter.h"
#include <memory>
#include <vector>
#include "base/logging.h"
#include "net/instaweb/rewriter/google_analytics_snippet.h"
#include "pagespeed/kernel/base/scoped_ptr.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/stl_util.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/html/html_parse.h"
namespace net_instaweb {
// Suffix identifying the Google Analytics ga.js URL. It is searched for inside
// inline script text and compared against the end of a <script src=...> value.
const char kGaJsUrlSuffix[] = "google-analytics.com/ga.js";
// Start of the document.write call of the synchronous snippet; searched for
// in a short window just before the ga.js URL.
const char kGaJsDocumentWriteStart[] = "document.write(";
// Tail of that document.write call; searched for after the ga.js URL.
const char kGaJsDocumentWriteEnd[] = "%3C/script%3E\"));";
// Tracker-creation calls recognized as the synchronous initialization.
const char kGaJsGetTracker[] = "_gat._getTracker(";
const char kGaJsCreateTracker[] = "_gat._createTracker(";
// Names of the statistics variables used by this filter (see InitStats()).
const char GoogleAnalyticsFilter::kPageLoadCount[] =
"google_analytics_page_load_count";
const char GoogleAnalyticsFilter::kRewrittenCount[] =
"google_analytics_rewritten_count";
// Records one pending edit to a script.
//   script_element:  the <script> element the edit applies to.
//   characters_node: the characters node holding the script text (may be NULL
//                    for a <script src=...> load, which carries no text).
//   pos, len:        the span of the script text to be replaced.  A pos of
//                    GoogleString::npos means NewContents() yields only the
//                    replacement text.
//   editor_type:     what kind of match this edit represents.
ScriptEditor::ScriptEditor(HtmlElement* script_element,
HtmlCharactersNode *characters_node,
GoogleString::size_type pos,
GoogleString::size_type len,
Type editor_type)
: script_element_(script_element),
script_characters_node_(characters_node),
pos_(pos),
len_(len),
editor_type_(editor_type) {}
void ScriptEditor::NewContents(const StringPiece& replacement,
GoogleString* contents) const {
if (pos_ == GoogleString::npos) {
replacement.CopyToString(contents);
} else {
StringPiece old_contents = script_characters_node_->contents();
contents->clear();
contents->append(old_contents.data(), pos_);
contents->append(replacement.data(), replacement.size());
StringPiece suffix = old_contents.substr(pos_ + len_,
old_contents.size() - pos_ - len_);
contents->append(suffix.data(), suffix.size());
}
}
// Constructs a filter with the built-in method lists.
//
// |stats| is dereferenced unconditionally, so it must be non-NULL and must
// already have the variables added by InitStats().
//
// The method lists were created by parsing ga.js and finding the method names
// in the documentation:
//   - glue methods are forwarded to the async interface;
//   - unhandled methods (for example those that return values) cannot be
//     forwarded, so finding one of them cancels the rewrite.
GoogleAnalyticsFilter::GoogleAnalyticsFilter(
    HtmlParse* html_parse, Statistics* stats)
    : glue_methods_(new MethodVector),
      unhandled_methods_(new MethodVector),
      html_parse_(html_parse),
      script_element_(NULL),
      script_characters_node_(NULL),
      page_load_count_(stats->GetVariable(kPageLoadCount)),
      rewritten_count_(stats->GetVariable(kRewrittenCount)) {
  // Methods that need to be forwarded to the async interface.
  static const char* const kGlueMethodNames[] = {
    "_trackPageview",
    "_trackEvent",
    "_trackTrans",
    "_addIgnoredOrganic",
    "_addIgnoredRef",
    "_addItem",
    "_addOrganic",
    "_addTrans",
    "_clearIgnoredOrganic",
    "_clearIgnoredRef",
    "_clearOrganic",
    "_clearXKey",
    "_clearXValue",
    "_cookiePathCopy",
    "_deleteCustomVar",
    "_link",
    "_linkByPost",
    "_sendXEvent",
    "_setAllowAnchor",
    "_setAllowHash",
    "_setAllowLinker",
    "_setAutoTrackOutbound",
    "_setCampCIdKey",
    "_setCampContentKey",
    "_setCampIdKey",
    "_setCampMediumKey",
    "_setCampNOKey",
    "_setCampNameKey",
    "_setCampSourceKey",
    "_setCampTermKey",
    "_setCampaignCookieTimeout",
    "_setCampaignTrack",
    "_setClientInfo",
    "_setCookiePath",
    "_setCookiePersistence",
    "_setCookieTimeout",
    "_setCustomVar",
    "_setDetectFlash",
    "_setDetectTitle",
    "_setDomainName",
    "_setHrefExamineLimit",
    "_setLocalGifPath",
    "_setLocalRemoteServerMode",
    "_setLocalServerMode",
    "_setMaxCustomVariables",
    "_setNamespace",
    "_setReferrerOverride",
    "_setRemoteServerMode",
    "_setSampleRate",
    "_setSessionCookieTimeout",
    "_setSessionTimeout",
    "_setTrackOutboundSubdomains",
    "_setTrans",
    "_setTransactionDelim",
    "_setVar",
    "_setVisitorCookieTimeout",
    "_setXKey",
    "_setXValue",
  };

  // Methods the rewriter cannot handle.
  static const char* const kUnhandledMethodNames[] = {
    "_anonymizeIp",
    "_createEventTracker",  // getter method
    "_createXObj",  // getter method
    "_require",
    "_visitCode",  // getter method
    "_get",
    "_getAccount",
    "_getClientInfo",
    "_getDetectFlash",
    "_getDetectTitle",
    "_getLinkerUrl",
    "_getLocalGifPath",
    "_getName",
    "_getServiceMode",
    "_getTrackerByName",
    "_getVersion",
    "_getVisitorCustomVar",
    "_getXKey",
    "_getXValue",
    "_setAccount",  // async only
  };

  const int glue_count =
      sizeof(kGlueMethodNames) / sizeof(kGlueMethodNames[0]);
  for (int i = 0; i < glue_count; ++i) {
    glue_methods_->push_back(kGlueMethodNames[i]);
  }

  const int unhandled_count =
      sizeof(kUnhandledMethodNames) / sizeof(kUnhandledMethodNames[0]);
  for (int i = 0; i < unhandled_count; ++i) {
    unhandled_methods_->push_back(kUnhandledMethodNames[i]);
  }
}
// Constructs a filter that uses caller-supplied method lists instead of the
// built-in ones.
//   glue_methods:      methods to forward to the async interface.
//   unhandled_methods: methods whose presence cancels the rewrite.
//   stats:             may be NULL, in which case both counters are left NULL.
// NOTE(review): the lists are stored in the same members that the other
// constructor fills with `new MethodVector`, so this presumably takes
// ownership of both pointers -- confirm against the header.
GoogleAnalyticsFilter::GoogleAnalyticsFilter(
HtmlParse* html_parse, Statistics* stats,
MethodVector* glue_methods, MethodVector* unhandled_methods)
: glue_methods_(glue_methods),
unhandled_methods_(unhandled_methods),
html_parse_(html_parse),
script_element_(NULL),
script_characters_node_(NULL),
page_load_count_((stats == NULL) ? NULL :
stats->GetVariable(kPageLoadCount)),
rewritten_count_((stats == NULL) ? NULL :
stats->GetVariable(kRewrittenCount))
{ }
// Frees any ScriptEditor objects still held in script_editors_.  They are
// heap-allocated by FindRewritableScripts() and normally released by
// ResetFilter(); without this, destroying the filter after a match was
// recorded but before EndDocument() ran would leak them.  After a normal
// EndDocument() the container is already empty and this is a no-op.
GoogleAnalyticsFilter::~GoogleAnalyticsFilter() {
  STLDeleteContainerPointers(script_editors_.begin(),
                             script_editors_.end());
  script_editors_.clear();
}
// Registers the two statistics variables this filter looks up in its
// constructors: the page-load counter and the rewritten counter.
void GoogleAnalyticsFilter::InitStats(Statistics* statistics) {
statistics->AddVariable(kPageLoadCount);
statistics->AddVariable(kRewrittenCount);
}
void GoogleAnalyticsFilter::StartDocument() {
ResetFilter();
page_load_count_->Add(1);
}
void GoogleAnalyticsFilter::EndDocument() {
if (is_load_found_) {
if (is_init_found_) {
if (RewriteAsAsync()) {
rewritten_count_->Add(1);
html_parse_->InfoHere("Google Analytics rewritten: SUCCESS!");
} else {
html_parse_->InfoHere("Google Analytics not rewritten: rewrite failed");
}
} else {
html_parse_->InfoHere(
"Google Analytics not rewritten: only found ga.js load");
}
}
ResetFilter();
}
// Handles an opening tag.  A tag opening while a script is still being
// tracked is unexpected and discards all matches found so far.  An opening
// <script> tag (re)starts script tracking.
void GoogleAnalyticsFilter::StartElement(HtmlElement* element) {
  const bool inside_script = (script_element_ != NULL);
  if (inside_script) {
    // No tags allowed inside script element.
    html_parse_->ErrorHere("Google Analytics reset: Tag '%s' found inside "
                           "script.", CEscape(element->name_str()).c_str());
    ResetFilter();
  }

  if (element->keyword() != HtmlName::kScript) {
    return;
  }
  script_element_ = element;
}
// Handles a closing tag.  Ignored unless a script is being tracked.  When the
// tracked script itself closes, its contents are examined for Google
// Analytics code and tracking stops; any other closing tag discards all
// matches found so far.
void GoogleAnalyticsFilter::EndElement(HtmlElement* element) {
  if (script_element_ == NULL) {
    return;
  }

  if (element == script_element_) {
    FindRewritableScripts();
    script_element_ = NULL;
    script_characters_node_ = NULL;
    return;
  }

  html_parse_->ErrorHere("Google Analytics reset: Unexpected tag '%s' "
                         "inside a script.",
                         CEscape(element->name_str()).c_str());
  ResetFilter();
}
// Handles a flush.  A flush that arrives while a script is being tracked
// discards all matches found so far; otherwise nothing happens.
void GoogleAnalyticsFilter::Flush() {
  if (script_element_ == NULL) {
    return;
  }
  html_parse_->InfoHere("Google Analytics reset: flush in a script.");
  ResetFilter();
}
// Handles a characters node.  Inside a tracked script, the first characters
// node is remembered as the script's text; a second one is unexpected and
// discards all matches found so far.  Outside a script it is ignored.
void GoogleAnalyticsFilter::Characters(HtmlCharactersNode* characters_node) {
  if (script_element_ == NULL) {
    return;
  }

  if (script_characters_node_ != NULL) {
    html_parse_->ErrorHere("Google Analytics reset: multiple character "
                           "nodes in script.");
    ResetFilter();
    return;
  }

  script_characters_node_ = characters_node;
}
// Handles a comment node.  A comment inside a tracked script discards all
// matches found so far; elsewhere it is ignored.
void GoogleAnalyticsFilter::Comment(HtmlCommentNode* comment) {
  if (script_element_ == NULL) {
    return;
  }
  html_parse_->InfoHere("Google Analytics reset: comment found inside "
                        "script.");
  ResetFilter();
}
// Handles a CDATA node.  CDATA inside a tracked script discards all matches
// found so far; elsewhere it is ignored.
void GoogleAnalyticsFilter::Cdata(HtmlCdataNode* cdata) {
  if (script_element_ == NULL) {
    return;
  }
  html_parse_->InfoHere("Google Analytics reset: CDATA found inside script.");
  ResetFilter();
}
// Handles an IE directive node.  A directive inside a tracked script discards
// all matches found so far; elsewhere it is ignored.
void GoogleAnalyticsFilter::IEDirective(HtmlIEDirectiveNode* directive) {
  if (script_element_ == NULL) {
    return;
  }
  html_parse_->ErrorHere("Google Analytics reset: IE Directive found "
                         "inside script.");
  ResetFilter();
}
// Returns the filter to its initial matching state: frees and forgets every
// recorded script edit, clears the load/init flags, and stops tracking any
// current script.
void GoogleAnalyticsFilter::ResetFilter() {
  // The editors are owned through raw pointers, so delete them before
  // emptying the container.
  STLDeleteContainerPointers(script_editors_.begin(),
                             script_editors_.end());
  script_editors_.clear();

  is_load_found_ = false;
  is_init_found_ = false;

  script_characters_node_ = NULL;
  script_element_ = NULL;
}
// Looks for a synchronous ga.js load written with document.write inside
// |contents|.
//
// A match needs, in order: a "document.write(" that starts within a short
// window before the ga.js URL, the ga.js URL itself, and the closing
// "%3C/script%3E\"));" after the URL.
//
// On success returns true and sets |*pos| / |*len| to the span running from
// the start of "document.write(" to the end of the closing marker.  On
// failure returns false and leaves |*pos| and |*len| untouched.
bool GoogleAnalyticsFilter::MatchSyncLoad(StringPiece contents,
                                          GoogleString::size_type* pos,
                                          GoogleString::size_type* len) const {
  const GoogleString::size_type url_pos = contents.find(kGaJsUrlSuffix);
  if (url_pos == GoogleString::npos) {
    return false;
  }

  // In the common case, document.write is 56 characters before the url.
  // Allow a little extra wiggle room (e.g. for different formatting), but
  // not so much that an unrelated document.write is found.
  const GoogleString::size_type kMaxDistance = 80;
  const GoogleString::size_type window_begin =
      (url_pos < kMaxDistance) ? 0 : url_pos - kMaxDistance;
  const StringPiece window(contents.data() + window_begin,
                           url_pos - window_begin);

  const GoogleString::size_type write_in_window =
      window.find(kGaJsDocumentWriteStart);
  if (write_in_window == GoogleString::npos) {
    html_parse_->InfoHere("Found ga.js without a matching document.write");
    return false;
  }

  const GoogleString::size_type after_url =
      url_pos + StringPiece(kGaJsUrlSuffix).size();
  const GoogleString::size_type end_marker_pos =
      contents.find(kGaJsDocumentWriteEnd, after_url);
  if (end_marker_pos == GoogleString::npos) {
    return false;
  }

  // Translate the window-relative offset back into |contents| coordinates.
  const GoogleString::size_type write_begin = window_begin + write_in_window;
  const GoogleString::size_type write_end =
      end_marker_pos + StringPiece(kGaJsDocumentWriteEnd).size();
  *pos = write_begin;
  *len = write_end - write_begin;
  html_parse_->InfoHere("Found ga.js load: document.write");
  return true;
}
// Looks for a synchronous tracker initialization in |contents| at or after
// |start_pos|.  "_gat._getTracker(" is tried first; "_gat._createTracker(" is
// tried only when the former is absent.
//
// On success returns true and sets |*pos| to where the call text begins and
// |*len| to the length of the matched call text.  On failure returns false
// and leaves |*pos| and |*len| untouched.
bool GoogleAnalyticsFilter::MatchSyncInit(StringPiece contents,
                                          GoogleString::size_type start_pos,
                                          GoogleString::size_type* pos,
                                          GoogleString::size_type* len) const {
  const char* const kTrackerCalls[] = { kGaJsGetTracker, kGaJsCreateTracker };
  const int call_count = sizeof(kTrackerCalls) / sizeof(kTrackerCalls[0]);

  for (int i = 0; i < call_count; ++i) {
    const StringPiece call(kTrackerCalls[i]);
    const GoogleString::size_type found = contents.find(call, start_pos);
    if (found == GoogleString::npos) {
      continue;
    }
    html_parse_->InfoHere("Found ga.js init: %s",
                          call.as_string().c_str());
    *pos = found;
    *len = call.size();
    return true;
  }
  return false;
}
// Returns true if |contents| contains a call to any unhandled method, i.e.
// the text "._<name>" for some entry of unhandled_methods_, followed by
// optional HTML whitespace and then '('.
//
// NOTE(review): |start_pos| is accepted but never used; the whole of
// |contents| is scanned.  That errs toward cancelling the rewrite.  Confirm
// whether the scan was meant to begin at |start_pos|.
bool GoogleAnalyticsFilter::MatchUnhandledCalls(
    StringPiece contents, GoogleString::size_type start_pos) const {
  // TODO(slamm): Use a more efficient multiple pattern algorithm
  for (GoogleString::size_type dot = contents.find("._");
       dot != GoogleString::npos;
       dot = contents.find("._")) {
    // Drop everything through the '.', so |contents| now begins with '_'.
    contents = contents.substr(dot + 1);

    const int method_count = unhandled_methods_->size();
    for (int m = 0; m < method_count; ++m) {
      const StringPiece& name = unhandled_methods_->at(m);
      if (!contents.starts_with(name)) {
        continue;
      }
      // Skip whitespace after the name; only a following '(' makes it a call.
      int cursor = name.size();
      const int limit = contents.size();
      while (cursor < limit && IsHtmlSpace(contents[cursor])) {
        ++cursor;
      }
      if (cursor < limit && contents[cursor] == '(') {
        html_parse_->InfoHere("Matched unhandled call: %s",
                              name.as_string().c_str());
        return true;
      }
    }
  }
  return false;
}
void GoogleAnalyticsFilter::FindRewritableScripts() {
if (html_parse_->IsRewritable(script_element_)) {
StringPiece src = script_element_->AttributeValue(HtmlName::kSrc);
if (src != NULL && !src.empty()) {
if (src.ends_with(kGaJsUrlSuffix)) {
html_parse_->InfoHere("Found ga.js load: script src");
is_load_found_ = true;
script_editors_.push_back(new ScriptEditor(
script_element_, script_characters_node_,
GoogleString::npos, GoogleString::npos,
ScriptEditor::kGaJsScriptSrcLoad));
}
} else if (script_characters_node_ != NULL) {
StringPiece contents = script_characters_node_->contents();
if (!contents.empty()) {
GoogleString::size_type start_pos = 0;
GoogleString::size_type pos, len;
if (MatchSyncLoad(contents, &pos, &len)) {
is_load_found_ = true;
script_editors_.push_back(new ScriptEditor(
script_element_, script_characters_node_, pos, len,
ScriptEditor::kGaJsDocWriteLoad));
start_pos = pos + len;
}
if (is_load_found_ && MatchSyncInit(contents, start_pos, &pos, &len)) {
is_init_found_ = true;
script_editors_.push_back(new ScriptEditor(
script_element_, script_characters_node_, pos, len,
ScriptEditor::kGaJsInit));
start_pos = pos + len;
}
if (is_init_found_ && MatchUnhandledCalls(contents, start_pos)) {
html_parse_->InfoHere("Google Analytics reset: unhandled call.");
ResetFilter();
return;
}
}
}
}
}
// Fills |buffer| with the glue script: the snippet prefix, one quoted glue
// method name per line (comma-separated, with no comma after the last), and
// the snippet suffix.  Any previous contents of |buffer| are discarded.
void GoogleAnalyticsFilter::GetSyncToAsyncScript(GoogleString *buffer) const {
  buffer->clear();
  buffer->append(kGaSnippetPrefix);

  const int method_count = glue_methods_->size();
  for (int i = 0; i < method_count; ++i) {
    const bool has_more = (i + 1 < method_count);
    buffer->append(" '");
    buffer->append(glue_methods_->at(i).as_string());
    buffer->append(has_more ? "',\n" : "'\n");
  }

  buffer->append(kGaSnippetSuffix);
}
bool GoogleAnalyticsFilter::RewriteAsAsync() {
if (!is_init_found_ || !is_load_found_) {
return false;
}
ScriptEditor* first_editor = script_editors_[0];
HtmlElement* first_script = first_editor->GetScriptElement();
if (!html_parse_->IsRewritable(first_script)) {
html_parse_->InfoHere("First script is not rewritable.");
return false;
}
ScriptEditor::Type first_type = first_editor->GetType();
CHECK(first_type == ScriptEditor::kGaJsScriptSrcLoad ||
first_type == ScriptEditor::kGaJsDocWriteLoad);
GoogleString replacement_script;
for (int i = script_editors_.size() - 1; i > 0; --i) {
ScriptEditor* editor = script_editors_[i];
HtmlElement* script = editor->GetScriptElement();
if (editor->GetType() == ScriptEditor::kGaJsScriptSrcLoad) {
html_parse_->DeleteNode(script);
html_parse_->InfoHere("Deleted script src load");
} else if (editor->GetType() == ScriptEditor::kGaJsDocWriteLoad) {
editor->NewContents("", &replacement_script);
html_parse_->ReplaceNode(
editor->GetScriptCharactersNode(),
html_parse_->NewCharactersNode(script, replacement_script));
html_parse_->InfoHere("Deleted document.write load");
} else if (editor->GetType() == ScriptEditor::kGaJsInit) {
editor->NewContents(kGaSnippetGetTracker, &replacement_script);
html_parse_->ReplaceNode(
editor->GetScriptCharactersNode(),
html_parse_->NewCharactersNode(script, replacement_script));
html_parse_->InfoHere("Replaced init");
}
}
GoogleString glue_script;
GetSyncToAsyncScript(&glue_script);
if (first_type == ScriptEditor::kGaJsScriptSrcLoad) {
html_parse_->PrependChild(
first_script,
html_parse_->NewCharactersNode(first_script, glue_script));
first_script->DeleteAttribute(HtmlName::kSrc);
html_parse_->InfoHere("Replaced script src load");
} else {
first_editor->NewContents(glue_script, &replacement_script);
html_parse_->ReplaceNode(
first_editor->GetScriptCharactersNode(),
html_parse_->NewCharactersNode(first_script, replacement_script));
html_parse_->InfoHere("Replaced document.write load");
}
return true;
}
} // namespace net_instaweb