src/net/instaweb/rewriter/common_filter.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: sligocki@google.com (Shawn Ligocki)

 #include "net/instaweb/rewriter/public/common_filter.h"

 #include "base/logging.h"
 #include "net/instaweb/http/public/log_record.h"
 #include "net/instaweb/rewriter/public/critical_images_beacon_filter.h"
 #include "net/instaweb/rewriter/public/resource.h"
 #include "net/instaweb/rewriter/public/rewrite_driver.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/html/doctype.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/html/html_node.h"
 #include "pagespeed/kernel/http/content_type.h"
 #include "pagespeed/kernel/http/google_url.h"
 #include "pagespeed/kernel/http/http_names.h"
 #include "pagespeed/kernel/http/response_headers.h"
 #include "pagespeed/opt/logging/enums.pb.h"

 namespace net_instaweb {

 // Explanatory text for the case where an input resource could not be created:
 // either the resource's domain is unauthorized (and
 // InlineUnauthorizedResources is off) or the resource cannot be fetched.
 const char CommonFilter::kCreateResourceFailedDebugMsg[] =
     "Cannot create resource: either its domain is unauthorized and "
     "InlineUnauthorizedResources is not enabled, or it cannot be fetched "
     "(check the server logs)";

 // Binds the filter to |driver| and caches the driver's server context and
 // rewrite options.  |driver| is dereferenced here, so it must not be NULL.
 // The per-document state (top-level <noscript> element, end-of-body insertion
 // point and the "seen <base>" flag) starts out cleared.
 CommonFilter::CommonFilter(RewriteDriver* driver)
     : driver_(driver),
       server_context_(driver->server_context()),
       rewrite_options_(driver->options()),
       noscript_element_(NULL),
       end_body_point_(NULL),
       seen_base_(false) {
 }

 // Empty destructor: the body releases nothing.
 CommonFilter::~CommonFilter() {}

 // Places |data| at the recorded end-of-body insertion point when one is
 // recorded and the driver reports that it can still take a child.  Otherwise
 // falls back to the driver's InsertNodeBeforeCurrent().
 void CommonFilter::InsertNodeAtBodyEnd(HtmlNode* data) {
   const bool can_use_body_end =
       end_body_point_ != NULL && driver_->CanAppendChild(end_body_point_);
   if (!can_use_body_end) {
     driver_->InsertNodeBeforeCurrent(data);
     return;
   }
   driver_->AppendChild(end_body_point_, data);
 }

 // Clears all per-document bookkeeping and then hands control to the concrete
 // filter's StartDocumentImpl().
 void CommonFilter::StartDocument() {
   // We are back at the top of a document, so no <base> tag has been seen yet
   // and the base URL starts out as the document URL.
   seen_base_ = false;
   // No insertion point and no enclosing <noscript> carry over either.
   end_body_point_ = NULL;
   noscript_element_ = NULL;

   StartDocumentImpl();
 }

 // Updates the <noscript>, <base> and end-of-body tracking for an opening tag,
 // then forwards |element| to the concrete filter's StartElementImpl().
 void CommonFilter::StartElement(HtmlElement* element) {
   switch (element->keyword()) {
     case HtmlName::kNoscript:
       // Only the outermost <noscript> is remembered; nested ones are ignored.
       if (noscript_element_ == NULL) {
         noscript_element_ = element;
       }
       break;
     case HtmlName::kBase:
       // A <base> only counts once it carries an href; URL references after
       // this point are relative to that base.
       if (element->FindAttribute(HtmlName::kHref) != NULL) {
         seen_base_ = true;
       }
       break;
     default:
       break;
   }

   // Any newly opened element means a previously recorded </body> (or </html>)
   // is no longer the end of the body, so drop the insertion point.
   end_body_point_ = NULL;

   StartElementImpl(element);
 }

 // Maintains the end-of-body insertion point and the top-level <noscript>
 // marker for a closing tag, then forwards |element| to the concrete filter's
 // EndElementImpl().
 void CommonFilter::EndElement(HtmlElement* element) {
   if (element->keyword() == HtmlName::kBody) {
     // </body> is the preferred injection location.
     end_body_point_ = element;
   } else if (element->keyword() == HtmlName::kHtml) {
     // Fall back to injecting before </html>, but only when the </body>
     // location is missing or unusable and <html> itself can take a child.
     const bool body_point_usable =
         end_body_point_ != NULL && driver()->CanAppendChild(end_body_point_);
     if (!body_point_usable && driver()->CanAppendChild(element)) {
       end_body_point_ = element;
     }
   } else {
     if (element->keyword() == HtmlName::kNoscript &&
         element == noscript_element_) {
       // Leaving the top-level <noscript>.
       noscript_element_ = NULL;
     }
     // Some other (possibly implicit) close tag showed up after </body> or
     // </html>, so the recorded point is no longer trustworthy.
     end_body_point_ = NULL;
   }

   EndElementImpl(element);
 }

 // Discards the end-of-body insertion point when non-whitespace text follows
 // it.  Text after the closing body or html tag (from malformed HTML or a
 // faulty filter) means inserting there is no longer safely "at the end of the
 // document".
 void CommonFilter::Characters(net_instaweb::HtmlCharactersNode* characters) {
   if (end_body_point_ == NULL) {
     return;  // Nothing recorded, nothing to invalidate.
   }
   if (OnlyWhitespace(characters->contents())) {
     return;  // Pure whitespace does not disturb the insertion point.
   }
   end_body_point_ = NULL;
 }

 // Tells whether URLs may currently be resolved against the base tag.
 // References that appear before the base tag cannot be resolved against it;
 // browsers disagree on how to treat them and we must not alter their behavior.
 //
 // The base is valid when the driver saw no href/src attributes ahead of the
 // base tag, or when this filter has already passed the base tag.
 bool CommonFilter::BaseUrlIsValid() const {
   return !driver_->refs_before_base() || seen_base_;
 }

 // Resolves |input_url| into |out_url|.  |out_url| is always cleared first and
 // stays cleared for an empty |input_url|.  While the base URL is not yet valid
 // the input is used on its own; otherwise it is resolved against base_url(),
 // provided that base is web-valid.
 void CommonFilter::ResolveUrl(StringPiece input_url, GoogleUrl* out_url) {
   out_url->Clear();
   if (input_url.empty()) {
     return;
   }
   if (BaseUrlIsValid()) {
     if (base_url().IsWebValid()) {
       out_url->Reset(base_url(), input_url);
     }
   } else {
     out_url->Reset(input_url);
   }
 }

 // Resolves |input_url| and asks the driver for the matching input resource.
 // Returns a default-constructed ResourcePtr when the resolved URL is not
 // web-valid.  |*is_authorized| is preset to true and is otherwise only written
 // by the driver call; it must be false iff |input_url| is not authorized.
 ResourcePtr CommonFilter::CreateInputResource(StringPiece input_url,
                                               bool* is_authorized) {
   *is_authorized = true;

   GoogleUrl resolved_url;
   ResolveUrl(input_url, &resolved_url);
   if (!resolved_url.IsWebValid()) {
     return ResourcePtr();
   }

   return driver_->CreateInputResource(
       resolved_url,
       AllowUnauthorizedDomain(),
       (IntendedForInlining()
        ? RewriteDriver::kIntendedForInlining
        : RewriteDriver::kIntendedForGeneral),
       is_authorized);
 }

 // Same as CreateInputResource(), except that when no resource comes back
 // because |input_url| is unauthorized, the driver is asked to insert an
 // unauthorized-domain debug comment for |element|.  |element| must not be
 // NULL.
 ResourcePtr CommonFilter::CreateInputResourceOrInsertDebugComment(
     StringPiece input_url, HtmlElement* element) {
   DCHECK(element != NULL);
   bool authorized;
   ResourcePtr resource(CreateInputResource(input_url, &authorized));
   const bool creation_failed = (resource.get() == NULL);
   if (creation_failed && !authorized) {
     driver()->InsertUnauthorizedDomainDebugComment(input_url, element);
   }
   return resource;
 }

 // Returns the base URL as reported by the driver (plain forwarder).
 const GoogleUrl& CommonFilter::base_url() const {
   return driver_->base_url();
 }

 // Returns the driver's decoded base URL (plain forwarder).
 const GoogleUrl& CommonFilter::decoded_base_url() const {
   return driver_->decoded_base_url();
 }

 bool CommonFilter::ExtractMetaTagDetails(const HtmlElement& element,
                                          const ResponseHeaders* headers,
                                          GoogleString* content,
                                          GoogleString* mime_type,
                                          GoogleString* charset) {
   // The charset can be specified in an http-equiv or a charset attribute.
   const HtmlElement::Attribute* equiv;
   const HtmlElement::Attribute* value;
   const HtmlElement::Attribute* cs_attr;

   bool result = false;

   // HTTP-EQUIV case.
   if ((equiv = element.FindAttribute(HtmlName::kHttpEquiv)) != NULL &&
       (value = element.FindAttribute(HtmlName::kContent)) != NULL) {
     StringPiece attribute = equiv->DecodedValueOrNull();
     StringPiece value_str = value->DecodedValueOrNull();
     if (!value_str.empty() && !attribute.empty()) {
       value_str.CopyToString(content);
       TrimWhitespace(&attribute);

       // http-equiv must equal "Content-Type" and content mustn't be blank.
       if (StringCaseEqual(attribute, HttpAttributes::kContentType) &&
           !content->empty()) {
         // Per http://webdesign.about.com/od/metatags/qt/meta-charset.htm we
         // need to handle this:
         //   <meta http-equiv=Content-Type content=text/html; charset=UTF-8>
         // The approach here is to first parse the content string, then if it
         // doesn't have charset, look for a charset attribute and if the
         // content ends with ';' append the 'content=charset' text. Note that
         // we have to parse first because we need the -final- content for
         // checking the headers. If the initial parsing fails then there's no
         // point in proceeding because even if we add the content= then it
         // won't parse and we'll return false.
         bool have_parsed = true;  // Controls the second parse below.
         GoogleString local_charset;
         result = ParseContentType(*content, mime_type, &local_charset);
         if (result) {
           // No charset, see if we have a charset attribute to append.
           if (local_charset.empty() && *(content->rbegin()) == ';' &&
               ((cs_attr = element.FindAttribute(HtmlName::kCharset)) != NULL) &&
               (cs_attr->DecodedValueOrNull() != NULL)) {
             StrAppend(content, " charset=", cs_attr->DecodedValueOrNull());
             have_parsed = false;
           }
           // If requested, check to see if we have this value already.
           if (headers != NULL && headers->HasValue(attribute, *content)) {
             result = false;
           } else if (!have_parsed) {
             result = ParseContentType(*content, mime_type, &local_charset);
           }
           if (result) {
             *charset = local_charset;
           }
         }
       }
     }
   // charset case.
   } else if (((cs_attr = element.FindAttribute(HtmlName::kCharset)) != NULL) &&
              (cs_attr->DecodedValueOrNull() != NULL)) {
     *mime_type = "";
     *charset = cs_attr->DecodedValueOrNull();
     result = true;
   }

   return result;
 }

 bool CommonFilter::CanAddPagespeedOnloadToImage(const HtmlElement& element) {
   const HtmlElement::Attribute* onload_attribute =
       element.FindAttribute(HtmlName::kOnload);
   return (noscript_element() == NULL &&
           (onload_attribute == NULL ||
            (onload_attribute->DecodedValueOrNull() != NULL &&
             strcmp(onload_attribute->DecodedValueOrNull(),
                    CriticalImagesBeaconFilter::kImageOnloadCode) == 0)));
 }

 // Records in the driver's log record that the rewriter identified by
 // LoggingId() was applied successfully (status APPLIED_OK).
 void CommonFilter::LogFilterModifiedContent() {
   driver()->log_record()->SetRewriterLoggingStatus(
       LoggingId(), RewriterApplication::APPLIED_OK);
 }

 // Appends |js| as the text content of |script|, which must be a <script>
 // element.  Adds type="text/javascript" unless the doctype is HTML5.
 void CommonFilter::AddJsToElement(StringPiece js, HtmlElement* script) {
   DCHECK(script->keyword() == HtmlName::kScript);

   // Inlined JS in XHTML pages needs CDATA tags so that characters such as &
   // are not interpreted.  In apache something downstream of mod_pagespeed
   // could change the response's content type, so the wrapper is added
   // conservatively: it is skipped only when the page is known not to be XHTML.
   // The buffer lives at function scope because |js| is re-pointed at it.
   GoogleString cdata_wrapped;
   if (driver_->MimeTypeXhtmlStatus() != RewriteDriver::kIsNotXhtml) {
     StrAppend(&cdata_wrapped, "//<![CDATA[\n", js, "\n//]]>");
     js = cdata_wrapped;
   }

   if (!driver_->doctype().IsVersion5()) {
     driver_->AddAttribute(script, HtmlName::kType, "text/javascript");
   }

   driver_->AppendChild(script, driver_->NewCharactersNode(script, js));
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: sligocki@google.com (Shawn Ligocki)

	#include "net/instaweb/rewriter/public/common_filter.h"

	#include "base/logging.h"
	#include "net/instaweb/http/public/log_record.h"
	#include "net/instaweb/rewriter/public/critical_images_beacon_filter.h"
	#include "net/instaweb/rewriter/public/resource.h"
	#include "net/instaweb/rewriter/public/rewrite_driver.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/html/doctype.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/html/html_node.h"
	#include "pagespeed/kernel/http/content_type.h"
	#include "pagespeed/kernel/http/google_url.h"
	#include "pagespeed/kernel/http/http_names.h"
	#include "pagespeed/kernel/http/response_headers.h"
	#include "pagespeed/opt/logging/enums.pb.h"

	namespace net_instaweb {

	// Explanatory text for the case where an input resource could not be created:
	// either the resource's domain is unauthorized (and
	// InlineUnauthorizedResources is off) or the resource cannot be fetched.
	const char CommonFilter::kCreateResourceFailedDebugMsg[] =
	"Cannot create resource: either its domain is unauthorized and "
	"InlineUnauthorizedResources is not enabled, or it cannot be fetched "
	"(check the server logs)";

	// Binds the filter to |driver| and caches the driver's server context and
	// rewrite options. |driver| is dereferenced here, so it must not be NULL.
	// The per-document state (top-level <noscript> element, end-of-body insertion
	// point and the "seen <base>" flag) starts out cleared.
	CommonFilter::CommonFilter(RewriteDriver* driver)
	: driver_(driver),
	server_context_(driver->server_context()),
	rewrite_options_(driver->options()),
	noscript_element_(NULL),
	end_body_point_(NULL),
	seen_base_(false) {
	}

	// Empty destructor: the body releases nothing.
	CommonFilter::~CommonFilter() {}

	// Places |data| at the recorded end-of-body insertion point when one is
	// recorded and the driver reports that it can still take a child. Otherwise
	// falls back to the driver's InsertNodeBeforeCurrent().
	void CommonFilter::InsertNodeAtBodyEnd(HtmlNode* data) {
	  const bool can_use_body_end =
	      end_body_point_ != NULL && driver_->CanAppendChild(end_body_point_);
	  if (!can_use_body_end) {
	    driver_->InsertNodeBeforeCurrent(data);
	    return;
	  }
	  driver_->AppendChild(end_body_point_, data);
	}

	// Clears all per-document bookkeeping and then hands control to the concrete
	// filter's StartDocumentImpl().
	void CommonFilter::StartDocument() {
	  // We are back at the top of a document, so no <base> tag has been seen yet
	  // and the base URL starts out as the document URL.
	  seen_base_ = false;
	  // No insertion point and no enclosing <noscript> carry over either.
	  end_body_point_ = NULL;
	  noscript_element_ = NULL;

	  StartDocumentImpl();
	}

	// Updates the <noscript>, <base> and end-of-body tracking for an opening tag,
	// then forwards |element| to the concrete filter's StartElementImpl().
	void CommonFilter::StartElement(HtmlElement* element) {
	  switch (element->keyword()) {
	    case HtmlName::kNoscript:
	      // Only the outermost <noscript> is remembered; nested ones are ignored.
	      if (noscript_element_ == NULL) {
	        noscript_element_ = element;
	      }
	      break;
	    case HtmlName::kBase:
	      // A <base> only counts once it carries an href; URL references after
	      // this point are relative to that base.
	      if (element->FindAttribute(HtmlName::kHref) != NULL) {
	        seen_base_ = true;
	      }
	      break;
	    default:
	      break;
	  }

	  // Any newly opened element means a previously recorded </body> (or </html>)
	  // is no longer the end of the body, so drop the insertion point.
	  end_body_point_ = NULL;

	  StartElementImpl(element);
	}

	// Maintains the end-of-body insertion point and the top-level <noscript>
	// marker for a closing tag, then forwards |element| to the concrete filter's
	// EndElementImpl().
	void CommonFilter::EndElement(HtmlElement* element) {
	  switch (element->keyword()) {
	    case HtmlName::kNoscript:
	      if (element == noscript_element_) {
	        noscript_element_ = NULL;  // We are exiting the top-level <noscript>
	      }
	      end_body_point_ = NULL;
	      break;
	    case HtmlName::kBody:
	      // Preferred injection location
	      end_body_point_ = element;
	      break;
	    case HtmlName::kHtml:
	      // The logical-or below had been mangled into an escaped "\|\|"
	      // sequence, which is not valid C++; it is restored here.
	      if ((end_body_point_ == NULL ||
	           !driver()->CanAppendChild(end_body_point_)) &&
	          driver()->CanAppendChild(element)) {
	        // Try to inject before </html> if before </body> won't work.
	        end_body_point_ = element;
	      }
	      break;
	    default:
	      // There were (possibly implicit) close tags after </body> or </html>,
	      // so throw that point away.
	      end_body_point_ = NULL;
	      break;
	  }

	  // Run actual filter's EndElementImpl.
	  EndElementImpl(element);
	}

	// Discards the end-of-body insertion point when non-whitespace text follows
	// it. Text after the closing body or html tag (from malformed HTML or a
	// faulty filter) means inserting there is no longer safely "at the end of the
	// document".
	void CommonFilter::Characters(net_instaweb::HtmlCharactersNode* characters) {
	  if (end_body_point_ == NULL) {
	    return;  // Nothing recorded, nothing to invalidate.
	  }
	  if (OnlyWhitespace(characters->contents())) {
	    return;  // Pure whitespace does not disturb the insertion point.
	  }
	  end_body_point_ = NULL;
	}

	// Tells whether URLs may currently be resolved against the base tag.
	// References that appear before the base tag cannot be resolved against it;
	// browsers disagree on how to treat them and we must not alter their behavior.
	//
	// The base is valid when the driver saw no href/src attributes ahead of the
	// base tag, or when this filter has already passed the base tag.
	bool CommonFilter::BaseUrlIsValid() const {
	  return !driver_->refs_before_base() || seen_base_;
	}

	// Resolves |input_url| into |out_url|. |out_url| is always cleared first and
	// stays cleared for an empty |input_url|. While the base URL is not yet valid
	// the input is used on its own; otherwise it is resolved against base_url(),
	// provided that base is web-valid.
	void CommonFilter::ResolveUrl(StringPiece input_url, GoogleUrl* out_url) {
	  out_url->Clear();
	  if (input_url.empty()) {
	    return;
	  }
	  if (BaseUrlIsValid()) {
	    if (base_url().IsWebValid()) {
	      out_url->Reset(base_url(), input_url);
	    }
	  } else {
	    out_url->Reset(input_url);
	  }
	}

	// Resolves |input_url| and asks the driver for the matching input resource.
	// Returns a default-constructed ResourcePtr when the resolved URL is not
	// web-valid. |*is_authorized| is preset to true and is otherwise only written
	// by the driver call; it must be false iff |input_url| is not authorized.
	ResourcePtr CommonFilter::CreateInputResource(StringPiece input_url,
	                                              bool* is_authorized) {
	  *is_authorized = true;

	  GoogleUrl resolved_url;
	  ResolveUrl(input_url, &resolved_url);
	  if (!resolved_url.IsWebValid()) {
	    return ResourcePtr();
	  }

	  return driver_->CreateInputResource(
	      resolved_url,
	      AllowUnauthorizedDomain(),
	      (IntendedForInlining()
	       ? RewriteDriver::kIntendedForInlining
	       : RewriteDriver::kIntendedForGeneral),
	      is_authorized);
	}

	// Same as CreateInputResource(), except that when no resource comes back
	// because |input_url| is unauthorized, the driver is asked to insert an
	// unauthorized-domain debug comment for |element|. |element| must not be
	// NULL.
	ResourcePtr CommonFilter::CreateInputResourceOrInsertDebugComment(
	    StringPiece input_url, HtmlElement* element) {
	  DCHECK(element != NULL);
	  bool authorized;
	  ResourcePtr resource(CreateInputResource(input_url, &authorized));
	  const bool creation_failed = (resource.get() == NULL);
	  if (creation_failed && !authorized) {
	    driver()->InsertUnauthorizedDomainDebugComment(input_url, element);
	  }
	  return resource;
	}

	// Returns the base URL as reported by the driver (plain forwarder).
	const GoogleUrl& CommonFilter::base_url() const {
	return driver_->base_url();
	}

	// Returns the driver's decoded base URL (plain forwarder).
	const GoogleUrl& CommonFilter::decoded_base_url() const {
	return driver_->decoded_base_url();
	}

	bool CommonFilter::ExtractMetaTagDetails(const HtmlElement& element,
	const ResponseHeaders* headers,
	GoogleString* content,
	GoogleString* mime_type,
	GoogleString* charset) {
	// The charset can be specified in an http-equiv or a charset attribute.
	const HtmlElement::Attribute* equiv;
	const HtmlElement::Attribute* value;
	const HtmlElement::Attribute* cs_attr;

	bool result = false;

	// HTTP-EQUIV case.
	if ((equiv = element.FindAttribute(HtmlName::kHttpEquiv)) != NULL &&
	(value = element.FindAttribute(HtmlName::kContent)) != NULL) {
	StringPiece attribute = equiv->DecodedValueOrNull();
	StringPiece value_str = value->DecodedValueOrNull();
	if (!value_str.empty() && !attribute.empty()) {
	value_str.CopyToString(content);
	TrimWhitespace(&attribute);

	// http-equiv must equal "Content-Type" and content mustn't be blank.
	if (StringCaseEqual(attribute, HttpAttributes::kContentType) &&
	!content->empty()) {
	// Per http://webdesign.about.com/od/metatags/qt/meta-charset.htm we
	// need to handle this:
	// <meta http-equiv=Content-Type content=text/html; charset=UTF-8>
	// The approach here is to first parse the content string, then if it
	// doesn't have charset, look for a charset attribute and if the
	// content ends with ';' append the 'content=charset' text. Note that
	// we have to parse first because we need the -final- content for
	// checking the headers. If the initial parsing fails then there's no
	// point in proceeding because even if we add the content= then it
	// won't parse and we'll return false.
	bool have_parsed = true; // Controls the second parse below.
	GoogleString local_charset;
	result = ParseContentType(*content, mime_type, &local_charset);
	if (result) {
	// No charset, see if we have a charset attribute to append.
	if (local_charset.empty() && *(content->rbegin()) == ';' &&
	((cs_attr = element.FindAttribute(HtmlName::kCharset)) != NULL) &&
	(cs_attr->DecodedValueOrNull() != NULL)) {
	StrAppend(content, " charset=", cs_attr->DecodedValueOrNull());
	have_parsed = false;
	}
	// If requested, check to see if we have this value already.
	if (headers != NULL && headers->HasValue(attribute, *content)) {
	result = false;
	} else if (!have_parsed) {
	result = ParseContentType(*content, mime_type, &local_charset);
	}
	if (result) {
	*charset = local_charset;
	}
	}
	}
	}
	// charset case.
	} else if (((cs_attr = element.FindAttribute(HtmlName::kCharset)) != NULL) &&
	(cs_attr->DecodedValueOrNull() != NULL)) {
	*mime_type = "";
	*charset = cs_attr->DecodedValueOrNull();
	result = true;
	}

	return result;
	}

	// Reports whether a pagespeed onload handler may be attached to |element|.
	// This is allowed only outside of <noscript>, and only when the element has
	// no onload attribute or its onload value is exactly
	// CriticalImagesBeaconFilter::kImageOnloadCode.
	bool CommonFilter::CanAddPagespeedOnloadToImage(const HtmlElement& element) {
	  const HtmlElement::Attribute* onload_attribute =
	      element.FindAttribute(HtmlName::kOnload);
	  // The logical-or below had been mangled into an escaped "\|\|" sequence,
	  // which is not valid C++; it is restored here.
	  return (noscript_element() == NULL &&
	          (onload_attribute == NULL ||
	           (onload_attribute->DecodedValueOrNull() != NULL &&
	            strcmp(onload_attribute->DecodedValueOrNull(),
	                   CriticalImagesBeaconFilter::kImageOnloadCode) == 0)));
	}

	// Records in the driver's log record that the rewriter identified by
	// LoggingId() was applied successfully (status APPLIED_OK).
	void CommonFilter::LogFilterModifiedContent() {
	driver()->log_record()->SetRewriterLoggingStatus(
	LoggingId(), RewriterApplication::APPLIED_OK);
	}

	// Appends |js| as the text content of |script|, which must be a <script>
	// element. Adds type="text/javascript" unless the doctype is HTML5.
	void CommonFilter::AddJsToElement(StringPiece js, HtmlElement* script) {
	  DCHECK(script->keyword() == HtmlName::kScript);

	  // Inlined JS in XHTML pages needs CDATA tags so that characters such as &
	  // are not interpreted. In apache something downstream of mod_pagespeed
	  // could change the response's content type, so the wrapper is added
	  // conservatively: it is skipped only when the page is known not to be XHTML.
	  // The buffer lives at function scope because |js| is re-pointed at it.
	  GoogleString cdata_wrapped;
	  if (driver_->MimeTypeXhtmlStatus() != RewriteDriver::kIsNotXhtml) {
	    StrAppend(&cdata_wrapped, "//<![CDATA[\n", js, "\n//]]>");
	    js = cdata_wrapped;
	  }

	  if (!driver_->doctype().IsVersion5()) {
	    driver_->AddAttribute(script, HtmlName::kType, "text/javascript");
	  }

	  driver_->AppendChild(script, driver_->NewCharactersNode(script, js));
	}

	} // namespace net_instaweb