src/net/instaweb/rewriter/scan_filter.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2011 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #include "net/instaweb/rewriter/public/scan_filter.h"

 #include <memory>

 #include "net/instaweb/rewriter/public/common_filter.h"
 #include "net/instaweb/rewriter/public/domain_lawyer.h"
 #include "net/instaweb/rewriter/public/resource_tag_scanner.h"
 #include "net/instaweb/rewriter/public/rewrite_driver.h"
 #include "net/instaweb/rewriter/public/rewrite_options.h"
 #include "net/instaweb/rewriter/public/rewrite_stats.h"
 #include "net/instaweb/rewriter/public/server_context.h"
 #include "pagespeed/kernel/base/charset_util.h"
 #include "pagespeed/kernel/base/statistics.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/html/html_node.h"
 #include "pagespeed/kernel/http/google_url.h"
 #include "pagespeed/kernel/http/response_headers.h"

 namespace net_instaweb {

 ScanFilter::ScanFilter(RewriteDriver* driver)
     : driver_(driver) {
 }

 ScanFilter::~ScanFilter() {
 }

 void ScanFilter::StartDocument() {
   // TODO(jmarantz): consider having rewrite_driver access the url in this
   // class, rather than poking it into rewrite_driver.
   seen_any_nodes_ = false;
   seen_refs_ = false;
   seen_base_ = false;
   seen_meta_tag_charset_ = false;

   // Set the driver's containing charset to whatever the headers set it to; if
   // they don't set it to anything, blank the driver's so we know it's not set.
   const ResponseHeaders* headers = driver_->response_headers();
   driver_->set_containing_charset(headers == NULL ? "" :
                                   headers->DetermineCharset());
 }

 void ScanFilter::Cdata(HtmlCdataNode* cdata) {
   seen_any_nodes_ = true;
 }

 void ScanFilter::Comment(HtmlCommentNode* comment) {
   seen_any_nodes_ = true;
 }

 void ScanFilter::IEDirective(HtmlIEDirectiveNode* directive) {
   seen_any_nodes_ = true;
 }

 void ScanFilter::Directive(HtmlDirectiveNode* directive) {
   seen_any_nodes_ = true;
 }

 void ScanFilter::Characters(HtmlCharactersNode* characters) {
   // Check for a BOM at the start of the document. All other event handlers
   // set the flag to false without using it, so if it's true on entry then
   // this must be the first event.
   if (!seen_any_nodes_ && driver_->containing_charset().empty()) {
     StringPiece charset = GetCharsetForBom(characters->contents());
     if (!charset.empty()) {
       driver_->set_containing_charset(charset);
     }
   }
   seen_any_nodes_ = true;  // ignore any subsequent BOMs.
 }

 void ScanFilter::StartElement(HtmlElement* element) {
   seen_any_nodes_ = true;
   // <base>
   if (element->keyword() == HtmlName::kBase) {
     HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
     // See http://www.whatwg.org/specs/web-apps/current-work/multipage
     // /semantics.html#the-base-element
     //
     // TODO(jmarantz): If the base is present but cannot be decoded, we should
     // probably not do any resource rewriting at all.
     if ((href != NULL) && (href->DecodedValueOrNull() != NULL)) {
       // TODO(jmarantz): consider having rewrite_driver access the url in this
       // class, rather than poking it into rewrite_driver.
       GoogleString new_base = href->DecodedValueOrNull();
       driver_->options()->domain_lawyer()->AddProxySuffix(driver_->google_url(),
                                                           &new_base);
       driver_->SetBaseUrlIfUnset(new_base);
       seen_base_ = true;
       if (seen_refs_) {
         driver_->set_refs_before_base();
       }
     }
     // TODO(jmarantz): handle base targets in addition to hrefs.
   } else {
     resource_tag_scanner::UrlCategoryVector attributes;
     resource_tag_scanner::ScanElement(element, driver_->options(), &attributes);
     for (int i = 0, n = attributes.size(); i < n; ++i) {
       // Don't count <html manifest=...> as a ref for the purpose of determining
       // if there are refs before base.  It's also important not to count <head
       // profile=...> but ScanElement skips that.
       if (!seen_refs_ && !seen_base_ &&
           !(element->keyword() == HtmlName::kHtml &&
             attributes[i].url->keyword() == HtmlName::kManifest)) {
         seen_refs_ = true;
       }
     }
   }

   // Get/set the charset of the containing HTML page.
   // HTTP1.1 says the default charset is ISO-8859-1 but as the W3C says (in
   // http://www.w3.org/International/O-HTTP-charset.en.php) not many browsers
   // actually do this so we default to "" instead so that we can tell if it
   // has been set. The following logic is taken from
   // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#
   // determining-the-character-encoding:
   // 1. If the UA specifies an encoding, use that (not relevant to us).
   // 2. If the transport layer specifies an encoding, use that.
   //    Implemented by using the charset from any Content-Type header.
   // 3. If there is a BOM at the start of the file, use the relevant encoding.
   // 4. If there is a meta tag in the HTML, use the encoding specified if any.
   // 5. There are various other heuristics listed which are not implemented.
   // 6. Otherwise, use no charset or default to something "sensible".
   if (!seen_meta_tag_charset_ &&
       driver_->containing_charset().empty() &&
       element->keyword() == HtmlName::kMeta) {
     GoogleString content, mime_type, charset;
     if (CommonFilter::ExtractMetaTagDetails(*element, NULL,
                                             &content, &mime_type, &charset)) {
       if (!charset.empty()) {
         driver_->set_containing_charset(charset);
         seen_meta_tag_charset_ = true;
       }
     }
   }
 }

 void ScanFilter::EndElement(HtmlElement* element) {
   if (element->keyword() == HtmlName::kBase &&
       !driver_->options()->domain_lawyer()->proxy_suffix().empty()) {
     HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
     if (href != NULL) {
       href->SetValue(driver_->base_url().AllExceptQuery());
     }
   }
 }

 void ScanFilter::Flush() {
   driver_->server_context()->rewrite_stats()->num_flushes()->Add(1);
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2011 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)

	#include "net/instaweb/rewriter/public/scan_filter.h"

	#include <memory>

	#include "net/instaweb/rewriter/public/common_filter.h"
	#include "net/instaweb/rewriter/public/domain_lawyer.h"
	#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
	#include "net/instaweb/rewriter/public/rewrite_driver.h"
	#include "net/instaweb/rewriter/public/rewrite_options.h"
	#include "net/instaweb/rewriter/public/rewrite_stats.h"
	#include "net/instaweb/rewriter/public/server_context.h"
	#include "pagespeed/kernel/base/charset_util.h"
	#include "pagespeed/kernel/base/statistics.h"
	#include "pagespeed/kernel/base/string.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/html/html_node.h"
	#include "pagespeed/kernel/http/google_url.h"
	#include "pagespeed/kernel/http/response_headers.h"

	namespace net_instaweb {

	ScanFilter::ScanFilter(RewriteDriver* driver)
	: driver_(driver) {
	}

	ScanFilter::~ScanFilter() {
	}

	void ScanFilter::StartDocument() {
	// TODO(jmarantz): consider having rewrite_driver access the url in this
	// class, rather than poking it into rewrite_driver.
	seen_any_nodes_ = false;
	seen_refs_ = false;
	seen_base_ = false;
	seen_meta_tag_charset_ = false;

	// Set the driver's containing charset to whatever the headers set it to; if
	// they don't set it to anything, blank the driver's so we know it's not set.
	const ResponseHeaders* headers = driver_->response_headers();
	driver_->set_containing_charset(headers == NULL ? "" :
	headers->DetermineCharset());
	}

	void ScanFilter::Cdata(HtmlCdataNode* cdata) {
	seen_any_nodes_ = true;
	}

	void ScanFilter::Comment(HtmlCommentNode* comment) {
	seen_any_nodes_ = true;
	}

	void ScanFilter::IEDirective(HtmlIEDirectiveNode* directive) {
	seen_any_nodes_ = true;
	}

	void ScanFilter::Directive(HtmlDirectiveNode* directive) {
	seen_any_nodes_ = true;
	}

	void ScanFilter::Characters(HtmlCharactersNode* characters) {
	// Check for a BOM at the start of the document. All other event handlers
	// set the flag to false without using it, so if it's true on entry then
	// this must be the first event.
	if (!seen_any_nodes_ && driver_->containing_charset().empty()) {
	StringPiece charset = GetCharsetForBom(characters->contents());
	if (!charset.empty()) {
	driver_->set_containing_charset(charset);
	}
	}
	seen_any_nodes_ = true; // ignore any subsequent BOMs.
	}

	void ScanFilter::StartElement(HtmlElement* element) {
	seen_any_nodes_ = true;
	// <base>
	if (element->keyword() == HtmlName::kBase) {
	HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
	// See http://www.whatwg.org/specs/web-apps/current-work/multipage
	// /semantics.html#the-base-element
	//
	// TODO(jmarantz): If the base is present but cannot be decoded, we should
	// probably not do any resource rewriting at all.
	if ((href != NULL) && (href->DecodedValueOrNull() != NULL)) {
	// TODO(jmarantz): consider having rewrite_driver access the url in this
	// class, rather than poking it into rewrite_driver.
	GoogleString new_base = href->DecodedValueOrNull();
	driver_->options()->domain_lawyer()->AddProxySuffix(driver_->google_url(),
	&new_base);
	driver_->SetBaseUrlIfUnset(new_base);
	seen_base_ = true;
	if (seen_refs_) {
	driver_->set_refs_before_base();
	}
	}
	// TODO(jmarantz): handle base targets in addition to hrefs.
	} else {
	resource_tag_scanner::UrlCategoryVector attributes;
	resource_tag_scanner::ScanElement(element, driver_->options(), &attributes);
	for (int i = 0, n = attributes.size(); i < n; ++i) {
	// Don't count <html manifest=...> as a ref for the purpose of determining
	// if there are refs before base. It's also important not to count <head
	// profile=...> but ScanElement skips that.
	if (!seen_refs_ && !seen_base_ &&
	!(element->keyword() == HtmlName::kHtml &&
	attributes[i].url->keyword() == HtmlName::kManifest)) {
	seen_refs_ = true;
	}
	}
	}

	// Get/set the charset of the containing HTML page.
	// HTTP1.1 says the default charset is ISO-8859-1 but as the W3C says (in
	// http://www.w3.org/International/O-HTTP-charset.en.php) not many browsers
	// actually do this so we default to "" instead so that we can tell if it
	// has been set. The following logic is taken from
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#
	// determining-the-character-encoding:
	// 1. If the UA specifies an encoding, use that (not relevant to us).
	// 2. If the transport layer specifies an encoding, use that.
	// Implemented by using the charset from any Content-Type header.
	// 3. If there is a BOM at the start of the file, use the relevant encoding.
	// 4. If there is a meta tag in the HTML, use the encoding specified if any.
	// 5. There are various other heuristics listed which are not implemented.
	// 6. Otherwise, use no charset or default to something "sensible".
	if (!seen_meta_tag_charset_ &&
	driver_->containing_charset().empty() &&
	element->keyword() == HtmlName::kMeta) {
	GoogleString content, mime_type, charset;
	if (CommonFilter::ExtractMetaTagDetails(*element, NULL,
	&content, &mime_type, &charset)) {
	if (!charset.empty()) {
	driver_->set_containing_charset(charset);
	seen_meta_tag_charset_ = true;
	}
	}
	}
	}

	void ScanFilter::EndElement(HtmlElement* element) {
	if (element->keyword() == HtmlName::kBase &&
	!driver_->options()->domain_lawyer()->proxy_suffix().empty()) {
	HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
	if (href != NULL) {
	href->SetValue(driver_->base_url().AllExceptQuery());
	}
	}
	}

	void ScanFilter::Flush() {
	driver_->server_context()->rewrite_stats()->num_flushes()->Add(1);
	}

	} // namespace net_instaweb