blob: 6ab5a9c6e0c3e2c2d27a1438b56bfb55858a7f02 [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "net/instaweb/rewriter/public/scan_filter.h"
#include <memory>
#include "net/instaweb/rewriter/public/common_filter.h"
#include "net/instaweb/rewriter/public/domain_lawyer.h"
#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/rewrite_stats.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/kernel/base/charset_util.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/response_headers.h"
namespace net_instaweb {
ScanFilter::ScanFilter(RewriteDriver* driver)
: driver_(driver) {
}
ScanFilter::~ScanFilter() {
}
void ScanFilter::StartDocument() {
// TODO(jmarantz): consider having rewrite_driver access the url in this
// class, rather than poking it into rewrite_driver.
seen_any_nodes_ = false;
seen_refs_ = false;
seen_base_ = false;
seen_meta_tag_charset_ = false;
// Set the driver's containing charset to whatever the headers set it to; if
// they don't set it to anything, blank the driver's so we know it's not set.
const ResponseHeaders* headers = driver_->response_headers();
driver_->set_containing_charset(headers == NULL ? "" :
headers->DetermineCharset());
}
void ScanFilter::Cdata(HtmlCdataNode* cdata) {
seen_any_nodes_ = true;
}
void ScanFilter::Comment(HtmlCommentNode* comment) {
seen_any_nodes_ = true;
}
void ScanFilter::IEDirective(HtmlIEDirectiveNode* directive) {
seen_any_nodes_ = true;
}
void ScanFilter::Directive(HtmlDirectiveNode* directive) {
seen_any_nodes_ = true;
}
void ScanFilter::Characters(HtmlCharactersNode* characters) {
// Check for a BOM at the start of the document. All other event handlers
// set the flag to false without using it, so if it's true on entry then
// this must be the first event.
if (!seen_any_nodes_ && driver_->containing_charset().empty()) {
StringPiece charset = GetCharsetForBom(characters->contents());
if (!charset.empty()) {
driver_->set_containing_charset(charset);
}
}
seen_any_nodes_ = true; // ignore any subsequent BOMs.
}
void ScanFilter::StartElement(HtmlElement* element) {
seen_any_nodes_ = true;
// <base>
if (element->keyword() == HtmlName::kBase) {
HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
// See http://www.whatwg.org/specs/web-apps/current-work/multipage
// /semantics.html#the-base-element
//
// TODO(jmarantz): If the base is present but cannot be decoded, we should
// probably not do any resource rewriting at all.
if ((href != NULL) && (href->DecodedValueOrNull() != NULL)) {
// TODO(jmarantz): consider having rewrite_driver access the url in this
// class, rather than poking it into rewrite_driver.
GoogleString new_base = href->DecodedValueOrNull();
driver_->options()->domain_lawyer()->AddProxySuffix(driver_->google_url(),
&new_base);
driver_->SetBaseUrlIfUnset(new_base);
seen_base_ = true;
if (seen_refs_) {
driver_->set_refs_before_base();
}
}
// TODO(jmarantz): handle base targets in addition to hrefs.
} else {
resource_tag_scanner::UrlCategoryVector attributes;
resource_tag_scanner::ScanElement(element, driver_->options(), &attributes);
for (int i = 0, n = attributes.size(); i < n; ++i) {
// Don't count <html manifest=...> as a ref for the purpose of determining
// if there are refs before base. It's also important not to count <head
// profile=...> but ScanElement skips that.
if (!seen_refs_ && !seen_base_ &&
!(element->keyword() == HtmlName::kHtml &&
attributes[i].url->keyword() == HtmlName::kManifest)) {
seen_refs_ = true;
}
}
}
// Get/set the charset of the containing HTML page.
// HTTP1.1 says the default charset is ISO-8859-1 but as the W3C says (in
// http://www.w3.org/International/O-HTTP-charset.en.php) not many browsers
// actually do this so we default to "" instead so that we can tell if it
// has been set. The following logic is taken from
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#
// determining-the-character-encoding:
// 1. If the UA specifies an encoding, use that (not relevant to us).
// 2. If the transport layer specifies an encoding, use that.
// Implemented by using the charset from any Content-Type header.
// 3. If there is a BOM at the start of the file, use the relevant encoding.
// 4. If there is a meta tag in the HTML, use the encoding specified if any.
// 5. There are various other heuristics listed which are not implemented.
// 6. Otherwise, use no charset or default to something "sensible".
if (!seen_meta_tag_charset_ &&
driver_->containing_charset().empty() &&
element->keyword() == HtmlName::kMeta) {
GoogleString content, mime_type, charset;
if (CommonFilter::ExtractMetaTagDetails(*element, NULL,
&content, &mime_type, &charset)) {
if (!charset.empty()) {
driver_->set_containing_charset(charset);
seen_meta_tag_charset_ = true;
}
}
}
}
void ScanFilter::EndElement(HtmlElement* element) {
if (element->keyword() == HtmlName::kBase &&
!driver_->options()->domain_lawyer()->proxy_suffix().empty()) {
HtmlElement::Attribute* href = element->FindAttribute(HtmlName::kHref);
if (href != NULL) {
href->SetValue(driver_->base_url().AllExceptQuery());
}
}
}
void ScanFilter::Flush() {
driver_->server_context()->rewrite_stats()->num_flushes()->Add(1);
}
} // namespace net_instaweb