// blob: edaf4d13014aca8237e089ce89e99908c7e8c1cc [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmaessen@google.com (Jan Maessen)
#include "net/instaweb/rewriter/public/url_left_trim_filter.h"
#include <cstddef>
#include <memory>
#include "base/logging.h"
#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/http/google_url.h"
namespace {

// Statistics variable names used by UrlLeftTrimFilter.
// Counter that is bumped once per attribute value that gets shortened.
constexpr char kUrlTrims[] = "url_trims";
// Counter that accumulates the number of bytes removed by those trims.
constexpr char kUrlTrimSavedBytes[] = "url_trim_saved_bytes";

}  // namespace
namespace net_instaweb {
UrlLeftTrimFilter::UrlLeftTrimFilter(RewriteDriver* rewrite_driver,
Statistics *stats)
: CommonFilter(rewrite_driver),
trim_count_(stats->GetVariable(kUrlTrims)),
trim_saved_bytes_(stats->GetVariable(kUrlTrimSavedBytes)) {
}
// Nothing to release explicitly; the destructor body is empty.
UrlLeftTrimFilter::~UrlLeftTrimFilter() = default;
// Registers with `statistics` every variable name this filter later looks up
// in its constructor: first the trim counter, then the saved-bytes counter.
void UrlLeftTrimFilter::InitStats(Statistics* statistics) {
  const char* const variable_names[] = {kUrlTrims, kUrlTrimSavedBytes};
  for (const char* name : variable_names) {
    statistics->AddVariable(name);
  }
}
// Called for each opening element. Scans the element for URL-valued
// attributes and tries to shorten each one.
//
// Nothing is done for the <base> tag itself (it must not be rewritten), nor
// when BaseUrlIsValid() reports false.
void UrlLeftTrimFilter::StartElementImpl(HtmlElement* element) {
  if (element->keyword() == HtmlName::kBase || !BaseUrlIsValid()) {
    return;
  }

  resource_tag_scanner::UrlCategoryVector url_attributes;
  resource_tag_scanner::ScanElement(element, driver()->options(),
                                    &url_attributes);
  for (const auto& entry : url_attributes) {
    TrimAttribute(entry.url);
  }
}
// Resolve the url we want to trim, and then remove the scheme, origin
// and/or path as appropriate.
//
// url_to_trim is resolved against base_url, and the candidate replacement is
// a suffix of the resolved spec: the origin is dropped when it matches
// base_url's origin, and base_url's PathSansLeaf() prefix is dropped as well
// when that is safe (see the checks below).
//
// Parameters:
//   base_url     - URL that url_to_trim is resolved against.
//   url_to_trim  - the URL text as it currently appears in the document.
//   trimmed_url  - output; assigned only when the function returns true.
//   handler      - not used by this implementation.
//
// Returns true, and stores the candidate in *trimmed_url, only when the
// candidate is strictly shorter than url_to_trim and re-resolving it against
// base_url yields the same URL as resolving url_to_trim did.  Returns false
// in every other case, leaving *trimmed_url untouched.
bool UrlLeftTrimFilter::Trim(const GoogleUrl& base_url,
const StringPiece& url_to_trim,
GoogleString* trimmed_url,
MessageHandler* handler) {
if (!base_url.IsWebValid() || url_to_trim.empty()) {
return false;
}
GoogleUrl long_url(base_url, url_to_trim);
// Don't try to rework an invalid url
if (!long_url.IsWebValid()) {
return false;
}
StringPiece long_url_buffer = long_url.Spec();
// Number of leading characters of long_url_buffer that will be dropped.
size_t to_trim = 0;
// If we can strip the whole origin (http://www.google.com/) do it,
// then see if we can strip the prefix of the path.
StringPiece origin = base_url.Origin();
// The length comparison ensures something is left after removing the origin.
if (origin.length() < long_url_buffer.length() &&
long_url.Origin() == origin) {
to_trim = origin.length();
StringPiece path = base_url.PathSansLeaf();
// If the path still starts with a "//", we can't trim the origin.
// "//" is not actually the same as a single /, though most
// servers will do the same thing with it.
// E.g. on http://example.com/foo.html, don't trim
// http://example.com//bar.html to //bar or /bar.
if (long_url_buffer.substr(to_trim, 2) == "//") {
to_trim = 0;
// The bound check below also guarantees that, once path.length() has been
// added to to_trim, long_url_buffer[to_trim] is a valid index.
} else if (to_trim + path.length() < long_url_buffer.length() &&
StringPiece(long_url.PathSansLeaf()).starts_with(path)) {
// Don't trim the path off queries in the form http://foo.com/?a=b
// Instead resolve to /?a=b (not ?a=b, which resolves to
// index.html?a=b on http://foo.com/index.html).
if (!long_url.has_query() || long_url.LeafSansQuery().length() > 0) {
to_trim += path.length();
// If the path now starts with "//", we need to undo the trim.
// E.g. on http://example.com/foo/bar/index.html, don't trim
// http://example.com/foo/bar//baz/other.html to //baz/other.html
// or to /baz/other.html.
// Likewise undo the trim when the remainder starts with '#' or '?':
// a url ".../#anchor" will resolve relative to the base page instead
// of the base directory.
if (long_url_buffer[to_trim] == '/' ||
long_url_buffer[to_trim] == '#' ||
long_url_buffer[to_trim] == '?') {
to_trim -= path.length();
}
}
}
}
// If we can't strip the whole origin, see if we can strip off the scheme.
// TODO(jmaessen): disabled; causes IE8 to double-fetch urls, and problems
// with other scripting. Switch on for whitelisted user-agents in future?
// Not a huge savings in general anyway.
// Note: the block below is disabled twice over -- it is compiled out by
// STRIP_URL_SCHEME being 0, and its condition also begins with `false &&`.
// The macro is defined inside this function body but, being a preprocessor
// definition, stays defined for the rest of the translation unit.
#define STRIP_URL_SCHEME 0
#if STRIP_URL_SCHEME
StringPiece scheme = base_url.Scheme();
if (false && to_trim == 0 && scheme.length() + 1 < long_url_buffer.length() &&
long_url.SchemeIs(scheme)) {
// +1 for : (not included in scheme)
to_trim = scheme.length() + 1;
}
#endif
// Candidate trimmed URL.
StringPiece trimmed_url_piece(long_url_buffer);
trimmed_url_piece.remove_prefix(to_trim);
// Only accept the candidate if it is strictly shorter than what was there.
if (trimmed_url_piece.length() < url_to_trim.length()) {
// If we have a colon before the first slash there are two options:
// option 1 - we still have our scheme, in which case we're not shortening
// anything, and can just abort.
// option 2 - the original url had some nasty scheme-looking stuff in the
// middle of the url, and now it's at the front. This causes Badness,
// revert to the original.
size_t colon_pos = trimmed_url_piece.find(':');
if (colon_pos != trimmed_url_piece.npos) {
// rfind() searching backwards from the colon finds no '/', i.e. the first
// colon precedes every slash in the candidate.
if (trimmed_url_piece.rfind('/', colon_pos) == trimmed_url_piece.npos) {
return false;
}
}
// Safety net: re-resolve the candidate and make sure it still names exactly
// the same URL.  Debug builds assert; all builds bail out on a mismatch.
GoogleUrl resolved_newurl(base_url, trimmed_url_piece);
// Error condition: this shouldn't happen.
DCHECK(resolved_newurl.IsWebValid());
DCHECK(resolved_newurl == long_url);
if (!resolved_newurl.IsWebValid() || resolved_newurl != long_url) {
return false;
}
*trimmed_url = trimmed_url_piece.as_string();
return true;
}
return false;
}
// Shortens the decoded value of `attr` relative to the driver's base URL.
//
// Does nothing when `attr` is null, when its decoded value is empty, or when
// Trim() declines to produce a shorter URL.  On success the attribute value is
// replaced and both statistics (trim count, bytes saved) are updated.
void UrlLeftTrimFilter::TrimAttribute(HtmlElement::Attribute* attr) {
  if (attr == nullptr) {
    return;
  }

  StringPiece original(attr->DecodedValueOrNull());
  GoogleString shortened;
  // Record the length before SetValue() is called.
  // NOTE(review): `original` presumably views storage owned by `attr`, which
  // SetValue() may replace -- confirm; either way the size is read up front.
  const size_t original_length = original.size();
  if (original.empty()) {
    return;
  }
  if (!Trim(driver()->base_url(), original, &shortened,
            driver()->message_handler())) {
    return;
  }

  attr->SetValue(shortened);
  trim_count_->Add(1);
  trim_saved_bytes_->Add(original_length - shortened.size());
}
} // namespace net_instaweb