| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmaessen@google.com (Jan Maessen) |
| |
| #include "net/instaweb/rewriter/public/url_left_trim_filter.h" |
| |
| #include <cstddef> |
| #include <memory> |
| |
| #include "base/logging.h" |
| #include "net/instaweb/rewriter/public/resource_tag_scanner.h" |
| #include "net/instaweb/rewriter/public/rewrite_driver.h" |
| #include "pagespeed/kernel/base/statistics.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/html/html_element.h" |
| #include "pagespeed/kernel/html/html_name.h" |
| #include "pagespeed/kernel/http/google_url.h" |
| |
namespace {

// Keys under which this filter's Statistics variables are registered and
// looked up: one for the trim counter, one for the byte-savings counter.
const char kUrlTrimSavedBytes[] = "url_trim_saved_bytes";
const char kUrlTrims[] = "url_trims";

}  // namespace
| |
| namespace net_instaweb { |
| |
| UrlLeftTrimFilter::UrlLeftTrimFilter(RewriteDriver* rewrite_driver, |
| Statistics *stats) |
| : CommonFilter(rewrite_driver), |
| trim_count_(stats->GetVariable(kUrlTrims)), |
| trim_saved_bytes_(stats->GetVariable(kUrlTrimSavedBytes)) { |
| } |
| |
| UrlLeftTrimFilter::~UrlLeftTrimFilter() {} |
| |
| void UrlLeftTrimFilter::InitStats(Statistics* statistics) { |
| statistics->AddVariable(kUrlTrims); |
| statistics->AddVariable(kUrlTrimSavedBytes); |
| } |
| |
| // Do not rewrite the base tag. |
| void UrlLeftTrimFilter::StartElementImpl(HtmlElement* element) { |
| if (element->keyword() != HtmlName::kBase && |
| BaseUrlIsValid()) { |
| resource_tag_scanner::UrlCategoryVector attributes; |
| resource_tag_scanner::ScanElement( |
| element, driver()->options(), &attributes); |
| for (int i = 0, n = attributes.size(); i < n; ++i) { |
| TrimAttribute(attributes[i].url); |
| } |
| } |
| } |
| |
| // Resolve the url we want to trim, and then remove the scheme, origin |
| // and/or path as appropriate. |
| bool UrlLeftTrimFilter::Trim(const GoogleUrl& base_url, |
| const StringPiece& url_to_trim, |
| GoogleString* trimmed_url, |
| MessageHandler* handler) { |
| if (!base_url.IsWebValid() || url_to_trim.empty()) { |
| return false; |
| } |
| |
| GoogleUrl long_url(base_url, url_to_trim); |
| // Don't try to rework an invalid url |
| if (!long_url.IsWebValid()) { |
| return false; |
| } |
| |
| StringPiece long_url_buffer = long_url.Spec(); |
| size_t to_trim = 0; |
| |
| // If we can strip the whole origin (http://www.google.com/) do it, |
| // then see if we can strip the prefix of the path. |
| StringPiece origin = base_url.Origin(); |
| if (origin.length() < long_url_buffer.length() && |
| long_url.Origin() == origin) { |
| to_trim = origin.length(); |
| StringPiece path = base_url.PathSansLeaf(); |
| |
| // If the path still starts with a "//", we can't trim the origin. |
| // "//" is not actually the same as a single /, though most |
| // servers will do the same thing with it. |
| // E.g. on http://example.com/foo.html, don't trim |
| // http://example.com//bar.html to //bar or /bar. |
| if (long_url_buffer.substr(to_trim, 2) == "//") { |
| to_trim = 0; |
| } else if (to_trim + path.length() < long_url_buffer.length() && |
| StringPiece(long_url.PathSansLeaf()).starts_with(path)) { |
| // Don't trim the path off queries in the form http://foo.com/?a=b |
| // Instead resolve to /?a=b (not ?a=b, which resolves to |
| // index.html?a=b on http://foo.com/index.html). |
| if (!long_url.has_query() || long_url.LeafSansQuery().length() > 0) { |
| to_trim += path.length(); |
| |
| // If the path now starts with "//", we need to undo the trim. |
| // E.g. on http://example.com/foo/bar/index.html, don't trim |
| // http://example.com/foo/bar//baz/other.html to //baz/other.html |
| // or to /baz/other.html. |
| // a url ".../#anchor" with resolve relative to the base page instead |
| // of the base directory. |
| if (long_url_buffer[to_trim] == '/' || |
| long_url_buffer[to_trim] == '#' || |
| long_url_buffer[to_trim] == '?') { |
| to_trim -= path.length(); |
| } |
| } |
| } |
| } |
| |
| // If we can't strip the whole origin, see if we can strip off the scheme. |
| // TODO(jmaessen): disabled; causes IE8 to double-fetch urls, and problems |
| // with other scripting. Switch on for whitelisted user-agents in future? |
| // Not a huge savings in general anyway. |
| #define STRIP_URL_SCHEME 0 |
| #if STRIP_URL_SCHEME |
| StringPiece scheme = base_url.Scheme(); |
| if (false && to_trim == 0 && scheme.length() + 1 < long_url_buffer.length() && |
| long_url.SchemeIs(scheme)) { |
| // +1 for : (not included in scheme) |
| to_trim = scheme.length() + 1; |
| } |
| #endif |
| |
| // Candidate trimmed URL. |
| StringPiece trimmed_url_piece(long_url_buffer); |
| trimmed_url_piece.remove_prefix(to_trim); |
| |
| if (trimmed_url_piece.length() < url_to_trim.length()) { |
| // If we have a colon before the first slash there are two options: |
| // option 1 - we still have our scheme, in which case we're not shortening |
| // anything, and can just abort. |
| // option 2 - the original url had some nasty scheme-looking stuff in the |
| // middle of the url, and now it's at the front. This causes Badness, |
| // revert to the original. |
| size_t colon_pos = trimmed_url_piece.find(':'); |
| if (colon_pos != trimmed_url_piece.npos) { |
| if (trimmed_url_piece.rfind('/', colon_pos) == trimmed_url_piece.npos) { |
| return false; |
| } |
| } |
| GoogleUrl resolved_newurl(base_url, trimmed_url_piece); |
| // Error condition: this shouldn't happen. |
| DCHECK(resolved_newurl.IsWebValid()); |
| DCHECK(resolved_newurl == long_url); |
| if (!resolved_newurl.IsWebValid() || resolved_newurl != long_url) { |
| return false; |
| } |
| *trimmed_url = trimmed_url_piece.as_string(); |
| return true; |
| } |
| return false; |
| } |
| |
| // Trim the value of the given attribute, if the attribute is non-NULL. |
| void UrlLeftTrimFilter::TrimAttribute(HtmlElement::Attribute* attr) { |
| if (attr != NULL) { |
| StringPiece val(attr->DecodedValueOrNull()); |
| GoogleString trimmed_val; |
| size_t orig_size = val.size(); |
| if (!val.empty() && |
| Trim(driver()->base_url(), val, &trimmed_val, |
| driver()->message_handler())) { |
| attr->SetValue(trimmed_val); |
| trim_count_->Add(1); |
| trim_saved_bytes_->Add(orig_size - trimmed_val.size()); |
| } |
| } |
| } |
| |
| } // namespace net_instaweb |