blob: 4dc7de6fe308ccd7176ad3005feae4504dda4964 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
// nforman@google.com (Naomi Forman)
#include "pagespeed/kernel/http/google_url.h"
#include <algorithm> // for std::find
#include <cstddef>
#include <string>
#include "base/logging.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/query_params.h"
namespace net_instaweb {
// Sentinel meaning "no such position". It is defined as std::string::npos so
// that it can be compared directly against std::string search results (e.g.
// the rfind() result returned by LeafStartPosition()).
const size_t GoogleUrl::npos = std::string::npos;
// Builds a GoogleUrl around a default-constructed GURL, then lets Init()
// compute the cached validity flags.
GoogleUrl::GoogleUrl() {
  Init();
}
// Wraps a copy of an existing GURL and computes the cached validity flags.
GoogleUrl::GoogleUrl(const GURL& gurl) : gurl_(gurl) {
  Init();
}
// Parses |spec| into the wrapped GURL and computes the cached validity flags.
GoogleUrl::GoogleUrl(const GoogleString& spec) : gurl_(spec) {
  Init();
}
// Parses the characters viewed by |sp| (copied into a string first) into the
// wrapped GURL and computes the cached validity flags.
GoogleUrl::GoogleUrl(StringPiece sp) : gurl_(sp.as_string()) {
  Init();
}
// Parses the C string |str| into the wrapped GURL and computes the cached
// validity flags.
// NOTE(review): |str| is handed straight to the GURL constructor with no null
// check -- presumably callers never pass NULL; confirm.
GoogleUrl::GoogleUrl(const char* str) : gurl_(str) {
  Init();
}
// This and the next two constructors build a GoogleUrl by resolving a
// String(Piece) against |base|. All work, including setting the validity
// flags, is delegated to the matching Reset() overload.
GoogleUrl::GoogleUrl(const GoogleUrl& base, const GoogleString& str) {
  Reset(base, str);
}
// Builds a GoogleUrl by resolving the text viewed by |sp| against |base|;
// delegates to Reset(base, sp).
GoogleUrl::GoogleUrl(const GoogleUrl& base, StringPiece sp) {
  Reset(base, sp);
}
// Builds a GoogleUrl by resolving the C string |str| against |base|;
// delegates to Reset(base, str).
GoogleUrl::GoogleUrl(const GoogleUrl& base, const char* str) {
  Reset(base, str);
}
// Exchanges the complete state of this object with |google_url|: the wrapped
// GURL and both cached validity flags.
void GoogleUrl::Swap(GoogleUrl* google_url) {
  gurl_.Swap(&google_url->gurl_);
  std::swap(is_web_valid_, google_url->is_web_valid_);
  std::swap(is_web_or_data_valid_, google_url->is_web_or_data_valid_);
}
// Recomputes the cached validity flags from the current gurl_:
//   is_web_valid_         -- valid URL whose scheme is http or https.
//   is_web_or_data_valid_ -- the above, or a valid URL whose scheme is data.
// Must be called every time gurl_ is assigned.
void GoogleUrl::Init() {
  const bool parsed_ok = gurl_.is_valid();
  is_web_valid_ = parsed_ok && (SchemeIs("http") || SchemeIs("https"));
  is_web_or_data_valid_ = is_web_valid_ || (parsed_ok && SchemeIs("data"));
}
bool GoogleUrl::ResolveHelper(const GURL& base, const std::string& url) {
gurl_ = base.Resolve(url);
Init();
return gurl_.is_valid();
}
bool GoogleUrl::Reset(const GoogleUrl& base, const GoogleString& str) {
return ResolveHelper(base.gurl_, str);
}
bool GoogleUrl::Reset(const GoogleUrl& base, StringPiece sp) {
return ResolveHelper(base.gurl_, sp.as_string());
}
bool GoogleUrl::Reset(const GoogleUrl& base, const char* str) {
return ResolveHelper(base.gurl_, str);
}
// Replaces the wrapped URL with one parsed from |new_value| (no base URL is
// involved) and refreshes the cached validity flags. Returns true when the
// new URL is valid.
bool GoogleUrl::Reset(StringPiece new_value) {
  gurl_ = GURL(new_value.as_string());
  Init();
  return IsAnyValid();
}
bool GoogleUrl::Reset(const GoogleUrl& new_value) {
gurl_ = GURL(new_value.gurl_);
Init();
return gurl_.is_valid();
}
void GoogleUrl::Clear() {
gurl_ = GURL();
Init();
}
// Returns the cached "valid http/https URL" flag. The DCHECK recomputes the
// value from gurl_ to catch the cache going stale.
bool GoogleUrl::IsWebValid() const {
  DCHECK(is_web_valid_ ==
         (gurl_.is_valid() && (SchemeIs("http") || SchemeIs("https"))));
  return is_web_valid_;
}
// Returns the cached "valid http/https/data URL" flag. The DCHECK recomputes
// the value from gurl_ to catch the cache going stale.
bool GoogleUrl::IsWebOrDataValid() const {
  DCHECK(is_web_or_data_valid_ ==
         (gurl_.is_valid() &&
          (SchemeIs("http") || SchemeIs("https") || SchemeIs("data"))));
  return is_web_or_data_valid_;
}
// Returns true when the wrapped GURL reports itself valid, regardless of
// scheme.
bool GoogleUrl::IsAnyValid() const {
  const bool parsed_ok = gurl_.is_valid();
  return parsed_ok;
}
// Returns a newly allocated copy of this URL with one more query parameter.
// Both |unescaped_name| and |unescaped_value| are escaped with
// EscapeQueryParam() before being added. A value whose data() is NULL is
// forwarded as NULL (i.e. not escaped into an empty string), so that
// CopyAndAddEscapedQueryParam() sees it as a null value.
GoogleUrl* GoogleUrl::CopyAndAddQueryParam(
    StringPiece unescaped_name, StringPiece unescaped_value) const {
  const GoogleString escaped_name = EscapeQueryParam(unescaped_name);
  if (unescaped_value.data() == NULL) {
    return CopyAndAddEscapedQueryParam(escaped_name, NULL);
  }
  return CopyAndAddEscapedQueryParam(escaped_name,
                                     EscapeQueryParam(unescaped_value));
}
// Returns a newly allocated copy of this URL whose query string is the
// current query plus the already-escaped pair (|escaped_name|,
// |escaped_value|). The result is created with `new` and returned as a raw
// pointer; nothing in this function releases it.
GoogleUrl* GoogleUrl::CopyAndAddEscapedQueryParam(
    StringPiece escaped_name, StringPiece escaped_value) const {
  // Re-serialize the existing query with the extra parameter appended.
  QueryParams params;
  params.ParseFromUrl(*this);
  params.AddEscaped(escaped_name, escaped_value);
  const GoogleString new_query = params.ToEscapedString();

  // Describe the span of |new_query| that forms the replacement query; only
  // the length is set, the begin offset keeps its default.
  url::Component query_span;
  query_span.len = new_query.size();

  url::Replacements<char> replacements;
  replacements.SetQuery(new_query.c_str(), query_span);
  return new GoogleUrl(gurl_.ReplaceComponents(replacements));
}
size_t GoogleUrl::LeafEndPosition(const GURL& gurl) {
url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
if (parsed.path.is_valid()) {
return parsed.path.end();
}
if (parsed.port.is_valid()) {
return parsed.port.end();
}
if (parsed.host.is_valid()) {
return parsed.host.end();
}
if (parsed.password.is_valid()) {
return parsed.password.end();
}
if (parsed.username.is_valid()) {
return parsed.username.end();
}
if (parsed.scheme.is_valid()) {
return parsed.scheme.end();
}
return npos;
}
// Returns the offset at which the leaf ends in valid url spec.
// If there is no path, steps backward until valid end is found.
size_t GoogleUrl::LeafEndPosition() const {
return LeafEndPosition(gurl_);
}
size_t GoogleUrl::LeafStartPosition(const GURL& gurl) {
url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
size_t start_reverse_search_from = npos;
if (parsed.query.is_valid() && (parsed.query.begin > 0)) {
// query includes '?', so start the search from the character
// before it.
start_reverse_search_from = parsed.query.begin - 1;
}
return gurl.possibly_invalid_spec().rfind('/', start_reverse_search_from);
}
// Returns the offset at which the leaf starts in the fully
// qualified spec.
size_t GoogleUrl::LeafStartPosition() const {
return LeafStartPosition(gurl_);
}
size_t GoogleUrl::PathStartPosition(const GURL& gurl) {
const std::string& spec = gurl.spec();
url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
size_t origin_size = parsed.path.begin;
if (!parsed.path.is_valid()) {
origin_size = spec.size();
}
CHECK_LT(0, static_cast<int>(origin_size));
CHECK_LE(origin_size, spec.size());
return origin_size;
}
// Find the start of the path, includes '/'
size_t GoogleUrl::PathStartPosition() const {
return PathStartPosition(gurl_);
}
// Returns the prefix of the spec that ends where the leaf ends (as reported
// by LeafEndPosition()), i.e. everything before the query. Logs DFATAL and
// returns an empty piece for an invalid URL. The returned piece points into
// this object's spec.
StringPiece GoogleUrl::AllExceptQuery() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t prefix_length = LeafEndPosition();
  if (prefix_length == npos) {
    return StringPiece();
  }
  return StringPiece(gurl_.possibly_invalid_spec().data(), prefix_length);
}
// Returns the suffix of the spec that follows the query. When the URL has no
// query, the suffix starts where the leaf ends (LeafEndPosition()). Logs
// DFATAL and returns an empty piece for an invalid URL. The returned piece
// points into this object's spec.
StringPiece GoogleUrl::AllAfterQuery() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  size_t suffix_start = 0;
  if (gurl_.has_query()) {
    suffix_start = gurl_.parsed_for_possibly_invalid_spec().query.end();
  } else {
    suffix_start = LeafEndPosition();
  }
  if (suffix_start == npos) {
    return StringPiece();
  }
  const std::string& full_spec = gurl_.possibly_invalid_spec();
  return StringPiece(full_spec.data() + suffix_start,
                     full_spec.size() - suffix_start);
}
// Returns the spec up to and including the last slash before the
// question-mark, if any. See http://en.wikipedia.org/wiki/URI_scheme -- the
// query-string syntax is not well-defined, but the query separator is: it is
// a '?', which is taken to mean that the first '?' delimits the query string.
// Logs DFATAL and returns an empty piece for an invalid URL; also returns an
// empty piece when no leaf-starting slash is found.
StringPiece GoogleUrl::AllExceptLeaf() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t slash_offset = LeafStartPosition();
  if (slash_offset == npos) {
    // No leaf found.
    return StringPiece();
  }
  // Keep the slash itself in the result.
  return StringPiece(gurl_.spec().data(), slash_offset + 1);
}
// Returns everything in the spec after the leaf-starting slash, through the
// end of the spec (so the query, if any, is included). Logs DFATAL and
// returns an empty piece for an invalid URL; also returns an empty piece
// when no slash is found.
StringPiece GoogleUrl::LeafWithQuery() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t slash_offset = LeafStartPosition();
  if (slash_offset == npos) {
    // No slashes found.
    return StringPiece();
  }
  const size_t leaf_offset = slash_offset + 1;
  const std::string& full_spec = gurl_.spec();
  return StringPiece(full_spec.data() + leaf_offset,
                     full_spec.size() - leaf_offset);
}
// Returns the leaf (the text after the leaf-starting slash) without the
// query string. Logs DFATAL and returns an empty piece for an invalid URL;
// also returns an empty piece when no leaf-starting slash is found.
//
// When the URL has a query, the leaf ends at the '?'. The end is taken from
// the query's own position rather than by subtracting the query length from
// the end of the spec: subtracting is wrong whenever a fragment follows the
// query (for ".../c.html?q=1#frag" it would produce "c.html?q=1#").
// When the URL has no query, the result runs to the end of the spec, as
// before.
StringPiece GoogleUrl::LeafSansQuery() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t leaf_start = LeafStartPosition();
  if (leaf_start == npos) {
    return StringPiece();
  }
  const size_t after_last_slash = leaf_start + 1;
  const std::string& spec = gurl_.spec();
  size_t leaf_end = spec.size();
  if (gurl_.has_query()) {
    const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
    if (!parsed.query.is_valid()) {
      return StringPiece();
    }
    // parsed.query doesn't include the '?', so the '?' sits at begin - 1.
    // LeafStartPosition() searches backward from that same '?', so the
    // slash it found is strictly before it and the length below is >= 0.
    leaf_end = parsed.query.begin - 1;
  }
  return StringPiece(spec.data() + after_last_slash,
                     leaf_end - after_last_slash);
}
// For "http://a.com/b/c/d?e=f/g" returns "http://a.com" without a trailing
// slash. Logs DFATAL and returns an empty piece for an invalid URL.
StringPiece GoogleUrl::Origin() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t origin_length = PathStartPosition();
  if (origin_length == npos) {
    return StringPiece();
  }
  return StringPiece(gurl_.spec().data(), origin_length);
}
// For "http://a.com/b/c/d?e=f/g" returns "/b/c/d?e=f/g", including the
// leading slash. Logs DFATAL and returns an empty piece for an invalid URL.
StringPiece GoogleUrl::PathAndLeaf() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t path_offset = PathStartPosition();
  if (path_offset == npos) {
    return StringPiece();
  }
  const std::string& full_spec = gurl_.spec();
  return StringPiece(full_spec.data() + path_offset,
                     full_spec.size() - path_offset);
}
// For "http://a.com/b/c/d/g.html?q=v" returns "/b/c/d/", including the
// leading and trailing slashes. Logs DFATAL and returns an empty piece for
// an invalid URL.
StringPiece GoogleUrl::PathSansLeaf() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const size_t path_offset = PathStartPosition();
  const size_t slash_offset = LeafStartPosition();
  if (path_offset == npos || slash_offset == npos) {
    // Things like data: URLs do not have leaves, etc.
    return StringPiece();
  }
  // The result runs from the start of the path through the last slash.
  const size_t end_offset = slash_offset + 1;
  return StringPiece(gurl_.spec().data() + path_offset,
                     end_offset - path_offset);
}
// Returns the spec with the scheme and the ':' that follows it removed.
// When the URL has no scheme the whole spec is returned. Logs DFATAL and
// returns an empty piece for an invalid URL.
StringPiece GoogleUrl::NetPath() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  if (!gurl_.has_scheme()) {
    return Spec();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  const std::string& full_spec = gurl_.possibly_invalid_spec();
  // Skip the scheme plus one character for the ':'.
  const size_t skip = parsed.scheme.end() + 1;
  return StringPiece(full_spec.data() + skip, full_spec.size() - skip);
}
// Returns the filename portion of the path, as computed by
// GURL::ExtractFileName(): everything after the last slash in the path,
// which may be empty. Logs DFATAL and returns an empty string for an invalid
// URL.
GoogleString GoogleUrl::ExtractFileName() const {
  if (gurl_.is_valid()) {
    return gurl_.ExtractFileName();
  }
  LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
  return "";
}
// Returns the host component of the URL as a piece pointing into the spec.
// Returns an empty piece when the URL has no host. Logs DFATAL and returns
// an empty piece for an invalid URL.
StringPiece GoogleUrl::Host() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  if (!gurl_.has_host()) {
    return StringPiece();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  const url::Component& host = parsed.host;
  return StringPiece(gurl_.spec().data() + host.begin, host.len);
}
// Returns the "host[:port]" portion of the URL as a piece pointing into the
// spec. Returns an empty piece when the URL has no host. Logs DFATAL and
// returns an empty piece for an invalid URL.
StringPiece GoogleUrl::HostAndPort() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  if (!gurl_.has_host()) {
    return StringPiece();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  // Yes, it works: the "+ 1" accounts for the ':' between host and port.
  // NOTE(review): for a URL without a port this relies on port.len being -1
  // so that the two extra terms cancel -- confirm against url::Component.
  const int host_and_port_length = parsed.host.len + parsed.port.len + 1;
  return StringPiece(gurl_.spec().data() + parsed.host.begin,
                     host_and_port_length);
}
// Returns the path component of the URL (without the query) as a piece
// pointing into the spec. Returns an empty piece when the parsed path is not
// valid. Logs DFATAL and returns an empty piece for an invalid URL.
StringPiece GoogleUrl::PathSansQuery() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  const size_t path_offset = PathStartPosition();
  const bool have_path = (path_offset != npos) && parsed.path.is_valid();
  if (!have_path) {
    return StringPiece();
  }
  return StringPiece(gurl_.spec().data() + path_offset, parsed.path.len);
}
// Returns the query component of the URL as a piece pointing into the spec.
// Returns an empty piece when the URL has no query. Logs DFATAL and returns
// an empty piece for an invalid URL.
StringPiece GoogleUrl::Query() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  if (!gurl_.has_query()) {
    return StringPiece();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  const url::Component& query = parsed.query;
  return StringPiece(gurl_.spec().data() + query.begin, query.len);
}
// Returns the scheme component of the URL as a piece pointing into the spec.
// Returns an empty piece when the URL has no scheme. Logs DFATAL and returns
// an empty piece for an invalid URL.
StringPiece GoogleUrl::Scheme() const {
  if (!gurl_.is_valid()) {
    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
    return StringPiece();
  }
  if (!gurl_.has_scheme()) {
    return StringPiece();
  }
  const url::Parsed parsed = gurl_.parsed_for_possibly_invalid_spec();
  const url::Component& scheme = parsed.scheme;
  return StringPiece(gurl_.spec().data() + scheme.begin, scheme.len);
}
// Returns a piece viewing the whole of gurl_.spec(). No validity check is
// made here; the piece points into this object's GURL.
StringPiece GoogleUrl::Spec() const {
  const std::string& full_spec = gurl_.spec();
  const char* const first = full_spec.data();
  return StringPiece(first, full_spec.size());
}
// Returns a piece viewing the whole of gurl_.possibly_invalid_spec(), so it
// can be used even when the URL is not valid. The piece points into this
// object's GURL.
StringPiece GoogleUrl::UncheckedSpec() const {
  const std::string& raw_spec = gurl_.possibly_invalid_spec();
  const char* const first = raw_spec.data();
  return StringPiece(first, raw_spec.size());
}
// Classifies |url| by how it is written:
//   kAbsoluteUrl  -- it parses as a valid URL on its own;
//   kNetPath      -- otherwise, it starts with "//";
//   kAbsolutePath -- otherwise, it starts with "/";
//   kRelativePath -- anything else.
UrlRelativity GoogleUrl::FindRelativity(StringPiece url) {
  const GoogleUrl standalone(url);
  if (standalone.IsAnyValid()) {
    return kAbsoluteUrl;
  }
  // Test "//" before "/", since the former also matches the latter.
  if (url.starts_with("//")) {
    return kNetPath;
  }
  if (url.starts_with("/")) {
    return kAbsolutePath;
  }
  return kRelativePath;
}
// Returns this URL rewritten in the form requested by |url_relativity|,
// relative to |base_url|, or the full spec whenever that form cannot be
// produced or does not resolve back to this URL. The returned piece points
// into this object's spec.
StringPiece GoogleUrl::Relativize(UrlRelativity url_relativity,
                                  const GoogleUrl& base_url) const {
  const StringPiece absolute = Spec();
  // Default, in case we cannot relativize appropriately.
  StringPiece candidate = absolute;

  if (url_relativity == kRelativePath) {
    // Strip the base's directory prefix if this URL lives under it.
    // TODO(sligocki): Should we fall through to kAbsolutePath here?
    const StringPiece base_directory = base_url.AllExceptLeaf();
    if (absolute.starts_with(base_directory)) {
      candidate = absolute.substr(base_directory.size());
    }
  } else if (url_relativity == kAbsolutePath) {
    if (Origin() == base_url.Origin()) {
      candidate = PathAndLeaf();
    }
  } else if (url_relativity == kNetPath) {
    if (Scheme() == base_url.Scheme()) {
      candidate = NetPath();
    }
  }
  // For kAbsoluteUrl the default (the full spec) is already the answer.

  // There are several corner cases that the naive algorithm above fails on.
  // Ex: http://foo.com/?bar or http://foo.com//bar relative to
  // http://foo.com/bar.html. Check if the candidate resolves correctly and,
  // if not, return the absolute URL.
  GoogleUrl round_trip(base_url, candidate);
  if (round_trip != *this) {
    candidate = absolute;
  }
  return candidate;
}
namespace {
// Parsing states for GoogleUrl::Unescape
enum UnescapeState {
NORMAL, // We are not in the middle of parsing an escape.
ESCAPE1, // We just parsed % .
ESCAPE2 // We just parsed %X for some hex digit X.
};
int HexStringToInt(const GoogleString& value) {
uint32 good_val = 0;
for (int c = 0, n = value.size(); c < n; ++c) {
bool ok = AccumulateHexValue(value[c], &good_val);
if (!ok) {
return -1;
}
}
return static_cast<int>(good_val);
}
} // namespace
// Decodes %XX escapes in |escaped| and returns the result. When
// |convert_plus_to_space| is true, a '+' outside an escape becomes ' '.
// Malformed escapes are passed through literally: a '%' that is not followed
// by two hex digits (including one cut short by the end of input) is emitted
// as-is together with any hex digit already consumed, and the offending
// character is then processed as ordinary input.
GoogleString GoogleUrl::UnescapeHelper(StringPiece escaped,
                                       bool convert_plus_to_space) {
  GoogleString result;
  GoogleString pending_digits;  // Hex digits seen since the last '%'.
  UnescapeState state = NORMAL;
  const int length = escaped.size();
  int pos = 0;
  while (pos < length) {
    const char ch = escaped[pos];

    if (state == NORMAL) {
      if (ch == '%') {
        pending_digits.clear();
        state = ESCAPE1;
      } else if ((ch == '+') && convert_plus_to_space) {
        result.push_back(' ');
      } else {
        result.push_back(ch);
      }
      ++pos;
      continue;
    }

    // We are inside an escape (ESCAPE1 or ESCAPE2).
    if (!IsHexDigit(ch)) {
      // Unexpected, % followed by non-hex chars, pass it through. |pos| is
      // deliberately not advanced so |ch| is re-examined in NORMAL state.
      result.push_back('%');
      result.append(pending_digits);
      state = NORMAL;
      continue;
    }

    pending_digits.push_back(ch);
    ++pos;
    if (state == ESCAPE1) {
      state = ESCAPE2;
    } else {
      // Two hex digits collected: emit the byte they encode.
      const unsigned char decoded = HexStringToInt(pending_digits);
      result.push_back(decoded);
      state = NORMAL;
    }
  }

  // Unexpected, % followed by end of string, pass it through.
  if (state != NORMAL) {
    result.push_back('%');
    result.append(pending_digits);
  }
  return result;
}
// Returns |unescaped| encoded for use as a query parameter name or value.
// See http://en.wikipedia.org/wiki/Query_string#URL_encoding
//   - ASCII alphanumerics and '.', '~', '_', '-' are copied unchanged;
//   - ' ' becomes '+';
//   - every other byte becomes %xx with lower-case hex digits.
GoogleString GoogleUrl::EscapeQueryParam(StringPiece unescaped) {
  GoogleString escaped;
  for (size_t i = 0, n = unescaped.size(); i < n; ++i) {
    const char c = unescaped[i];
    const bool is_unreserved = IsAsciiAlphaNumeric(c) || (c == '.') ||
                               (c == '~') || (c == '_') || (c == '-');
    if (is_unreserved) {
      // Do not escape unreserved chars.
      escaped.push_back(c);
    } else if (c == ' ') {
      // Space can be escaped as '+' in query params.
      escaped.push_back('+');
    } else {
      // Escape both reserved chars (ex: '/') and uncategorized chars
      // (ex: '^').
      const unsigned int byte_value =
          static_cast<unsigned int>(static_cast<unsigned char>(c));
      StrAppend(&escaped, StringPrintf("%%%02x", byte_value));
    }
  }
  return escaped;
}
// The set of reserved characters, from RFC 3986 Section 2.2:
// reserved = gen-delims / sub-delims
//
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
//
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
// / "*" / "+" / "," / ";" / "="
const char GoogleUrl::kReservedChars[] = ":/?#[]@!$&'()*+,;=";
// Returns true when |c| is one of the characters listed in kReservedChars.
// The array's terminating NUL is excluded from the scan, so '\0' is not
// reported as reserved.
bool GoogleUrl::IsReservedChar(char c) {
  const char* const end = kReservedChars + STATIC_STRLEN(kReservedChars);
  for (const char* p = kReservedChars; p != end; ++p) {
    if (*p == c) {
      return true;
    }
  }
  return false;
}
// Returns |url| with every byte that is neither unreserved, reserved, nor
// '%' replaced by %XX (upper-case hex digits). '%' is left alone so that
// text which is already escaped is not escaped a second time.
GoogleString GoogleUrl::Sanitize(StringPiece url) {
  GoogleString escaped;
  for (size_t i = 0, n = url.size(); i < n; ++i) {
    const char c = url[i];
    // Unreserved chars, '%' (to avoid double escaping) and reserved chars
    // (ex: '/', ':', '#', '?') are all copied through untouched.
    const bool keep_as_is = IsAsciiAlphaNumeric(c) || (c == '.') ||
                            (c == '~') || (c == '_') || (c == '-') ||
                            (c == '%') || IsReservedChar(c);
    if (keep_as_is) {
      escaped.push_back(c);
      continue;
    }
    // Escape uncategorized chars (ex: ' ', '^', '"')
    const unsigned int byte_value =
        static_cast<unsigned int>(static_cast<unsigned char>(c));
    StrAppend(&escaped, StringPrintf("%%%02X", byte_value));
  }
  return escaped;
}
} // namespace net_instaweb