src/pagespeed/kernel/http/google_url.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)
 //         nforman@google.com  (Naomi Forman)

 #include "pagespeed/kernel/http/google_url.h"

 #include <algorithm>                    // for std::find
 #include <cstddef>
 #include <string>

 #include "base/logging.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/http/query_params.h"

 namespace net_instaweb {

 // Sentinel offset meaning "no such position".  It is std::string::npos, so
 // the result of std::string::rfind() can be compared against it directly.
 const size_t GoogleUrl::npos = std::string::npos;

 // Each of the constructors below initializes the wrapped GURL from its
 // argument and then calls Init() to cache the validity flags.

 // Wraps a default-constructed GURL.
 GoogleUrl::GoogleUrl() : gurl_() { Init(); }

 // Wraps a copy of |gurl|.
 GoogleUrl::GoogleUrl(const GURL& gurl) : gurl_(gurl) { Init(); }

 // Builds the GURL from the string |spec|.
 GoogleUrl::GoogleUrl(const GoogleString& spec) : gurl_(spec) { Init(); }

 // Builds the GURL from a string copy of the characters referenced by |sp|.
 GoogleUrl::GoogleUrl(StringPiece sp) : gurl_(sp.as_string()) { Init(); }

 // Builds the GURL from the C string |str|.
 GoogleUrl::GoogleUrl(const char* str) : gurl_(str) { Init(); }

 // Resolving constructors: each one resolves its second argument against
 // |base| by delegating to the matching Reset() overload, which also
 // refreshes the cached validity flags.
 GoogleUrl::GoogleUrl(const GoogleUrl& base, const GoogleString& str) {
   Reset(base, str);
 }

 GoogleUrl::GoogleUrl(const GoogleUrl& base, StringPiece sp) {
   Reset(base, sp);
 }

 GoogleUrl::GoogleUrl(const GoogleUrl& base, const char* str) {
   Reset(base, str);
 }

 // Exchanges the complete state (wrapped GURL plus both cached validity
 // flags) of this object with that of |google_url|.
 void GoogleUrl::Swap(GoogleUrl* google_url) {
   gurl_.Swap(&google_url->gurl_);

   const bool my_web_valid = is_web_valid_;
   is_web_valid_ = google_url->is_web_valid_;
   google_url->is_web_valid_ = my_web_valid;

   const bool my_web_or_data_valid = is_web_or_data_valid_;
   is_web_or_data_valid_ = google_url->is_web_or_data_valid_;
   google_url->is_web_or_data_valid_ = my_web_or_data_valid;
 }

 // Recomputes the cached validity flags from the current gurl_:
 //   is_web_valid_         - valid and the scheme is http or https.
 //   is_web_or_data_valid_ - the above, or valid with the data scheme.
 void GoogleUrl::Init() {
   const bool valid = gurl_.is_valid();
   is_web_valid_ = valid && (SchemeIs("http") || SchemeIs("https"));
   is_web_or_data_valid_ = is_web_valid_ || (valid && SchemeIs("data"));
 }

 bool GoogleUrl::ResolveHelper(const GURL& base, const std::string& url) {
   gurl_ = base.Resolve(url);
   Init();
   return gurl_.is_valid();
 }

 bool GoogleUrl::Reset(const GoogleUrl& base, const GoogleString& str) {
   return ResolveHelper(base.gurl_, str);
 }

 bool GoogleUrl::Reset(const GoogleUrl& base, StringPiece sp) {
   return ResolveHelper(base.gurl_, sp.as_string());
 }

 bool GoogleUrl::Reset(const GoogleUrl& base, const char* str) {
   return ResolveHelper(base.gurl_, str);
 }

 bool GoogleUrl::Reset(StringPiece new_value) {
   gurl_ = GURL(new_value.as_string());
   Init();
   return gurl_.is_valid();
 }

 bool GoogleUrl::Reset(const GoogleUrl& new_value) {
   gurl_ = GURL(new_value.gurl_);
   Init();
   return gurl_.is_valid();
 }

 void GoogleUrl::Clear() {
   gurl_ = GURL();
   Init();
 }

 // Returns the cached flag computed by Init(): true when the URL is valid and
 // its scheme is http or https.
 bool GoogleUrl::IsWebValid() const {
   // Debug-only check that the cached flag still matches a fresh computation.
   DCHECK(is_web_valid_ ==
          (gurl_.is_valid() && (SchemeIs("http") || SchemeIs("https"))));
   return is_web_valid_;
 }

 // Returns the cached flag computed by Init(): true when the URL is valid and
 // its scheme is http, https or data.
 bool GoogleUrl::IsWebOrDataValid() const {
   // Debug-only check that the cached flag still matches a fresh computation.
   DCHECK(is_web_or_data_valid_ ==
          (gurl_.is_valid() && (SchemeIs("http") || SchemeIs("https") ||
                                SchemeIs("data"))));
   return is_web_or_data_valid_;
 }

 // Returns whether the wrapped GURL is valid, regardless of its scheme.
 bool GoogleUrl::IsAnyValid() const {
   return gurl_.is_valid();
 }

 // Escapes |unescaped_name| (and |unescaped_value|, when it has non-NULL
 // data) with EscapeQueryParam() and forwards to
 // CopyAndAddEscapedQueryParam().  A value whose data() is NULL is forwarded
 // as NULL rather than as an escaped string.
 GoogleUrl* GoogleUrl::CopyAndAddQueryParam(
     StringPiece unescaped_name, StringPiece unescaped_value) const {
   const GoogleString escaped_name = EscapeQueryParam(unescaped_name);
   if (unescaped_value.data() != NULL) {
     return CopyAndAddEscapedQueryParam(escaped_name,
                                        EscapeQueryParam(unescaped_value));
   }
   return CopyAndAddEscapedQueryParam(escaped_name, NULL);
 }

 // Returns a heap-allocated copy of this URL whose query is the current
 // query plus the already-escaped |escaped_name| / |escaped_value| pair.
 // The result is created with new; presumably the caller takes ownership.
 GoogleUrl* GoogleUrl::CopyAndAddEscapedQueryParam(
     StringPiece escaped_name, StringPiece escaped_value) const {
   // Re-serialize the existing parameters together with the new one.
   QueryParams params;
   params.ParseFromUrl(*this);
   params.AddEscaped(escaped_name, escaped_value);
   const GoogleString new_query = params.ToEscapedString();

   // Describe all of |new_query| as the replacement query component.
   url::Component whole_query;
   whole_query.len = new_query.size();
   url::Replacements<char> replacements;
   replacements.SetQuery(new_query.c_str(), whole_query);

   return new GoogleUrl(gurl_.ReplaceComponents(replacements));
 }

 size_t GoogleUrl::LeafEndPosition(const GURL& gurl) {
   url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
   if (parsed.path.is_valid()) {
     return parsed.path.end();
   }
   if (parsed.port.is_valid()) {
     return parsed.port.end();
   }
   if (parsed.host.is_valid()) {
     return parsed.host.end();
   }
   if (parsed.password.is_valid()) {
     return parsed.password.end();
   }
   if (parsed.username.is_valid()) {
     return parsed.username.end();
   }
   if (parsed.scheme.is_valid()) {
     return parsed.scheme.end();
   }
   return npos;
 }

 // Returns the offset at which the leaf ends in valid url spec.
 // If there is no path, steps backward until valid end is found.
 size_t GoogleUrl::LeafEndPosition() const {
   return LeafEndPosition(gurl_);
 }

 size_t GoogleUrl::LeafStartPosition(const GURL& gurl) {
   url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
   size_t start_reverse_search_from = npos;
   if (parsed.query.is_valid() && (parsed.query.begin > 0)) {
     // query includes '?', so start the search from the character
     // before it.
     start_reverse_search_from = parsed.query.begin - 1;
   }
   return gurl.possibly_invalid_spec().rfind('/', start_reverse_search_from);
 }

 // Returns the offset at which the leaf starts in the fully
 // qualified spec.
 size_t GoogleUrl::LeafStartPosition() const {
   return LeafStartPosition(gurl_);
 }

 size_t GoogleUrl::PathStartPosition(const GURL& gurl) {
   const std::string& spec = gurl.spec();
   url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
   size_t origin_size = parsed.path.begin;
   if (!parsed.path.is_valid()) {
     origin_size = spec.size();
   }
   CHECK_LT(0, static_cast<int>(origin_size));
   CHECK_LE(origin_size, spec.size());
   return origin_size;
 }

 // Find the start of the path, includes '/'
 size_t GoogleUrl::PathStartPosition() const {
   return PathStartPosition(gurl_);
 }

 // Returns the prefix of the spec that ends where the leaf ends (see
 // LeafEndPosition()), i.e. everything before the query.  Returns an empty
 // piece for an invalid URL (after logging DFATAL) or when no end is found.
 StringPiece GoogleUrl::AllExceptQuery() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t prefix_length = LeafEndPosition();
   if (prefix_length == npos) {
     return StringPiece();
   }
   return StringPiece(gurl_.possibly_invalid_spec().data(), prefix_length);
 }

 // Returns the tail of the spec that follows the query, or that follows the
 // leaf when the URL has no query.  Returns an empty piece for an invalid URL
 // (after logging DFATAL) or when no starting offset can be determined.
 StringPiece GoogleUrl::AllAfterQuery() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const std::string& spec = gurl_.possibly_invalid_spec();
   const size_t tail_start = gurl_.has_query()
       ? static_cast<size_t>(
             gurl_.parsed_for_possibly_invalid_spec().query.end())
       : LeafEndPosition();
   if (tail_start == npos) {
     return StringPiece();
   }
   return StringPiece(spec.data() + tail_start, spec.size() - tail_start);
 }

 // Returns the spec up to and including the last slash that precedes the
 // query string, if any.  See http://en.wikipedia.org/wiki/URI_scheme -- the
 // query-string syntax is not well-defined, but the query separator is: it
 // is '?', which implies that the first '?' delimits the query string.
 // Returns an empty piece for an invalid URL (after logging DFATAL) or when
 // no slash is found.
 StringPiece GoogleUrl::AllExceptLeaf() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t slash_pos = LeafStartPosition();
   if (slash_pos == npos) {
     // No leaf found.
     return StringPiece();
   }
   // Keep the slash itself in the result.
   return StringPiece(gurl_.spec().data(), slash_pos + 1);
 }

 // Returns everything in the spec after the last slash that precedes the
 // query (see LeafStartPosition()), through the end of the spec.  Returns an
 // empty piece for an invalid URL (after logging DFATAL) or when no slash is
 // found.
 StringPiece GoogleUrl::LeafWithQuery() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t slash_pos = LeafStartPosition();
   if (slash_pos == npos) {
     // No slashes found.
     return StringPiece();
   }
   const std::string& spec = gurl_.spec();
   const size_t leaf_begin = slash_pos + 1;
   return StringPiece(spec.data() + leaf_begin, spec.size() - leaf_begin);
 }

 // Returns the leaf (the text after the last slash that precedes the query)
 // without the query string.  Returns an empty piece for an invalid URL
 // (after logging DFATAL) or when no slash is found.
 //
 // When the URL has a query, the leaf ends at the '?'.  The length is
 // measured up to the '?' itself rather than derived from the spec length,
 // so anything that follows the query (e.g. a "#fragment") cannot skew the
 // result.  When the URL has no query, everything after the last slash is
 // returned unchanged.
 StringPiece GoogleUrl::LeafSansQuery() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t leaf_start = LeafStartPosition();
   if (leaf_start == npos) {
     return StringPiece();
   }
   const size_t after_last_slash = leaf_start + 1;
   const std::string& spec = gurl_.spec();
   if (!gurl_.has_query()) {
     return StringPiece(spec.data() + after_last_slash,
                        spec.size() - after_last_slash);
   }
   const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
   if (!parsed.query.is_valid() || parsed.query.begin <= 0) {
     return StringPiece();
   }
   // parsed.query does not include the '?', which sits at begin - 1.
   // LeafStartPosition() searched backward from that same offset, so
   // after_last_slash can never exceed it; the check is purely defensive.
   const size_t question_mark = static_cast<size_t>(parsed.query.begin) - 1;
   if (question_mark < after_last_slash) {
     return StringPiece();
   }
   return StringPiece(spec.data() + after_last_slash,
                      question_mark - after_last_slash);
 }

 // Returns the part of the spec that precedes the path.  For
 // "http://a.com/b/c/d?e=f/g" this is "http://a.com", without a trailing
 // slash.  Returns an empty piece for an invalid URL (after logging DFATAL).
 StringPiece GoogleUrl::Origin() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t path_start = PathStartPosition();
   if (path_start == npos) {
     return StringPiece();
   }
   return StringPiece(gurl_.spec().data(), path_start);
 }

 // Returns the spec from the start of the path through the end.  For
 // "http://a.com/b/c/d?e=f/g" this is "/b/c/d?e=f/g", including the leading
 // slash.  Returns an empty piece for an invalid URL (after logging DFATAL).
 StringPiece GoogleUrl::PathAndLeaf() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t path_start = PathStartPosition();
   if (path_start == npos) {
     return StringPiece();
   }
   const std::string& spec = gurl_.spec();
   return StringPiece(spec.data() + path_start, spec.size() - path_start);
 }

 // Returns the path without its leaf.  For "http://a.com/b/c/d/g.html?q=v"
 // this is "/b/c/d/", including the leading and trailing slashes.  Returns an
 // empty piece for an invalid URL (after logging DFATAL) or when either the
 // path start or the leaf start cannot be found.
 StringPiece GoogleUrl::PathSansLeaf() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const size_t path_begin = PathStartPosition();
   const size_t last_slash = LeafStartPosition();
   if (path_begin == npos || last_slash == npos) {
     // Things like data: URLs do not have leaves, etc.
     return StringPiece();
   }
   // The result runs from the path's first character through the last slash.
   const size_t path_end = last_slash + 1;
   return StringPiece(gurl_.spec().data() + path_begin,
                      path_end - path_begin);
 }

 // Returns the spec with the scheme and the ':' that follows it removed, or
 // the whole spec when the URL has no scheme.  Returns an empty piece for an
 // invalid URL (after logging DFATAL).
 StringPiece GoogleUrl::NetPath() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   if (!gurl_.has_scheme()) {
     return Spec();
   }
   const std::string& spec = gurl_.possibly_invalid_spec();
   const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
   // Skip the scheme and the ':' right after it.
   const size_t after_colon = parsed.scheme.end() + 1;
   return StringPiece(spec.data() + after_colon, spec.size() - after_colon);
 }

 // Returns the filename portion of the path, as computed by
 // GURL::ExtractFileName(): everything after the last slash in the path.
 // This may be empty.  Returns an empty string for an invalid URL (after
 // logging DFATAL).
 GoogleString GoogleUrl::ExtractFileName() const {
   if (gurl_.is_valid()) {
     return gurl_.ExtractFileName();
   }

   LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
   return GoogleString();
 }

 // Returns the host component of the spec.  Returns an empty piece for an
 // invalid URL (after logging DFATAL) or when the URL has no host.
 StringPiece GoogleUrl::Host() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   if (!gurl_.has_host()) {
     return StringPiece();
   }
   const url::Component& host =
       gurl_.parsed_for_possibly_invalid_spec().host;
   return StringPiece(gurl_.spec().data() + host.begin, host.len);
 }

 // Returns the host followed by ":port" when a port is present in the spec.
 // Returns an empty piece for an invalid URL (after logging DFATAL) or when
 // the URL has no host.
 StringPiece GoogleUrl::HostAndPort() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   if (!gurl_.has_host()) {
     return StringPiece();
   }
   const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
   // With a port the length is host + ':' + port.  NOTE(review): the
   // no-port case relies on an absent component having len == -1, which
   // cancels the +1 for the ':' -- confirm against url::Component.
   const int host_and_port_len = parsed.host.len + parsed.port.len + 1;
   return StringPiece(gurl_.spec().data() + parsed.host.begin,
                      host_and_port_len);
 }

 // Returns the path component of the spec, without the query.  Returns an
 // empty piece for an invalid URL (after logging DFATAL) or when the URL has
 // no valid path.
 StringPiece GoogleUrl::PathSansQuery() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
   const size_t path_begin = PathStartPosition();
   if (path_begin == npos || !parsed.path.is_valid()) {
     return StringPiece();
   }
   return StringPiece(gurl_.spec().data() + path_begin, parsed.path.len);
 }

 // Returns the query component of the spec.  Returns an empty piece for an
 // invalid URL (after logging DFATAL) or when the URL has no query.
 StringPiece GoogleUrl::Query() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   if (!gurl_.has_query()) {
     return StringPiece();
   }
   const url::Component& query =
       gurl_.parsed_for_possibly_invalid_spec().query;
   return StringPiece(gurl_.spec().data() + query.begin, query.len);
 }

 // Returns the scheme component of the spec.  Returns an empty piece for an
 // invalid URL (after logging DFATAL) or when the URL has no scheme.
 StringPiece GoogleUrl::Scheme() const {
   if (!gurl_.is_valid()) {
     LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
     return StringPiece();
   }

   if (!gurl_.has_scheme()) {
     return StringPiece();
   }
   const url::Component& scheme =
       gurl_.parsed_for_possibly_invalid_spec().scheme;
   return StringPiece(gurl_.spec().data() + scheme.begin, scheme.len);
 }

 // Returns a view of the whole string returned by GURL::spec().  The piece
 // points into storage owned by gurl_.
 StringPiece GoogleUrl::Spec() const {
   const std::string& full_spec = gurl_.spec();
   const char* const first = full_spec.data();
   const size_t length = full_spec.size();
   return StringPiece(first, length);
 }

 // Returns a view of the whole string returned by
 // GURL::possibly_invalid_spec(), without any validity check here.  The
 // piece points into storage owned by gurl_.
 StringPiece GoogleUrl::UncheckedSpec() const {
   const std::string& raw_spec = gurl_.possibly_invalid_spec();
   const char* const first = raw_spec.data();
   const size_t length = raw_spec.size();
   return StringPiece(first, length);
 }

 // Classifies |url|:
 //   kAbsoluteUrl  - it parses as a valid URL on its own;
 //   kNetPath      - otherwise, it starts with "//";
 //   kAbsolutePath - otherwise, it starts with "/";
 //   kRelativePath - anything else.
 UrlRelativity GoogleUrl::FindRelativity(StringPiece url) {
   const GoogleUrl standalone(url);
   if (standalone.IsAnyValid()) {
     return kAbsoluteUrl;
   }
   // "//" must be tested before "/" because it also starts with a slash.
   if (url.starts_with("//")) {
     return kNetPath;
   }
   if (url.starts_with("/")) {
     return kAbsolutePath;
   }
   return kRelativePath;
 }

 // Returns a form of this URL, relative to |base_url|, of the kind requested
 // by |url_relativity|.  Falls back to the absolute spec when the requested
 // form does not apply or does not resolve back to this URL.
 StringPiece GoogleUrl::Relativize(UrlRelativity url_relativity,
                                   const GoogleUrl& base_url) const {
   const StringPiece absolute = Spec();
   // Default, in case we cannot relativize appropriately.
   StringPiece candidate = absolute;

   switch (url_relativity) {
     case kRelativePath: {
       // Strip the base's directory prefix when this URL lives under it.
       const StringPiece base_dir = base_url.AllExceptLeaf();
       if (absolute.starts_with(base_dir)) {
         candidate = absolute.substr(base_dir.size());
       }
       break;  // TODO(sligocki): Should we fall through here?
     }
     case kAbsolutePath:
       // Only usable when both URLs share an origin.
       if (Origin() == base_url.Origin()) {
         candidate = PathAndLeaf();
       }
       break;
     case kNetPath:
       // Only usable when both URLs share a scheme.
       if (Scheme() == base_url.Scheme()) {
         candidate = NetPath();
       }
       break;
     case kAbsoluteUrl:
       // |candidate| already holds the absolute spec.
       break;
   }

   // There are several corner cases that the naive algorithm above fails on.
   // Ex: http://foo.com/?bar or http://foo.com//bar relative to
   // http://foo.com/bar.html. Check if the candidate resolves correctly and
   // if not, return the absolute URL.
   const GoogleUrl round_trip(base_url, candidate);
   if (round_trip != *this) {
     return absolute;
   }
   return candidate;
 }

 namespace {

 // Parsing states for GoogleUrl::Unescape
 enum UnescapeState {
   NORMAL,   // We are not in the middle of parsing an escape.
   ESCAPE1,  // We just parsed % .
   ESCAPE2   // We just parsed %X for some hex digit X.
 };

 int HexStringToInt(const GoogleString& value) {
   uint32 good_val = 0;
   for (int c = 0, n = value.size(); c < n; ++c) {
     bool ok = AccumulateHexValue(value[c], &good_val);
     if (!ok) {
       return -1;
     }
   }
   return static_cast<int>(good_val);
 }

 }  // namespace

 // Decodes %XX escapes in |escaped|, and turns '+' into ' ' when
 // |convert_plus_to_space| is set.  A '%' that is not followed by two hex
 // digits (including one cut short by the end of the input) is passed
 // through literally, and the characters after it are processed as ordinary
 // input.
 GoogleString GoogleUrl::UnescapeHelper(StringPiece escaped,
                                        bool convert_plus_to_space) {
   GoogleString unescaped;
   const size_t length = escaped.size();
   size_t pos = 0;
   while (pos < length) {
     const char c = escaped[pos];
     const bool is_complete_escape =
         (c == '%') && (length - pos > 2) &&
         IsHexDigit(escaped[pos + 1]) && IsHexDigit(escaped[pos + 2]);
     if (is_complete_escape) {
       // Well-formed escape: replace all three characters by one byte.
       const GoogleString hex_pair(escaped.data() + pos + 1, 2);
       const unsigned char decoded = HexStringToInt(hex_pair);
       unescaped.push_back(decoded);
       pos += 3;
     } else {
       // Ordinary character, or a malformed '%' which is kept as is.
       const bool plus_as_space = (c == '+') && convert_plus_to_space;
       unescaped.push_back(plus_as_space ? ' ' : c);
       ++pos;
     }
   }
   return unescaped;
 }

 // Escapes |unescaped| for use as a query-parameter name or value.
 // See http://en.wikipedia.org/wiki/Query_string#URL_encoding
 //   - ASCII alphanumerics and '.', '~', '_', '-' are copied unchanged;
 //   - ' ' becomes '+';
 //   - every other byte becomes %xx with lower-case hex digits.
 GoogleString GoogleUrl::EscapeQueryParam(StringPiece unescaped) {
   GoogleString escaped;
   for (size_t i = 0, n = unescaped.size(); i < n; ++i) {
     const char c = unescaped[i];
     const bool is_unreserved = IsAsciiAlphaNumeric(c) || (c == '.') ||
                                (c == '~') || (c == '_') || (c == '-');
     if (is_unreserved) {
       // Do not escape unreserved chars.
       escaped.push_back(c);
     } else if (c == ' ') {
       // Space can be escaped as '+' in query params.
       escaped.push_back('+');
     } else {
       // Escape reserved chars (ex: '/') and everything else.  Going through
       // unsigned char keeps bytes >= 0x80 from being sign-extended.
       const unsigned int byte_value = static_cast<unsigned char>(c);
       StrAppend(&escaped, StringPrintf("%%%02x", byte_value));
     }
   }
   return escaped;
 }

 // The reserved characters of RFC 3986, Section 2.2:
 //      reserved    = gen-delims / sub-delims
 //
 //      gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 //
 //      sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
 //                  / "*" / "+" / "," / ";" / "="
 const char GoogleUrl::kReservedChars[] = ":/?#[]@!$&'()*+,;=";

 // Returns true if |c| is one of the characters listed in kReservedChars.
 bool GoogleUrl::IsReservedChar(char c) {
   // The searched range ends before the terminating NUL of the array.
   const char* const reserved_end =
       kReservedChars + STATIC_STRLEN(kReservedChars);
   const char* const match = std::find(kReservedChars, reserved_end, c);
   return match != reserved_end;
 }

 // Returns |url| with every character percent-escaped (upper-case hex)
 // unless it is an ASCII alphanumeric, one of '.', '~', '_', '-', a reserved
 // character (see IsReservedChar()), or '%'.
 GoogleString GoogleUrl::Sanitize(StringPiece url) {
   GoogleString escaped;
   for (size_t i = 0, n = url.size(); i < n; ++i) {
     const char c = url[i];
     // Do not escape unreserved nor reserved chars (ex: '/', ':', '#', '?')
     // nor '%' (to avoid double escaping).
     const bool keep_as_is = IsAsciiAlphaNumeric(c) || (c == '.') ||
                             (c == '~') || (c == '_') || (c == '-') ||
                             (c == '%') || IsReservedChar(c);
     if (keep_as_is) {
       escaped.push_back(c);
     } else {
       // Escape uncategorized chars (ex: ' ', '^', '"')
       const unsigned int byte_value = static_cast<unsigned char>(c);
       StrAppend(&escaped, StringPrintf("%%%02X", byte_value));
     }
   }
   return escaped;
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)
	// nforman@google.com (Naomi Forman)

	#include "pagespeed/kernel/http/google_url.h"

	#include <algorithm> // for std::find
	#include <cstddef>
	#include <string>

	#include "base/logging.h"
	#include "pagespeed/kernel/base/string.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/http/query_params.h"

	namespace net_instaweb {

	// Sentinel offset meaning "no such position".  It is std::string::npos, so
	// the result of std::string::rfind() can be compared against it directly.
	const size_t GoogleUrl::npos = std::string::npos;

	// Each of the constructors below initializes the wrapped GURL from its
	// argument and then calls Init() to cache the validity flags.

	// Wraps a default-constructed GURL.
	GoogleUrl::GoogleUrl() : gurl_() { Init(); }

	// Wraps a copy of |gurl|.
	GoogleUrl::GoogleUrl(const GURL& gurl) : gurl_(gurl) { Init(); }

	// Builds the GURL from the string |spec|.
	GoogleUrl::GoogleUrl(const GoogleString& spec) : gurl_(spec) { Init(); }

	// Builds the GURL from a string copy of the characters referenced by |sp|.
	GoogleUrl::GoogleUrl(StringPiece sp) : gurl_(sp.as_string()) { Init(); }

	// Builds the GURL from the C string |str|.
	GoogleUrl::GoogleUrl(const char* str) : gurl_(str) { Init(); }

	// Resolving constructors: each one resolves its second argument against
	// |base| by delegating to the matching Reset() overload, which also
	// refreshes the cached validity flags.
	GoogleUrl::GoogleUrl(const GoogleUrl& base, const GoogleString& str) {
	  Reset(base, str);
	}

	GoogleUrl::GoogleUrl(const GoogleUrl& base, StringPiece sp) {
	  Reset(base, sp);
	}

	GoogleUrl::GoogleUrl(const GoogleUrl& base, const char* str) {
	  Reset(base, str);
	}

	// Exchanges the complete state (wrapped GURL plus both cached validity
	// flags) of this object with that of |google_url|.
	void GoogleUrl::Swap(GoogleUrl* google_url) {
	  gurl_.Swap(&google_url->gurl_);

	  const bool my_web_valid = is_web_valid_;
	  is_web_valid_ = google_url->is_web_valid_;
	  google_url->is_web_valid_ = my_web_valid;

	  const bool my_web_or_data_valid = is_web_or_data_valid_;
	  is_web_or_data_valid_ = google_url->is_web_or_data_valid_;
	  google_url->is_web_or_data_valid_ = my_web_or_data_valid;
	}

	// Recomputes the cached validity flags from the current gurl_:
	//   is_web_valid_         - valid and the scheme is http or https.
	//   is_web_or_data_valid_ - the above, or valid with the data scheme.
	// (The logical-or operators had been mangled to "\|\|", which does not
	// compile; they are restored here.)
	void GoogleUrl::Init() {
	  const bool valid = gurl_.is_valid();
	  is_web_valid_ = valid && (SchemeIs("http") || SchemeIs("https"));
	  is_web_or_data_valid_ = is_web_valid_ || (valid && SchemeIs("data"));
	}

	bool GoogleUrl::ResolveHelper(const GURL& base, const std::string& url) {
	gurl_ = base.Resolve(url);
	Init();
	return gurl_.is_valid();
	}

	bool GoogleUrl::Reset(const GoogleUrl& base, const GoogleString& str) {
	return ResolveHelper(base.gurl_, str);
	}

	bool GoogleUrl::Reset(const GoogleUrl& base, StringPiece sp) {
	return ResolveHelper(base.gurl_, sp.as_string());
	}

	bool GoogleUrl::Reset(const GoogleUrl& base, const char* str) {
	return ResolveHelper(base.gurl_, str);
	}

	bool GoogleUrl::Reset(StringPiece new_value) {
	gurl_ = GURL(new_value.as_string());
	Init();
	return gurl_.is_valid();
	}

	bool GoogleUrl::Reset(const GoogleUrl& new_value) {
	gurl_ = GURL(new_value.gurl_);
	Init();
	return gurl_.is_valid();
	}

	void GoogleUrl::Clear() {
	gurl_ = GURL();
	Init();
	}

	// Returns the cached flag computed by Init(): true when the URL is valid and
	// its scheme is http or https.
	// (The logical-or operator had been mangled to "\|\|", which does not
	// compile; it is restored here.)
	bool GoogleUrl::IsWebValid() const {
	  // Debug-only check that the cached flag still matches a fresh computation.
	  DCHECK(is_web_valid_ ==
	         (gurl_.is_valid() && (SchemeIs("http") || SchemeIs("https"))));
	  return is_web_valid_;
	}

	// Returns the cached flag computed by Init(): true when the URL is valid and
	// its scheme is http, https or data.
	// (The logical-or operators had been mangled to "\|\|", which does not
	// compile; they are restored here.)
	bool GoogleUrl::IsWebOrDataValid() const {
	  // Debug-only check that the cached flag still matches a fresh computation.
	  DCHECK(is_web_or_data_valid_ ==
	         (gurl_.is_valid() && (SchemeIs("http") || SchemeIs("https") ||
	                               SchemeIs("data"))));
	  return is_web_or_data_valid_;
	}

	// Returns whether the wrapped GURL is valid, regardless of its scheme.
	bool GoogleUrl::IsAnyValid() const {
	return gurl_.is_valid();
	}

	// Escapes |unescaped_name| (and |unescaped_value|, when it has non-NULL
	// data) with EscapeQueryParam() and forwards to
	// CopyAndAddEscapedQueryParam().  A value whose data() is NULL is forwarded
	// as NULL rather than as an escaped string.
	GoogleUrl* GoogleUrl::CopyAndAddQueryParam(
	    StringPiece unescaped_name, StringPiece unescaped_value) const {
	  const GoogleString escaped_name = EscapeQueryParam(unescaped_name);
	  if (unescaped_value.data() != NULL) {
	    return CopyAndAddEscapedQueryParam(escaped_name,
	                                       EscapeQueryParam(unescaped_value));
	  }
	  return CopyAndAddEscapedQueryParam(escaped_name, NULL);
	}

	// Returns a heap-allocated copy of this URL whose query is the current
	// query plus the already-escaped |escaped_name| / |escaped_value| pair.
	// The result is created with new; presumably the caller takes ownership.
	GoogleUrl* GoogleUrl::CopyAndAddEscapedQueryParam(
	    StringPiece escaped_name, StringPiece escaped_value) const {
	  // Re-serialize the existing parameters together with the new one.
	  QueryParams params;
	  params.ParseFromUrl(*this);
	  params.AddEscaped(escaped_name, escaped_value);
	  const GoogleString new_query = params.ToEscapedString();

	  // Describe all of |new_query| as the replacement query component.
	  url::Component whole_query;
	  whole_query.len = new_query.size();
	  url::Replacements<char> replacements;
	  replacements.SetQuery(new_query.c_str(), whole_query);

	  return new GoogleUrl(gurl_.ReplaceComponents(replacements));
	}

	size_t GoogleUrl::LeafEndPosition(const GURL& gurl) {
	url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
	if (parsed.path.is_valid()) {
	return parsed.path.end();
	}
	if (parsed.port.is_valid()) {
	return parsed.port.end();
	}
	if (parsed.host.is_valid()) {
	return parsed.host.end();
	}
	if (parsed.password.is_valid()) {
	return parsed.password.end();
	}
	if (parsed.username.is_valid()) {
	return parsed.username.end();
	}
	if (parsed.scheme.is_valid()) {
	return parsed.scheme.end();
	}
	return npos;
	}

	// Returns the offset at which the leaf ends in valid url spec.
	// If there is no path, steps backward until valid end is found.
	size_t GoogleUrl::LeafEndPosition() const {
	return LeafEndPosition(gurl_);
	}

	size_t GoogleUrl::LeafStartPosition(const GURL& gurl) {
	url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
	size_t start_reverse_search_from = npos;
	if (parsed.query.is_valid() && (parsed.query.begin > 0)) {
	// query includes '?', so start the search from the character
	// before it.
	start_reverse_search_from = parsed.query.begin - 1;
	}
	return gurl.possibly_invalid_spec().rfind('/', start_reverse_search_from);
	}

	// Returns the offset at which the leaf starts in the fully
	// qualified spec.
	size_t GoogleUrl::LeafStartPosition() const {
	return LeafStartPosition(gurl_);
	}

	size_t GoogleUrl::PathStartPosition(const GURL& gurl) {
	const std::string& spec = gurl.spec();
	url::Parsed parsed = gurl.parsed_for_possibly_invalid_spec();
	size_t origin_size = parsed.path.begin;
	if (!parsed.path.is_valid()) {
	origin_size = spec.size();
	}
	CHECK_LT(0, static_cast<int>(origin_size));
	CHECK_LE(origin_size, spec.size());
	return origin_size;
	}

	// Find the start of the path, includes '/'
	size_t GoogleUrl::PathStartPosition() const {
	return PathStartPosition(gurl_);
	}

	// Returns the prefix of the spec that ends where the leaf ends (see
	// LeafEndPosition()), i.e. everything before the query.  Returns an empty
	// piece for an invalid URL (after logging DFATAL) or when no end is found.
	StringPiece GoogleUrl::AllExceptQuery() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t prefix_length = LeafEndPosition();
	  if (prefix_length == npos) {
	    return StringPiece();
	  }
	  return StringPiece(gurl_.possibly_invalid_spec().data(), prefix_length);
	}

	// Returns the tail of the spec that follows the query, or that follows the
	// leaf when the URL has no query.  Returns an empty piece for an invalid URL
	// (after logging DFATAL) or when no starting offset can be determined.
	StringPiece GoogleUrl::AllAfterQuery() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const std::string& spec = gurl_.possibly_invalid_spec();
	  const size_t tail_start = gurl_.has_query()
	      ? static_cast<size_t>(
	            gurl_.parsed_for_possibly_invalid_spec().query.end())
	      : LeafEndPosition();
	  if (tail_start == npos) {
	    return StringPiece();
	  }
	  return StringPiece(spec.data() + tail_start, spec.size() - tail_start);
	}

	// Returns the spec up to and including the last slash that precedes the
	// query string, if any.  See http://en.wikipedia.org/wiki/URI_scheme -- the
	// query-string syntax is not well-defined, but the query separator is: it
	// is '?', which implies that the first '?' delimits the query string.
	// Returns an empty piece for an invalid URL (after logging DFATAL) or when
	// no slash is found.
	StringPiece GoogleUrl::AllExceptLeaf() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t slash_pos = LeafStartPosition();
	  if (slash_pos == npos) {
	    // No leaf found.
	    return StringPiece();
	  }
	  // Keep the slash itself in the result.
	  return StringPiece(gurl_.spec().data(), slash_pos + 1);
	}

	// Returns everything in the spec after the last slash that precedes the
	// query (see LeafStartPosition()), through the end of the spec.  Returns an
	// empty piece for an invalid URL (after logging DFATAL) or when no slash is
	// found.
	StringPiece GoogleUrl::LeafWithQuery() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t slash_pos = LeafStartPosition();
	  if (slash_pos == npos) {
	    // No slashes found.
	    return StringPiece();
	  }
	  const std::string& spec = gurl_.spec();
	  const size_t leaf_begin = slash_pos + 1;
	  return StringPiece(spec.data() + leaf_begin, spec.size() - leaf_begin);
	}

	// Returns the leaf (the text after the last slash that precedes the query)
	// without the query string.  Returns an empty piece for an invalid URL
	// (after logging DFATAL) or when no slash is found.
	//
	// When the URL has a query, the leaf ends at the '?'.  The length is
	// measured up to the '?' itself rather than derived from the spec length,
	// so anything that follows the query (e.g. a "#fragment") cannot skew the
	// result.  When the URL has no query, everything after the last slash is
	// returned unchanged.
	StringPiece GoogleUrl::LeafSansQuery() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t leaf_start = LeafStartPosition();
	  if (leaf_start == npos) {
	    return StringPiece();
	  }
	  const size_t after_last_slash = leaf_start + 1;
	  const std::string& spec = gurl_.spec();
	  if (!gurl_.has_query()) {
	    return StringPiece(spec.data() + after_last_slash,
	                       spec.size() - after_last_slash);
	  }
	  const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
	  if (!parsed.query.is_valid() || parsed.query.begin <= 0) {
	    return StringPiece();
	  }
	  // parsed.query does not include the '?', which sits at begin - 1.
	  // LeafStartPosition() searched backward from that same offset, so
	  // after_last_slash can never exceed it; the check is purely defensive.
	  const size_t question_mark = static_cast<size_t>(parsed.query.begin) - 1;
	  if (question_mark < after_last_slash) {
	    return StringPiece();
	  }
	  return StringPiece(spec.data() + after_last_slash,
	                     question_mark - after_last_slash);
	}

	// Returns the part of the spec that precedes the path.  For
	// "http://a.com/b/c/d?e=f/g" this is "http://a.com", without a trailing
	// slash.  Returns an empty piece for an invalid URL (after logging DFATAL).
	StringPiece GoogleUrl::Origin() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t path_start = PathStartPosition();
	  if (path_start == npos) {
	    return StringPiece();
	  }
	  return StringPiece(gurl_.spec().data(), path_start);
	}

	// Returns the spec from the start of the path through the end.  For
	// "http://a.com/b/c/d?e=f/g" this is "/b/c/d?e=f/g", including the leading
	// slash.  Returns an empty piece for an invalid URL (after logging DFATAL).
	StringPiece GoogleUrl::PathAndLeaf() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t path_start = PathStartPosition();
	  if (path_start == npos) {
	    return StringPiece();
	  }
	  const std::string& spec = gurl_.spec();
	  return StringPiece(spec.data() + path_start, spec.size() - path_start);
	}

	// Returns the path without its leaf.  For "http://a.com/b/c/d/g.html?q=v"
	// this is "/b/c/d/", including the leading and trailing slashes.  Returns an
	// empty piece for an invalid URL (after logging DFATAL) or when either the
	// path start or the leaf start cannot be found.
	// (The logical-or operator had been mangled to "\|\|", which does not
	// compile; it is restored here.)
	StringPiece GoogleUrl::PathSansLeaf() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const size_t path_start = PathStartPosition();
	  const size_t leaf_start = LeafStartPosition();
	  if (path_start == npos || leaf_start == npos) {
	    // Things like data: URLs do not have leaves, etc.
	    return StringPiece();
	  }
	  // The result runs from the path's first character through the last slash.
	  const size_t after_last_slash = leaf_start + 1;
	  return StringPiece(gurl_.spec().data() + path_start,
	                     after_last_slash - path_start);
	}

	// Returns the spec with the scheme and the ':' that follows it removed, or
	// the whole spec when the URL has no scheme.  Returns an empty piece for an
	// invalid URL (after logging DFATAL).
	StringPiece GoogleUrl::NetPath() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  if (!gurl_.has_scheme()) {
	    return Spec();
	  }
	  const std::string& spec = gurl_.possibly_invalid_spec();
	  const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
	  // Skip the scheme and the ':' right after it.
	  const size_t after_colon = parsed.scheme.end() + 1;
	  return StringPiece(spec.data() + after_colon, spec.size() - after_colon);
	}

	// Returns the filename portion of the path, as computed by
	// GURL::ExtractFileName(): everything after the last slash in the path.
	// This may be empty.  Returns an empty string for an invalid URL (after
	// logging DFATAL).
	GoogleString GoogleUrl::ExtractFileName() const {
	  if (gurl_.is_valid()) {
	    return gurl_.ExtractFileName();
	  }

	  LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	  return GoogleString();
	}

	// Returns the host component of the URL as a view into the spec.  Returns an
	// empty StringPiece when the URL has no host.  Logs at DFATAL and returns an
	// empty StringPiece for an invalid URL.
	StringPiece GoogleUrl::Host() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  if (!gurl_.has_host()) {
	    return StringPiece();
	  }

	  // Slice exactly the parsed host range out of the spec.
	  const url::Parsed& parts = gurl_.parsed_for_possibly_invalid_spec();
	  const char* host_begin = gurl_.spec().data() + parts.host.begin;
	  return StringPiece(host_begin, parts.host.len);
	}

	// Returns the host together with the port that follows it, as a view into the
	// spec.  Returns an empty StringPiece when the URL has no host.  Logs at
	// DFATAL and returns an empty StringPiece for an invalid URL.
	StringPiece GoogleUrl::HostAndPort() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  if (!gurl_.has_host()) {
	    return StringPiece();
	  }

	  const url::Parsed& parts = gurl_.parsed_for_possibly_invalid_spec();
	  // Extra characters after the host: the separator plus the port digits.
	  // NOTE(review): this presumably relies on an absent port having len == -1,
	  // so that the sum below is 0 and only the host is returned -- confirm
	  // against url::Component.
	  const int port_suffix_len = parts.port.len + 1;
	  return StringPiece(gurl_.spec().data() + parts.host.begin,
	                     parts.host.len + port_suffix_len);
	}

	// Returns the parsed path component of the URL: parsed.path.len characters
	// starting at PathStartPosition().  Returns an empty StringPiece when the
	// path start cannot be located or the parsed path component is not valid.
	// Logs at DFATAL and returns an empty StringPiece for an invalid URL.
	StringPiece GoogleUrl::PathSansQuery() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  const url::Parsed& parsed = gurl_.parsed_for_possibly_invalid_spec();
	  const size_t path_start = PathStartPosition();
	  if (path_start == npos || !parsed.path.is_valid()) {
	    return StringPiece();
	  }
	  return StringPiece(gurl_.spec().data() + path_start, parsed.path.len);
	}

	// Returns the parsed query component as a view into the spec, or an empty
	// StringPiece when the URL has no query.  Logs at DFATAL and returns an empty
	// StringPiece for an invalid URL.
	StringPiece GoogleUrl::Query() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  if (gurl_.has_query()) {
	    const url::Parsed& parts = gurl_.parsed_for_possibly_invalid_spec();
	    const char* query_begin = gurl_.spec().data() + parts.query.begin;
	    return StringPiece(query_begin, parts.query.len);
	  }
	  return StringPiece();
	}

	// Returns the parsed scheme component as a view into the spec, or an empty
	// StringPiece when the URL has no scheme.  Logs at DFATAL and returns an
	// empty StringPiece for an invalid URL.
	StringPiece GoogleUrl::Scheme() const {
	  if (!gurl_.is_valid()) {
	    LOG(DFATAL) << "Invalid URL: " << gurl_.possibly_invalid_spec();
	    return StringPiece();
	  }

	  if (gurl_.has_scheme()) {
	    const url::Parsed& parts = gurl_.parsed_for_possibly_invalid_spec();
	    const char* scheme_begin = gurl_.spec().data() + parts.scheme.begin;
	    return StringPiece(scheme_begin, parts.scheme.len);
	  }
	  return StringPiece();
	}

	// Returns a view over the string returned by gurl_.spec().  The view does not
	// own its data; it points into storage held by gurl_.
	StringPiece GoogleUrl::Spec() const {
	  const std::string& checked_spec = gurl_.spec();
	  const char* begin = checked_spec.data();
	  return StringPiece(begin, checked_spec.size());
	}

	// Like Spec(), but reads gurl_.possibly_invalid_spec() instead of
	// gurl_.spec().  The returned view points into storage held by gurl_.
	StringPiece GoogleUrl::UncheckedSpec() const {
	  const std::string& raw_spec = gurl_.possibly_invalid_spec();
	  const char* begin = raw_spec.data();
	  return StringPiece(begin, raw_spec.size());
	}

	// Classifies |url| by how much of an absolute URL it spells out:
	//   - kAbsoluteUrl  if it parses as a valid URL on its own,
	//   - kNetPath      if it starts with "//",
	//   - kAbsolutePath if it starts with "/",
	//   - kRelativePath otherwise.
	// The checks are applied in that order.
	UrlRelativity GoogleUrl::FindRelativity(StringPiece url) {
	  GoogleUrl standalone(url);
	  if (standalone.IsAnyValid()) {
	    return kAbsoluteUrl;
	  }
	  // "//" must be tested before "/" since the former also starts with "/".
	  if (url.starts_with("//")) {
	    return kNetPath;
	  }
	  if (url.starts_with("/")) {
	    return kAbsolutePath;
	  }
	  return kRelativePath;
	}

	// Returns a form of this URL shortened to the requested |url_relativity|
	// with respect to |base_url|, or the full Spec() when that is not possible.
	//
	//   kRelativePath: strips base_url.AllExceptLeaf() if the spec starts with it.
	//   kAbsolutePath: PathAndLeaf() if both URLs share the same Origin().
	//   kNetPath:      NetPath() if both URLs share the same Scheme().
	//   kAbsoluteUrl:  the full Spec().
	//
	// The shortened form is only returned if resolving it against |base_url|
	// yields this URL again; otherwise the full Spec() is returned.
	StringPiece GoogleUrl::Relativize(UrlRelativity url_relativity,
	                                  const GoogleUrl& base_url) const {
	  // Fallback used whenever we cannot relativize appropriately.
	  const StringPiece absolute = Spec();
	  StringPiece candidate = absolute;

	  switch (url_relativity) {
	    case kRelativePath: {
	      const StringPiece base_dir = base_url.AllExceptLeaf();
	      if (absolute.starts_with(base_dir)) {
	        candidate = absolute.substr(base_dir.size());
	      }
	      break;  // TODO(sligocki): Should we fall through here?
	    }
	    case kAbsolutePath:
	      if (Origin() == base_url.Origin()) {
	        candidate = PathAndLeaf();
	      }
	      break;
	    case kNetPath:
	      if (Scheme() == base_url.Scheme()) {
	        candidate = NetPath();
	      }
	      break;
	    case kAbsoluteUrl:
	      // candidate already holds the full spec.
	      break;
	  }

	  // There are several corner cases that the naive algorithm above fails on.
	  // Ex: http://foo.com/?bar or http://foo.com//bar relative to
	  // http://foo.com/bar.html. Check that the candidate resolves back to this
	  // URL, and if it does not, return the absolute URL instead.
	  GoogleUrl round_trip(base_url, candidate);
	  return (round_trip != *this) ? absolute : candidate;
	}

	namespace {

	// Parsing states for GoogleUrl::Unescape
	enum UnescapeState {
	NORMAL, // We are not in the middle of parsing an escape.
	ESCAPE1, // We just parsed % .
	ESCAPE2 // We just parsed %X for some hex digit X.
	};

	int HexStringToInt(const GoogleString& value) {
	uint32 good_val = 0;
	for (int c = 0, n = value.size(); c < n; ++c) {
	bool ok = AccumulateHexValue(value[c], &good_val);
	if (!ok) {
	return -1;
	}
	}
	return static_cast<int>(good_val);
	}

	} // namespace

	// Decodes %XX escapes in |escaped| and returns the result.
	//
	// If |convert_plus_to_space| is true, every '+' seen outside of an escape is
	// emitted as ' '.  A '%' that is not followed by two hex digits is not an
	// error: the '%' and any hex digit already consumed are copied to the output
	// verbatim, and the offending character is then processed as ordinary input.
	GoogleString GoogleUrl::UnescapeHelper(StringPiece escaped,
	                                       bool convert_plus_to_space) {
	  GoogleString unescaped, escape_text;
	  UnescapeState state = NORMAL;
	  size_t iter = 0;
	  const size_t n = escaped.size();
	  while (iter < n) {
	    char c = escaped[iter];
	    switch (state) {
	      case NORMAL:
	        if (c == '%') {
	          escape_text.clear();
	          state = ESCAPE1;
	        } else {
	          if ((c == '+') && convert_plus_to_space) {
	            c = ' ';
	          }
	          unescaped.push_back(c);
	        }
	        ++iter;
	        break;
	      case ESCAPE1:
	        if (IsHexDigit(c)) {
	          escape_text.push_back(c);
	          state = ESCAPE2;
	          ++iter;
	        } else {
	          // Unexpected, % followed by non-hex chars, pass it through.
	          // iter is deliberately not advanced: c is re-read in NORMAL state.
	          unescaped.push_back('%');
	          state = NORMAL;
	        }
	        break;
	      case ESCAPE2:
	        if (IsHexDigit(c)) {
	          escape_text.push_back(c);
	          // escape_text now holds exactly two hex digits.
	          const unsigned char escape_value =
	              static_cast<unsigned char>(HexStringToInt(escape_text));
	          unescaped.push_back(escape_value);
	          state = NORMAL;
	          ++iter;
	        } else {
	          // Unexpected, % followed by non-hex chars, pass it through.
	          // iter is deliberately not advanced: c is re-read in NORMAL state.
	          unescaped.push_back('%');
	          unescaped.append(escape_text);
	          state = NORMAL;
	        }
	        break;
	    }
	  }
	  // Unexpected, % followed by end of string, pass it through.
	  if (state == ESCAPE1 || state == ESCAPE2) {
	    unescaped.push_back('%');
	    unescaped.append(escape_text);
	  }
	  return unescaped;
	}

	// Escapes |unescaped| for use as a query parameter name or value.
	//
	// ASCII alphanumerics and '.', '~', '_', '-' are copied unchanged, a space
	// becomes '+', and every other byte is written as "%xx" with two lowercase
	// hex digits.
	GoogleString GoogleUrl::EscapeQueryParam(StringPiece unescaped) {
	  GoogleString escaped;
	  for (const char* p = unescaped.data(), *e = p + unescaped.size();
	       p < e; ++p) {
	    // See http://en.wikipedia.org/wiki/Query_string#URL_encoding
	    const char c = *p;
	    if (IsAsciiAlphaNumeric(c) || (c == '.') || (c == '~') || (c == '_') ||
	        (c == '-')) {
	      // Do not escape unreserved chars.
	      escaped.push_back(c);
	    } else if (c == ' ') {
	      // Space can be escaped as '+' in query params.
	      escaped.push_back('+');
	    } else {
	      // Escape both reserved chars (ex: '/') and uncategorized chars
	      // (ex: '^').  Go through unsigned char so that bytes >= 0x80 do not
	      // sign-extend into more than two hex digits.
	      StrAppend(&escaped, StringPrintf(
	          "%%%02x", static_cast<unsigned int>(static_cast<unsigned char>(c))));
	    }
	  }
	  return escaped;
	}

	// From RFC 3986 Section 2.2 (Reserved Characters):
	//   reserved   = gen-delims / sub-delims
	//
	//   gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
	//
	//   sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
	//              / "*" / "+" / "," / ";" / "="
	const char GoogleUrl::kReservedChars[] =
	    ":/?#[]@"        // gen-delims
	    "!$&'()*+,;=";   // sub-delims

	// Returns true iff |c| is one of the characters listed in kReservedChars.
	// The search range stops before the array's terminating NUL, so '\0' is never
	// reported as reserved.
	bool GoogleUrl::IsReservedChar(char c) {
	  const char* const reserved_end =
	      kReservedChars + STATIC_STRLEN(kReservedChars);
	  const char* const hit = std::find(kReservedChars, reserved_end, c);
	  return hit != reserved_end;
	}

	// Returns a copy of |url| in which every byte that is neither unreserved
	// (ASCII alphanumeric, '.', '~', '_', '-'), nor reserved (per
	// IsReservedChar()), nor '%' is replaced by "%XX" with two uppercase hex
	// digits.  '%' is left alone so that already-escaped input is not escaped a
	// second time.
	GoogleString GoogleUrl::Sanitize(StringPiece url) {
	  GoogleString escaped;
	  for (const char* p = url.data(), *e = p + url.size(); p < e; ++p) {
	    const char c = *p;
	    if (IsAsciiAlphaNumeric(c) || (c == '.') || (c == '~') || (c == '_') ||
	        (c == '-') || (c == '%') || IsReservedChar(c)) {
	      // Do not escape unreserved nor reserved chars (ex: '/', ':', '#', '?')
	      // nor '%' (to avoid double escaping).
	      escaped.push_back(c);
	    } else {
	      // Escape uncategorized chars (ex: ' ', '^', '"').  Go through unsigned
	      // char so that bytes >= 0x80 do not sign-extend into more than two hex
	      // digits.
	      StrAppend(&escaped, StringPrintf(
	          "%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(c))));
	    }
	  }
	  return escaped;
	}

	} // namespace net_instaweb