| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmarantz@google.com (Joshua Marantz) |
| // nforman@google.com (Naomi Forman) |
| |
| #ifndef PAGESPEED_KERNEL_HTTP_GOOGLE_URL_H_ |
| #define PAGESPEED_KERNEL_HTTP_GOOGLE_URL_H_ |
| |
| #include <cstddef> |
| |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| |
| |
| #include "third_party/chromium/src/googleurl/src/gurl.h" |
| #include "third_party/chromium/src/googleurl/src/url_parse.h" |
| #include "third_party/chromium/src/googleurl/src/url_util.h" |
| |
| namespace net_instaweb { |
| |
| // Prepare for flattening of the namespaces in newer Chromiums: re-export the |
| // url_canon, url_parse and url_util names listed below under a single |
| // net_instaweb::url namespace, so that code in this project can refer to |
| // them as e.g. url::Parsed or url::PORT_UNSPECIFIED. |
| |
| namespace url { |
| |
| using url_canon::Replacements; |
| using url_parse::PORT_UNSPECIFIED; |
| using url_parse::PORT_INVALID; |
| using url_parse::Component; |
| using url_parse::Parsed; |
| using url_util::Initialize; |
| using url_util::Shutdown; |
| |
| } // namespace url |
| |
| |
| // Describes how much of a URL is spelled out in a URL string, ordered from |
| // fully absolute to a bare relative path.  Returned by |
| // GoogleUrl::FindRelativity() and accepted by GoogleUrl::Relativize().  The |
| // comment on each enumerator shows an example of that form. |
| enum UrlRelativity { |
| kAbsoluteUrl, // http://example.com/foo/bar/file.ext?k=v#f |
| kNetPath, // //example.com/foo/bar/file.ext?k=v#f |
| kAbsolutePath, // /foo/bar/file.ext?k=v#f |
| kRelativePath, // bar/file.ext?k=v#f |
| }; |
| |
| // GoogleUrl wraps a GURL (see gurl_ below) and exposes the pieces of the URL |
| // as StringPieces.  Most StringPiece accessors return views that are only |
| // valid for the lifetime of this object, as noted on the individual methods. |
| // |
| // The class is marked DISALLOW_COPY_AND_ASSIGN.  NOTE(review): presumably that |
| // forbids copy-construction and assignment, so use Reset(const GoogleUrl&) or |
| // Swap() instead -- confirm against basictypes.h. |
| class GoogleUrl { |
| public: |
| // Construct from a URL string.  Constructors cannot report failure, so check |
| // IsWebValid() / IsWebOrDataValid() / IsAnyValid() before using the result. |
| explicit GoogleUrl(const GoogleString& spec); |
| explicit GoogleUrl(StringPiece sp); |
| explicit GoogleUrl(const char* str); |
| // The following three constructors create a new GoogleUrl by resolving the |
| // String(Piece) against the base. |
| GoogleUrl(const GoogleUrl& base, const GoogleString& relative); |
| GoogleUrl(const GoogleUrl& base, StringPiece relative); |
| GoogleUrl(const GoogleUrl& base, const char* relative); |
| // NOTE(review): presumably constructs an empty (invalid) URL -- confirm in |
| // the implementation file. |
| GoogleUrl(); |
| |
| // NOTE(review): presumably exchanges the contents of this URL and |
| // *google_url -- the implementation is not visible in this header. |
| void Swap(GoogleUrl* google_url); |
| |
| // Re-initialize this object from a new URL, or from a relative reference |
| // resolved against base.  NOTE(review): the bool result presumably reports |
| // whether the resulting URL is valid -- confirm in the implementation file. |
| bool Reset(StringPiece new_url); |
| bool Reset(const GoogleUrl& new_url); |
| bool Reset(const GoogleUrl& base, const GoogleString& relative); |
| bool Reset(const GoogleUrl& base, StringPiece relative); |
| bool Reset(const GoogleUrl& base, const char* relative); |
| |
| // Resets this URL to be invalid. |
| void Clear(); |
| |
| // Returns true for a valid web (HTTP or HTTPS) URL. Most users will want this. |
| bool IsWebValid() const; |
| // Like IsWebValid(), but also allows data: URLs. |
| bool IsWebOrDataValid() const; |
| // Only use this if you don't care about the scheme and just need to know that |
| // the URL is well-formed. Note: This will accept things like "foo:bar". |
| bool IsAnyValid() const; |
| |
| // Returns a new GoogleUrl that is identical to this one but with additional |
| // query param. Name and value should both be unescaped. |
| // This is a factory method that returns a pointer, the caller is responsible |
| // for the management of the new object's memory (the caller owns the |
| // pointer). |
| GoogleUrl* CopyAndAddQueryParam(StringPiece unescaped_name, |
| StringPiece unescaped_value) const; |
| // Same as CopyAndAddQueryParam() but name and value must already be escaped. |
| // Most users should use CopyAndAddQueryParam() instead for safety. |
| GoogleUrl* CopyAndAddEscapedQueryParam( |
| StringPiece escaped_name, StringPiece escaped_value) const; |
| |
| // For "http://a.com/b/c/d?e=f/g#r" returns "http://a.com/b/c/d" |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece AllExceptQuery() const; |
| |
| // For "http://a.com/b/c/d?e=f#r" returns "#r" |
| // For "http://a.com/b/c/d?e=f#r1#r2" returns "#r1#r2" |
| // AllExceptQuery() + Query() + AllAfterQuery() = Spec() when url is valid |
| // Different from Parsed.ref in the case of multiple "#"s after "?" |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece AllAfterQuery() const; |
| |
| // For "http://a.com/b/c/d?e=f/g" returns "http://a.com/b/c/", |
| // including trailing slash. |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece AllExceptLeaf() const; |
| |
| // For "http://a.com/b/c/d?e=f/g" returns "d?e=f/g", omitting leading slash. |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece LeafWithQuery() const; |
| |
| // For "http://a.com/b/c/d?e=f/g" returns "d", omitting leading slash. |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece LeafSansQuery() const; |
| |
| // For "http://a.com/b/c/d?E=f/g" returns "/b/c/d?E=f/g" |
| // including leading slash |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece PathAndLeaf() const; |
| |
| // For "http://a.com/b/c/d/g.html" returns "/b/c/d/" including leading and |
| // trailing slashes. |
| // For queries, "http://a.com/b/c/d?E=f/g" returns "/b/c/". |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece PathSansLeaf() const; |
| |
| // For "http://a.com/b/c/d?E=f/g" returns "/b/c/d" including leading slash, |
| // and excluding the query. |
| StringPiece PathSansQuery() const; |
| |
| // Scheme-relative URL. Spec() == Scheme() + ":" + NetPath(). |
| // Named based on http://tools.ietf.org/html/rfc1808#section-2.2 |
| // For "http://a.com/b/c/d?E=f/g#r" returns "//a.com/b/c/d?E=f/g#r". |
| // For "file:///tmp/foo" returns "///tmp/foo". |
| StringPiece NetPath() const; |
| |
| // Extracts the filename portion of the path and returns it. The filename |
| // is everything after the last slash in the path. This may be empty. |
| GoogleString ExtractFileName() const; |
| |
| // Returns the host of the URL.  NOTE(review): presumably without the port |
| // (compare HostAndPort() below) -- confirm in the implementation file. |
| StringPiece Host() const; |
| |
| // For "http://a.com/b/c.html" returns "a.com". |
| // For "http://a.com:1234/b/c.html" returns "a.com:1234". |
| StringPiece HostAndPort() const; |
| |
| // For "http://a.com/b/c/d?e=f/g" returns "http://a.com", |
| // without trailing slash. |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| StringPiece Origin() const; |
| |
| // Returns the query-string, not including the "?". Note that the |
| // query will be in escaped syntax, and is suitable for passing to |
| // QueryParams for parsing and unescaping. |
| StringPiece Query() const; |
| |
| // Returns scheme of stored url. |
| StringPiece Scheme() const; |
| |
| // It is illegal to call this for invalid urls (check IsWebValid() first). |
| StringPiece Spec() const; |
| |
| // Returns gurl_.spec_ without checking to see if it's valid or empty. |
| StringPiece UncheckedSpec() const; |
| |
| // This method is primarily for printf purposes.  It returns the C string of |
| // gurl_.possibly_invalid_spec(), i.e. no validity check is made here. |
| const char* spec_c_str() const { |
| return gurl_.possibly_invalid_spec().c_str(); |
| } |
| |
| // Returns the port as reported by GURL::IntPort(). |
| int IntPort() const { return gurl_.IntPort(); } |
| |
| // Returns the effective port number, which is dependent on the scheme. |
| int EffectiveIntPort() const { return gurl_.EffectiveIntPort(); } |
| |
| // Returns the default port for given scheme, or url::PORT_UNSPECIFIED |
| // if the scheme isn't recognized. Scheme is expected to be in lowercase. |
| static int DefaultPortForScheme(StringPiece scheme); |
| |
| // Thin accessors that forward to the same-named methods on the wrapped GURL. |
| bool is_empty() const { return gurl_.is_empty(); } |
| bool has_scheme() const { return gurl_.has_scheme(); } |
| bool has_path() const { return gurl_.has_path(); } |
| bool has_query() const { return gurl_.has_query(); } |
| |
| // Forwards to GURL::SchemeIs().  As the parameter name indicates, the scheme |
| // to compare against is expected in lower-case ASCII. |
| bool SchemeIs(const char* lower_ascii_scheme) const { |
| return gurl_.SchemeIs(lower_ascii_scheme); |
| } |
| |
| // StringPiece overload of SchemeIs().  It copies the piece into a temporary |
| // string (as_string()) to obtain a C string for GURL::SchemeIs(). |
| // TODO(nforman): get GURL to take a StringPiece so we don't have to do |
| // any copying. |
| bool SchemeIs(StringPiece lower_ascii_scheme) const { |
| return gurl_.SchemeIs(lower_ascii_scheme.as_string().c_str()); |
| } |
| |
| // Find out how relative the URL string is. |
| static UrlRelativity FindRelativity(StringPiece url); |
| |
| // If possible, produce a URL as relative as url_relativity, relative to |
| // base_url. If not possible, simply returns the absolute URL string. |
| // Returns a StringPiece, only valid for the lifetime of this object. |
| // |
| // It is illegal to call this for invalid urls (check IsWebValid() first). |
| StringPiece Relativize(UrlRelativity url_relativity, |
| const GoogleUrl& base_url) const; |
| |
| // Equality operators; these compare the wrapped GURLs. |
| bool operator==(const GoogleUrl& other) const { |
| return gurl_ == other.gurl_; |
| } |
| bool operator!=(const GoogleUrl& other) const { |
| return gurl_ != other.gurl_; |
| } |
| |
| // Unescape a query parameter, converting all %XX to the actual char 0xXX. |
| // This also converts '+' to ' ' which is valid only in query parameters. |
| // For example, this will convert "foo%21bar+baz" to "foo!bar baz". |
| // |
| // This will work with strings that have embedded NULs and %00s. |
| // |
| // TODO(jmarantz): Change signature to return a bool so if the escaped |
| // syntax was not valid, we can help the caller avoid relying on this value. |
| static GoogleString UnescapeQueryParam(StringPiece escaped) { |
| return UnescapeHelper(escaped, true); |
| } |
| |
| // UnescapeQueryParam converts "+" to " ", but that is not correct for other |
| // parts of a URL.  This variant calls the same helper with the '+' conversion |
| // turned off. |
| static GoogleString UnescapeIgnorePlus(StringPiece escaped) { |
| return UnescapeHelper(escaped, false); |
| } |
| |
| // Escapes a string for use in a URL query param. |
| // |
| // This function escapes reserved chars (ex: '/', ':', '?', '&', etc.). |
| static GoogleString EscapeQueryParam(StringPiece unescaped); |
| |
| // Produces a sanitary, escaped version of a URL. The URL may already have |
| // some mix of escaped and non-escaped sections. This function is idempotent |
| // and can safely be used on any URL without changing the meaning according |
| // to RFC 3986. |
| // |
| // Result will not contain: 0x00-0x1F SPC "<>\^`{|} 0x7F-0xFF |
| // Result may contain: a-z A-Z 0-9 -._~:/?#[]@!$&'()*+,;=% |
| static GoogleString Sanitize(StringPiece url); |
| |
| // Returns the canonical representation of a given path component of URL. |
| // Will also prepend / if it's not there. This will follow the same rules for |
| // what's in %-encoded form and what isn't as GoogleUrl does. |
| static GoogleString CanonicalizePath(StringPiece path); |
| |
| private: |
| // Returned by *Position methods when that position is not well-defined. |
| static const size_t npos; |
| |
| // NOTE(review): presumably the set of characters treated as reserved when |
| // escaping, and its membership test -- both are defined in the |
| // implementation file; confirm there. |
| static const char kReservedChars[]; |
| static bool IsReservedChar(char c); |
| |
| // Wraps an existing GURL.  Private, so only GoogleUrl itself can use it. |
| explicit GoogleUrl(const GURL& gurl); |
| // NOTE(review): presumably computes the cached validity flags declared at |
| // the bottom of this class -- confirm in the implementation file. |
| void Init(); |
| |
| // Position helpers; per the npos comment above they return npos when the |
| // position is not well-defined.  The static overloads take the GURL to |
| // inspect explicitly.  NOTE(review): the member overloads presumably apply |
| // the same computation to gurl_ -- confirm in the implementation file. |
| static size_t LeafEndPosition(const GURL& gurl); |
| static size_t LeafStartPosition(const GURL& gurl); |
| static size_t PathStartPosition(const GURL& gurl); |
| size_t LeafEndPosition() const; |
| size_t LeafStartPosition() const; |
| size_t PathStartPosition() const; |
| // Shared implementation of UnescapeQueryParam() (convert_plus_to_space is |
| // true) and UnescapeIgnorePlus() (convert_plus_to_space is false). |
| static GoogleString UnescapeHelper(StringPiece escaped, |
| bool convert_plus_to_space); |
| |
| // Resolves a URL against a base. Returns whether the resolution worked. |
| inline bool ResolveHelper(const GURL& base, const std::string& path_and_leaf); |
| |
| // The wrapped URL; every accessor above is answered from it. |
| GURL gurl_; |
| // NOTE(review): presumably cached answers for IsWebValid() and |
| // IsWebOrDataValid() respectively -- confirm in the implementation file. |
| bool is_web_valid_; |
| bool is_web_or_data_valid_; |
| |
| DISALLOW_COPY_AND_ASSIGN(GoogleUrl); |
| }; // class GoogleUrl |
| |
| } // namespace net_instaweb |
| |
| |
| #endif // PAGESPEED_KERNEL_HTTP_GOOGLE_URL_H_ |