src/net/instaweb/rewriter/domain_lawyer.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #include "net/instaweb/rewriter/public/domain_lawyer.h"

 #include <map>
 #include <set>
 #include <utility>  // for std::pair
 #include <vector>

 #include "base/logging.h"
 #include "pagespeed/kernel/base/basictypes.h"
 #include "pagespeed/kernel/base/message_handler.h"
 #include "pagespeed/kernel/base/stl_util.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/base/wildcard.h"
 #include "pagespeed/kernel/http/domain_registry.h"
 #include "pagespeed/kernel/http/google_url.h"

 namespace net_instaweb {

 // A single entry in the DomainLawyer's domain table.  Each Domain records
 // its (possibly wildcarded) name plus optional links to other Domain
 // objects: the domain it is rewritten to, the domain it is fetched from,
 // and the shards registered on it.  All links are raw, nullable pointers.
 class DomainLawyer::Domain {
  public:
   // Builds an unlinked, unauthorized, non-proxy domain.  'name' is copied
   // into name_ and is also used to construct the wildcard matcher.
   explicit Domain(const StringPiece& name)
       : wildcard_(name),
         name_(name.data(), name.size()),
         rewrite_domain_(NULL),
         origin_domain_(NULL),
         authorized_(false),
         cycle_breadcrumb_(false),
         is_proxy_(false) {
   }

   // True when the name is not a "simple" wildcard, as reported by
   // Wildcard::IsSimple().
   bool IsWildcarded() const { return !wildcard_.IsSimple(); }
   // Tests 'domain' against this domain's wildcard pattern.
   bool Match(const StringPiece& domain) { return wildcard_.Match(domain); }
   // Link accessors; either may return NULL when no mapping has been set.
   Domain* rewrite_domain() const { return rewrite_domain_; }
   Domain* origin_domain() const { return origin_domain_; }
   const GoogleString& name() const { return name_; }

   // When multiple domains are mapped to the same rewrite-domain, they
   // should have consistent origins.  If they don't, we print an error
   // message but we keep rolling.  This is because we don't want to
   // introduce an incremental change that would invalidate existing
   // pagespeed.conf files.
   //
   // If 'origin_domain' is non-NULL and differs from the current origin,
   // it becomes the new origin and is then pushed to every shard and to
   // the rewrite domain (if any).  An error is reported through 'handler'
   // (when non-NULL) if an existing, different origin is being overridden.
   // cycle_breadcrumb_ makes a re-entrant call on the same Domain a no-op.
   void MergeOrigin(Domain* origin_domain, MessageHandler* handler) {
     if (cycle_breadcrumb_) {
       // See DomainLawyerTest.RewriteOriginCycle
       return;
     }
     cycle_breadcrumb_ = true;
     if ((origin_domain != origin_domain_) && (origin_domain != NULL)) {
       if (origin_domain_ != NULL) {
         if (handler != NULL) {
           handler->Message(kError,
                            "RewriteDomain %s has conflicting origins %s and "
                            "%s, overriding to %s",
                            name_.c_str(),
                            origin_domain_->name_.c_str(),
                            origin_domain->name_.c_str(),
                            origin_domain->name_.c_str());
         }
       }
       origin_domain_ = origin_domain;
       // Propagate the new origin to everything reachable from here.
       for (int i = 0; i < num_shards(); ++i) {
         shards_[i]->MergeOrigin(origin_domain, handler);
       }
       if (rewrite_domain_ != NULL) {
         rewrite_domain_->MergeOrigin(origin_domain, handler);
       }
     }
     cycle_breadcrumb_ = false;
   }

   // handler==NULL means this is happening from a 'merge' so we will
   // silently let the new rewrite_domain win.
   //
   // Returns false only when this domain is a proxy and already has a
   // different rewrite domain; otherwise records 'rewrite_domain' and
   // merges this domain's origin into it.
   bool SetRewriteDomain(Domain* rewrite_domain, MessageHandler* handler) {
     if (rewrite_domain == rewrite_domain_) {
       return true;
     }

     // Don't break old configs on this new consistency check
     // for ModPagespeedMapRewriteDomain.  However,
     // ModPagespeedMapProxyDomain has no legacy configuration, and
     // in that context it's a functional problem to have multiple
     // proxy directories mapped to a single origin, so we must fail
     // the configuration.
     if (is_proxy_ && (rewrite_domain_ != NULL)) {
       if (handler != NULL) {
         handler->Message(kError,
                          "ProxyDomain %s has conflicting proxies %s and %s",
                          name_.c_str(),
                          rewrite_domain_->name_.c_str(),
                          rewrite_domain->name_.c_str());
       }
       return false;
     }

     rewrite_domain_ = rewrite_domain;
     rewrite_domain->MergeOrigin(origin_domain_, handler);
     return true;  // don't break old configs on this new consistency check.
   }

   // handler==NULL means this is happening from a 'merge' so we will
   // silently let the new origin_domain win.
   //
   // Returns false only when an origin is already set and either the
   // existing or the new origin is a proxy origin; otherwise merges the
   // new origin into this domain and into its rewrite domain (if any).
   bool SetOriginDomain(Domain* origin_domain, MessageHandler* handler) {
     if (origin_domain == origin_domain_) {
       return true;
     }

     // Don't break old configs on this new consistency check
     // for ModPagespeedMapOriginDomain.  However,
     // ModPagespeedMapProxyDomain has no legacy configuration, and
     // in that context it's a functional problem to have the same
     // proxy directory mapped to multiple origins, so we must fail
     // the configuration.
     if ((origin_domain_ != NULL) &&
         (origin_domain_->is_proxy_ || origin_domain->is_proxy_)) {
       if (handler != NULL) {
         handler->Message(kError,
                          "ProxyDomain %s has conflicting origins %s and %s",
                          name_.c_str(),
                          origin_domain_->name_.c_str(),
                          origin_domain->name_.c_str());
       }
       return false;
     }

     MergeOrigin(origin_domain, handler);
     if (rewrite_domain_ != NULL) {
       rewrite_domain_->MergeOrigin(origin_domain_, handler);
     }

     return true;
   }

   // Marks 'origin_domain' as a proxy origin, makes it this domain's
   // origin, and makes this domain the rewrite target of 'origin_domain'.
   // Note that is_proxy_ is set on 'origin_domain' before either mapping
   // is attempted, so it stays set even when false is returned.
   bool SetProxyDomain(Domain* origin_domain, MessageHandler* handler) {
     origin_domain->is_proxy_ = true;
     return (SetOriginDomain(origin_domain, handler) &&
             origin_domain->SetRewriteDomain(this, handler));
   }

   // handler==NULL means this is happening from a 'merge' so we will
   // silently let the new rewrite_domain win.
   //
   // Registers this domain as a shard of 'rewrite_domain': inherits its
   // origin, appends this domain to its shard list, and points
   // rewrite_domain_ back at it.  Returns false (changing nothing) only
   // when a different rewrite domain is already set and handler is non-NULL.
   bool SetShardFrom(Domain* rewrite_domain, MessageHandler* handler) {
     if ((rewrite_domain_ != rewrite_domain) && (rewrite_domain_ != NULL)) {
       if (handler != NULL) {
         // We only treat this as an error when the handler is non-null.  We
         // use a null handler during merges, and will do the best we can
         // to get correct behavior.
         handler->Message(kError,
                          "Shard %s has conflicting rewrite_domain %s and %s",
                          name_.c_str(),
                          rewrite_domain_->name_.c_str(),
                          rewrite_domain->name_.c_str());
         return false;
       }
     }
     MergeOrigin(rewrite_domain->origin_domain_, handler);
     rewrite_domain->shards_.push_back(this);
     rewrite_domain_ = rewrite_domain;
     return true;
   }

   void set_authorized(bool authorized) { authorized_ = authorized; }
   int num_shards() const { return shards_.size(); }
   // Stores a copy of 'x' as the explicit Host header for this domain.
   void set_host_header(StringPiece x) { x.CopyToString(&host_header_); }
   const GoogleString& host_header() const { return host_header_; }

   // Indicates whether this domain is authorized when found in URLs
   // in HTML files or as direct requests to the web server.  Domains
   // get authorized by mentioning them in ModPagespeedDomain,
   // ModPagespeedMapRewriteDomain, ModPagespeedShardDomain, and as
   // the from-list in ModPagespeedMapOriginDomain.  However, the target
   // of ModPagespeedMapOriginDomain is not implicitly authorized --
   // that may be 'localhost'.
   bool authorized() const { return authorized_; }

   // Returns the shard at 'shard_index'; the index is not bounds-checked.
   Domain* shard(int shard_index) const { return shards_[shard_index]; }
   bool is_proxy() const { return is_proxy_; }
   void set_is_proxy(bool is_proxy) { is_proxy_ = is_proxy; }

   // Builds a string from the name, the authorization flag, and the names
   // of the rewrite domain, host header, origin domain and shards, in that
   // order.  Only the names of linked domains are appended (no recursion).
   GoogleString Signature() const {
     GoogleString signature;
     StrAppend(&signature, name_, "_",
               authorized_ ? "_a" : "_n", "_");
     // Assuming that there will be no cycle of Domains like Domain A has a
     // rewrite domain to domain B which in turn have the original domain as A.
     if (rewrite_domain_ != NULL) {
       StrAppend(&signature, "R:", rewrite_domain_->name(), "_");
     }
     if (!host_header_.empty()) {
       StrAppend(&signature, "H:", host_header_, "|");
     }
     if (origin_domain_ != NULL) {
       // "P:" marks a proxy origin, "O:" an ordinary origin.
       StrAppend(&signature,
                 origin_domain_->is_proxy_ ? "P:" : "O:",
                 origin_domain_->name(), "_");
     }
     for (int index = 0; index < num_shards(); ++index) {
       if (shards_[index] != NULL) {
         StrAppend(&signature, "S:", shards_[index]->name(), "_");
       }
     }
     return signature;
   }

   // Human-readable one-line description: the name followed by whichever
   // of Auth, rewrite/proxy domain, origin domain, shard list and host
   // header are set.
   GoogleString ToString() const {
     GoogleString output = name_;

     if (authorized_) {
       StrAppend(&output, " Auth");
     }

     if (rewrite_domain_ != NULL) {
       StrAppend(&output,
                 is_proxy_ ? " ProxyDomain:" : " RewriteDomain:",
                 rewrite_domain_->name());
     }

     if (origin_domain_ != NULL) {
       StrAppend(&output,
                 (origin_domain_->is_proxy_
                  ? " ProxyOriginDomain:" : " OriginDomain:"),
                 origin_domain_->name());
     }

     if (!shards_.empty()) {
       StrAppend(&output, " Shards:{");
       for (int i = 0, n = shards_.size(); i < n; ++i) {
         StrAppend(&output, (i == 0 ? "" : ", "), shards_[i]->name());
       }
       StrAppend(&output, "}");
     }

     if (!host_header_.empty()) {
       StrAppend(&output, " HostHeader:", host_header_);
     }

     return output;
   }

  private:
   // Matcher built from the domain name; see IsWildcarded() and Match().
   Wildcard wildcard_;
   GoogleString name_;

   // The rewrite_domain, if non-null, gives the location of where this
   // Domain should be rewritten.  This can be used to move resources onto
   // a CDN or onto a cookieless domain.  We also use this pointer to
   // get from shards back to the domain they were sharded from.
   Domain* rewrite_domain_;

   // The origin_domain, if non-null, gives the location of where
   // resources should be fetched from by mod_pagespeed, in lieu of how
   // it is specified in the HTML.  This allows, for example, a CDN to
   // fetch content from an origin domain, or an origin server behind a
   // load-balancer to specify localhost or an IP address of a host to
   // go to directly, skipping DNS resolution and reducing outbound
   // traffic.
   Domain* origin_domain_;

   // Explicitly specified Host header for use with MapOriginDomain.  When
   // empty, this indicates that the domain specified in the URL argument
   // to MapOrigin and MapOriginUrl should be used as the host header.
   GoogleString host_header_;

   // A rewrite_domain keeps track of all its shards.
   DomainVector shards_;

   bool authorized_;

   // This boolean helps us prevent spinning through a cycle in the
   // graph that can be expressed between shards and rewrite domains, e.g.
   //   ModPagespeedMapOriginDomain a b
   //   ModPagespeedMapRewriteDomain b c
   //   ModPagespeedAddShard b c
   bool cycle_breadcrumb_;

   // Identifies origin-domains that have been used in
   // AddProxyDomainMapping, and thus should not require a modified
   // Host header when fetching resources.
   bool is_proxy_;
 };

 // All teardown is delegated to Clear().
 DomainLawyer::~DomainLawyer() {
   this->Clear();
 }

 bool DomainLawyer::AddDomain(const StringPiece& domain_name,
                              MessageHandler* handler) {
   return (AddDomainHelper(domain_name, true, true, false, handler) != NULL);
 }

 bool DomainLawyer::AddKnownDomain(const StringPiece& domain_name,
                                   MessageHandler* handler) {
   return (AddDomainHelper(domain_name, false, false, false, handler) != NULL);
 }

 // Canonicalizes a domain specification so that equivalent spellings compare
 // equal:
 //   * a missing scheme defaults to "http://",
 //   * a trailing slash is ensured,
 //   * the scheme and host (everything before the first "/" after the
 //     scheme delimiter) are lower-cased, while the path is left untouched
 //     because paths are case-sensitive,
 //   * a redundant default port (":80" for http, ":443" for https) is
 //     dropped.
 //
 // For example "WWW.GOOGLE.COM/abc" and "http://www.google.com/abc" both
 // normalize to "http://www.google.com/abc/", which stays distinct from
 // "http://www.google.com/Abc/".
 GoogleString DomainLawyer::NormalizeDomainName(const StringPiece& domain_name) {
   static const char kSchemeDelim[] = "://";

   GoogleString normalized;
   stringpiece_ssize_type scheme_len = domain_name.find(kSchemeDelim);
   if (scheme_len != StringPiece::npos) {
     domain_name.CopyToString(&normalized);
   } else {
     // No scheme given: assume http, whose name is 4 characters long.
     normalized = StrCat("http://", domain_name);
     scheme_len = 4;
   }
   EnsureEndsInSlash(&normalized);

   // Locate the "/" that terminates the host.  The search starts after the
   // scheme delimiter so that the slashes in "://" are not mistaken for it.
   GoogleString::size_type host_begin =
       scheme_len + STATIC_STRLEN(kSchemeDelim);
   GoogleString::size_type host_end = normalized.find('/', host_begin);
   DCHECK_NE(GoogleString::npos, host_end);

   // Case-fold the scheme and host, but nothing from the path onwards.
   for (GoogleString::size_type i = 0; i < host_end; ++i) {
     normalized[i] = LowerChar(normalized[i]);
   }

   // Strip the port when it is the default one for the scheme.
   StringPiece scheme(normalized.data(), scheme_len);
   StringPiece host_and_port(normalized.data() + host_begin,
                             host_end - host_begin);
   const bool default_https_port =
       (scheme == "https") && host_and_port.ends_with(":443");
   const bool default_http_port =
       (scheme == "http") && host_and_port.ends_with(":80");
   if (default_https_port) {
     normalized.erase(host_end - 4, 4);
   } else if (default_http_port) {
     normalized.erase(host_end - 3, 3);
   }

   return normalized;
 }

 // Finds or creates the Domain entry for 'domain_name' (after normalizing
 // it) and applies the requested flags.
 //
 //   warn_on_duplicate: if the domain already exists with the same
 //       authorization state as 'authorize', emit a warning and return NULL.
 //   authorize: mark the domain authorized.  Authorizing "*" additionally
 //       sets authorize_all_domains_.
 //   is_proxy: mark the domain as a proxy.
 //   handler: receives warnings; may be NULL, in which case messages are
 //       silently dropped.
 //
 // Returns the Domain, or NULL if 'domain_name' is empty or a duplicate was
 // rejected as described above.
 DomainLawyer::Domain* DomainLawyer::AddDomainHelper(
     const StringPiece& domain_name, bool warn_on_duplicate,
     bool authorize, bool is_proxy, MessageHandler* handler) {
   if (domain_name.empty()) {
     // handler will be NULL only when called from Merge, which should
     // only have pre-validated (non-empty) domains.  So it should not
     // be possible to get here from Merge.
     if (handler != NULL) {
       handler->MessageS(kWarning, "Empty domain passed to AddDomain");
     }
     return NULL;
   }

   if (authorize && domain_name == "*") {
     authorize_all_domains_ = true;
   }

   // TODO(matterbury): need better data structures to eliminate the O(N) logic:
   // 1) Use a trie for domain_map_ as we need to find the domain whose trie
   //    path matches the beginning of the given domain_name since we no longer
   //    match just the domain name.
   // 2) Use a better lookup structure for wildcard searching.
   GoogleString domain_name_str = NormalizeDomainName(domain_name);
   Domain* domain = NULL;
   // Insert a NULL placeholder first; a Domain is allocated only when the
   // key was not already present.
   std::pair<DomainMap::iterator, bool> p = domain_map_.insert(
       DomainMap::value_type(domain_name_str, domain));
   DomainMap::iterator iter = p.first;
   if (p.second) {
     domain = new Domain(domain_name_str);
     iter->second = domain;
     if (domain->IsWildcarded()) {
       wildcarded_domains_.push_back(domain);
     }
   } else {
     domain = iter->second;
     if (warn_on_duplicate && (authorize == domain->authorized())) {
       // The handler is allowed to be NULL (see the empty-name case above),
       // so it must be checked before it is used here as well.
       if (handler != NULL) {
         handler->Message(kWarning, "AddDomain of domain already in map: %s",
                          domain_name_str.c_str());
       }
       domain = NULL;
     }
   }
   if (domain != NULL) {
     if (authorize) {
       domain->set_authorized(true);
     }
     if (is_proxy) {
       domain->set_is_proxy(true);
     }
   }
   return domain;
 }

 // Looks up the Domain* object by name.  From the Domain object
 // we can tell if it's wildcarded, in which case it cannot be
 // the 'to' field for a map, and whether resources from it should
 // be mapped to a different domain, either for rewriting or for
 // fetching.
 //
 // Returns NULL when neither an exact entry in domain_map_ nor any entry
 // in wildcarded_domains_ matches.
 DomainLawyer::Domain* DomainLawyer::FindDomain(const GoogleUrl& gurl) const {
   // Exact-match lookups in domain_map_ are tried first, from the longest
   // origin+path prefix down to the shortest.  Wildcarded domains are only
   // consulted when no exact entry is found.
   // TODO(matterbury): see AddDomainHelper for speed issues.
   Domain* domain = NULL;

   // There may be multiple entries in the map with the same domain,
   // but varying paths.  We want to choose the entry with the longest
   // domain that prefix-matches GURL.  So do the lookup starting
   // with the entire origin+path, then shorten the string removing
   // path components, looking for an exact match till we get to the origin
   // with no path.
   //
   // TODO(jmarantz): IMO the best data structure for this is an explicit
   // tree.  That would allow starting from the top and searching down,
   // rather than starting at the bottom and searching up, with each search
   // a lookup over the entire set of domains.
   GoogleString domain_path;
   gurl.AllExceptLeaf().CopyToString(&domain_path);
   StringPieceVector components;
   SplitStringPieceToVector(gurl.PathSansLeaf(), "/", &components, false);

   // PathSansLeaf gives something like "/a/b/c/" so after splitting with
   // omit_empty_strings==false, the first and last elements are always
   // present and empty.
   //
   // Note that the GURL can be 'about:blank' so be paranoid about getting
   // what we expect.
   if ((2U <= components.size()) &&
       components[0].empty() &&
       components[components.size() - 1].empty()) {
     // Number of trailing characters to drop from domain_path at the start
     // of the next iteration; zero so the first lookup uses the full path.
     int component_size = 0;
     for (int i = components.size() - 1; (domain == NULL) && (i >= 1); --i) {
       domain_path.resize(domain_path.size() - component_size);
       DCHECK(StringPiece(domain_path).ends_with("/"));
       DomainMap::const_iterator p = domain_map_.find(domain_path);
       if (p != domain_map_.end()) {
         domain = p->second;
       } else {
         // Remove the path component.  Consider input
         // "http://a.com/x/yy/zzz/w".  We will split PathSansLeaf, which
         // is "/x/yy/zzz/", so we will get StringPieceVector ["", "x",
         // "yy", "zzz", ""].  In the first iteration we want to consider
         // the entire path in the search, so we initialize
         // component_size to 0 above the loop.  In the next iteration we
         // want to chop off "zzz/" so we increment the component size by
         // one to get rid of the slash.  Note that we passed 'false'
         // into SplitStringPieceToVector so if there are double-slashes
         // they will show up as distinct components and we will get rid
         // of them one at a time.
         component_size = components[i - 1].size() + 1;
       }
     }
   }

   if (domain == NULL) {
     // Fall back to wildcard patterns, taking the first one that matches.
     // Note that domain_path is matched in whatever state the loop above
     // left it: trimmed to the shortest prefix that was tried, or untouched
     // if the loop was skipped.
     for (int i = 0, n = wildcarded_domains_.size(); i < n; ++i) {
       domain = wildcarded_domains_[i];
       if (domain->Match(domain_path)) {
         break;
       } else {
         domain = NULL;
       }
     }
   }
   return domain;
 }

 void DomainLawyer::FindDomainsRewrittenTo(
     const GoogleUrl& original_url,
     ConstStringStarVector* from_domains) const {
   // TODO(rahulbansal): Make this more efficient by maintaining the map of
   // rewrite_domain -> from_domains.
   if (!original_url.IsWebValid()) {
     LOG(ERROR) << "Invalid url " << original_url.Spec();
     return;
   }

   GoogleString domain_name;
   original_url.Origin().CopyToString(&domain_name);
   EnsureEndsInSlash(&domain_name);
   for (DomainMap::const_iterator p = domain_map_.begin();
       p != domain_map_.end(); ++p) {
     Domain* src_domain = p->second;
     if (!src_domain->IsWildcarded() && (src_domain->rewrite_domain() != NULL) &&
         domain_name == src_domain->rewrite_domain()->name()) {
       from_domains->push_back(&src_domain->name());
     }
   }
 }

 // Resolves 'resource_url' against 'original_request' into
 // '*resolved_request' and decides which domain the resource may be
 // rewritten under.
 //
 // Returns true when the resolved URL is web-valid and either shares the
 // original request's origin or falls under an authorized domain; in that
 // case '*mapped_domain_name' receives the domain to use.  If the matched
 // domain has a rewrite domain, the name of that rewrite domain is reported
 // instead and '*resolved_request' is replaced by the URL mapped onto it;
 // the result is then true only if that mapping succeeds.
 //
 // Note that this only concerns the URLs written into HTML, not where
 // resources are fetched from; see MapOrigin for the latter.
 bool DomainLawyer::MapRequestToDomain(
     const GoogleUrl& original_request,
     const StringPiece& resource_url,  // relative to original_request
     GoogleString* mapped_domain_name,
     GoogleUrl* resolved_request,
     MessageHandler* handler) const {
   CHECK(original_request.IsAnyValid());
   // Capture the origin before resolved_request is reset.
   GoogleUrl original_origin(original_request.Origin());
   resolved_request->Reset(original_request, resource_url);

   // We can map a request to/from http/https.
   if (!resolved_request->IsWebValid()) {
     return false;
   }

   GoogleUrl resolved_origin(resolved_request->Origin());

   // Find the Domain covering the resolved domain name + path, which may
   // differ from the original request's if resource_url overrides it.
   Domain* resolved_domain = FindDomain(*resolved_request);

   if (resolved_origin == original_origin) {
     // The origin domain is authorized by default.
     resolved_origin.Spec().CopyToString(mapped_domain_name);
   } else {
     const bool authorized =
         (resolved_domain != NULL) && resolved_domain->authorized();
     if (!authorized) {
       return false;
     }
     if (resolved_domain->IsWildcarded()) {
       // This is a sharded domain. We do not do the sharding in this function.
       resolved_origin.Spec().CopyToString(mapped_domain_name);
     } else {
       *mapped_domain_name = resolved_domain->name();
     }
   }

   // The request is permitted.  If a Domain was found it may carry a mapping
   // to a different rewrite domain, which helps move resources to CDNs or
   // cookieless domains.
   if (resolved_domain == NULL) {
     return true;
   }
   Domain* mapped_domain = resolved_domain->rewrite_domain();
   if (mapped_domain == NULL) {
     return true;
   }

   CHECK(!mapped_domain->IsWildcarded());
   CHECK(mapped_domain != resolved_domain);
   *mapped_domain_name = mapped_domain->name();
   GoogleUrl mapped_request;
   if (!MapUrlHelper(*resolved_domain, *mapped_domain,
                     *resolved_request, &mapped_request)) {
     return false;
   }
   resolved_request->Swap(&mapped_request);
   return true;
 }

 bool DomainLawyer::IsDomainAuthorized(const GoogleUrl& original_request,
                                       const GoogleUrl& domain_to_check) const {
   if (authorize_all_domains_) {
     return true;
   }
   bool ret = false;
   if (domain_to_check.IsWebValid()) {
     if (original_request.IsWebValid() &&
         (original_request.Origin() == domain_to_check.Origin())) {
       ret = true;
     } else {
       Domain* path_domain = FindDomain(domain_to_check);
       ret = (path_domain != NULL) && path_domain->authorized();
     }
   }
   return ret;
 }

 // Returns true when 'domain_to_check' is web-valid and FindDomain locates
 // an entry for it, whether or not that entry is authorized.
 bool DomainLawyer::IsOriginKnown(const GoogleUrl& domain_to_check) const {
   if (!domain_to_check.IsWebValid()) {
     return false;
   }
   return FindDomain(domain_to_check) != NULL;
 }

 // String front-end for MapOriginUrl: parses 'in' as a URL and forwards it.
 // Returns false, without touching the outputs, when 'in' is not web-valid.
 bool DomainLawyer::MapOrigin(const StringPiece& in, GoogleString* out,
                              GoogleString* host_header, bool* is_proxy) const {
   GoogleUrl gurl(in);
   if (!gurl.IsWebValid()) {
     return false;
   }
   return MapOriginUrl(gurl, out, host_header, is_proxy);
 }

 bool DomainLawyer::MapOriginUrl(const GoogleUrl& gurl,
                                 GoogleString* out, GoogleString* host_header,
                                 bool* is_proxy) const {
   bool ret = false;
   *is_proxy = false;
   host_header->clear();

   // We can map an origin to/from http/https.
   if (gurl.IsWebValid()) {
     ret = true;
     gurl.Spec().CopyToString(out);
     Domain* domain = FindDomain(gurl);
     if (domain != NULL) {
       Domain* origin_domain = domain->origin_domain();
       if (origin_domain != NULL) {
         GoogleUrl mapped_gurl;
         if (MapUrlHelper(*domain, *origin_domain, gurl, &mapped_gurl)) {
           mapped_gurl.Spec().CopyToString(out);
         }
         *is_proxy = origin_domain->is_proxy();
         const GoogleString& origin_header = origin_domain->host_header();
         if (!origin_header.empty()) {
           *host_header = origin_header;
         }
       }
     }

     if (host_header->empty()) {
       gurl.HostAndPort().CopyToString(host_header);
     }
   }

   return ret;
 }

 bool DomainLawyer::MapUrlHelper(const Domain& from_domain,
                                 const Domain& to_domain,
                                 const GoogleUrl& gurl,
                                 GoogleUrl* mapped_gurl) const {
   CHECK(!to_domain.IsWildcarded());

   GoogleUrl from_domain_gurl(from_domain.name());
   StringPiece from_domain_path(from_domain_gurl.PathSansLeaf());
   StringPiece path_and_leaf(gurl.PathAndLeaf());
   DCHECK(path_and_leaf.starts_with(from_domain_path));

   // Trim the URL's domain we came from based on how it was specifed in the
   // from_domain.  E.g. if you write
   //    ModPagespeedMap*Domain localhost/foo cdn.com/bar
   // and the URL being mapped is
   //    http://cdn.com/bar/x
   // then we set path_and_leaf to "x".  This testcase gets hit in
   // DomainLawyerTest.OriginAndExternWithPaths.
   //
   // Even if the from_domain has no subdirectory, we need to remove
   // the leading slash to make it a relative reference and retain any
   // subdirectory in the to_domain.
   //
   // Note: We must prepend "./" to make sure the path_and_leaf is not an
   // absolute URL, which will cause problems below. For example:
   // "http://www.example.com/data:image/jpeg" should be converted to the
   // relative URL "./data:image/jpeg", not the absolute URL "data:image/jpeg".
   GoogleString rel_url =
       StrCat("./", path_and_leaf.substr(from_domain_path.size()));
   // Make sure this isn't a valid absolute URL.
   DCHECK(!GoogleUrl(rel_url).IsWebValid())
       << "URL " << gurl.Spec() << " is being mapped to absolute URL "
       << rel_url << " which will break many things.";
   GoogleUrl to_domain_gurl(to_domain.name());
   mapped_gurl->Reset(to_domain_gurl, rel_url);
   return mapped_gurl->IsWebValid();
 }

 // Maps each domain in 'comma_separated_from_domains' to be rewritten to
 // 'to_domain_name'.  Wildcards and https targets are allowed, and the
 // target is authorized.  On success, records that this lawyer can rewrite
 // domains.
 bool DomainLawyer::AddRewriteDomainMapping(
     const StringPiece& to_domain_name,
     const StringPiece& comma_separated_from_domains,
     MessageHandler* handler) {
   const bool mapped = MapDomainHelper(
       to_domain_name, comma_separated_from_domains,
       "" /* host_header */,
       &Domain::SetRewriteDomain,
       true /* allow_wildcards */,
       true /* allow_map_to_https */,
       true /* authorize */,
       handler);
   if (mapped) {
     can_rewrite_domains_ = true;
   }
   return mapped;
 }

 bool DomainLawyer::DomainNameToTwoProtocols(
     const StringPiece& domain_name,
     GoogleString* http_url, GoogleString* https_url) {
   *http_url = NormalizeDomainName(domain_name);
   StringPiece http_url_piece(*http_url);
   if (!http_url_piece.starts_with("http:")) {
     return false;
   }
   *https_url = StrCat("https", http_url_piece.substr(4));
   return true;
 }

 // Applies 'set_domain_fn' to the http variants of the two domains and then
 // to their https variants.  Wildcards are not allowed, and an https target
 // is permitted only for the https pair.  Returns false as soon as any step
 // fails; an http mapping that was already installed is kept even if the
 // https mapping then fails.
 bool DomainLawyer::TwoProtocolDomainHelper(
       const StringPiece& to_domain_name,
       const StringPiece& from_domain_name,
       const StringPiece& host_header,
       SetDomainFn set_domain_fn,
       bool authorize,
       MessageHandler* handler) {
   GoogleString http_to_url, http_from_url, https_to_url, https_from_url;
   const bool names_ok =
       DomainNameToTwoProtocols(to_domain_name, &http_to_url, &https_to_url) &&
       DomainNameToTwoProtocols(from_domain_name,
                                &http_from_url, &https_from_url);
   if (!names_ok) {
     return false;
   }

   const bool http_mapped = MapDomainHelper(http_to_url, http_from_url,
                                            host_header,
                                            set_domain_fn,
                                            false, /* allow_wildcards */
                                            false, /* allow_map_to_https */
                                            authorize, handler);
   if (!http_mapped) {
     return false;
   }

   // Note that the http domain mapping is retained even if this one fails.
   return MapDomainHelper(https_to_url, https_from_url,
                          host_header,
                          set_domain_fn,
                          false, /* allow_wildcards */
                          true, /* allow_map_to_https */
                          authorize, handler);
 }

 // Installs a rewrite mapping from 'from_domain_name' to 'to_domain_name'
 // for both http and https, authorizing the target.  On success, records
 // that this lawyer can rewrite domains.
 bool DomainLawyer::AddTwoProtocolRewriteDomainMapping(
     const StringPiece& to_domain_name,
     const StringPiece& from_domain_name,
     MessageHandler* handler) {
   const bool mapped = TwoProtocolDomainHelper(
       to_domain_name, from_domain_name,
       "" /* host_header */,
       &Domain::SetRewriteDomain,
       true /*authorize */, handler);
   if (mapped) {
     can_rewrite_domains_ = true;
   }
   return mapped;
 }

 // Maps each domain in 'comma_separated_from_domains' to be fetched from
 // 'to_domain_name', passing 'host_header' through.  Wildcards and https
 // targets are allowed; the target is not authorized by this call.
 bool DomainLawyer::AddOriginDomainMapping(
     const StringPiece& to_domain_name,
     const StringPiece& comma_separated_from_domains,
     const StringPiece& host_header,
     MessageHandler* handler) {
   const bool mapped = MapDomainHelper(
       to_domain_name, comma_separated_from_domains,
       host_header,
       &Domain::SetOriginDomain,
       true /* allow_wildcards */,
       true /* allow_map_to_https */,
       false /* authorize */,
       handler);
   return mapped;
 }

 // Sets up proxying of 'origin_domain_name' through 'proxy_domain_name',
 // optionally publishing resources under 'to_domain_name' (e.g. a CDN).
 // Wildcards are never allowed here.  Returns true only if every mapping
 // step succeeds; in the three-domain form all steps are attempted even
 // when an earlier one fails.
 bool DomainLawyer::AddProxyDomainMapping(
     const StringPiece& proxy_domain_name,
     const StringPiece& origin_domain_name,
     const StringPiece& to_domain_name,
     MessageHandler* handler) {
   if (to_domain_name.empty()) {
     // 1. Rewrite from origin_domain to proxy_domain.
     // 2. Set origin_domain->is_proxy = true.
     // 3. Map origin from proxy_domain to origin_domain.
     return MapDomainHelper(origin_domain_name, proxy_domain_name,
                            "" /* host_header */,
                            &Domain::SetProxyDomain,
                            false /* allow_wildcards */,
                            true /* allow_map_to_https */,
                            true /* authorize */,
                            handler);
   }

   // 1. Rewrite from origin_domain to to_domain.
   // 2. Set origin_domain->is_proxy = true.
   // 3. Map origin from to_domain to origin_domain.
   const bool proxied = MapDomainHelper(origin_domain_name, to_domain_name,
                                        "" /* host_header */,
                                        &Domain::SetProxyDomain,
                                        false /* allow_wildcards */,
                                        true /* allow_map_to_https */,
                                        true /* authorize */,
                                        handler);

   // 4. Rewrite from proxy_domain to to_domain. This way when the CDN asks us
   // for resources on proxy_domain it knows to use the CDN domain for the
   // cache key.
   const bool rewritten = MapDomainHelper(to_domain_name, proxy_domain_name,
                                          "" /* host_header */,
                                          &Domain::SetRewriteDomain,
                                          false /* allow_wildcards */,
                                          true /* allow_map_to_https */,
                                          true /* authorize */,
                                          handler);

   // 5. Map origin from proxy_domain to origin_domain. This tells the proxy
   // how to fetch files from the origin for reconstruction.
   const bool origin_mapped = MapDomainHelper(
       origin_domain_name, proxy_domain_name,
       "" /* host_header */,
       &Domain::SetOriginDomain,
       false /* allow wildcards */,
       true /* allow_map_to_https */,
       true /* authorize */,
       handler);

   return proxied && rewritten && origin_mapped;
 }


 // Registers an origin mapping by forwarding to TwoProtocolDomainHelper with
 // Domain::SetOriginDomain as the mapping operation and authorization turned
 // off.  Returns whatever the helper reports.
 bool DomainLawyer::AddTwoProtocolOriginDomainMapping(
     const StringPiece& to_domain_name,
     const StringPiece& from_domain_name,
     const StringPiece& host_header,
     MessageHandler* handler) {
   // Origin mappings do not authorize the domains involved.
   const bool authorize = false;
   return TwoProtocolDomainHelper(to_domain_name,
                                  from_domain_name,
                                  host_header,
                                  &Domain::SetOriginDomain,
                                  authorize,
                                  handler);
 }

 // Declares every domain in comma_separated_shards to be a shard of
 // shard_domain_name.  The work is done by MapDomainHelper using
 // Domain::SetShardFrom; wildcards are rejected, https targets are allowed
 // and the sharded domain is authorized.  A successful call also records
 // that this lawyer can rewrite domains.  Returns true on success.
 bool DomainLawyer::AddShard(
     const StringPiece& shard_domain_name,
     const StringPiece& comma_separated_shards,
     MessageHandler* handler) {
   const bool sharding_added = MapDomainHelper(
       shard_domain_name, comma_separated_shards,
       "" /* host_header */,
       &Domain::SetShardFrom,
       false /* allow_wildcards */,
       true /* allow_map_to_https */,
       true /* authorize */,
       handler);
   if (sharding_added) {
     can_rewrite_domains_ = true;
   }
   return sharding_added;
 }

 // Returns true if domain_name may be used as the target of a mapping:
 // either it names no scheme at all, it is explicitly http, or it is https
 // and allow_https_scheme is set.
 bool DomainLawyer::IsSchemeSafeToMapTo(const StringPiece& domain_name,
                                        bool allow_https_scheme) {
   // No scheme given: it defaults to http, which is always acceptable.
   if (domain_name.find("://") == GoogleString::npos) {
     return true;
   }
   if (domain_name.starts_with("http://")) {
     return true;
   }
   // Any other scheme is rejected, except https when the caller permits it.
   return allow_https_scheme && domain_name.starts_with("https://");
 }

 // Maps each domain listed in comma_separated_from_domains onto
 // to_domain_name by invoking set_domain_fn on the from-domain with the
 // to-domain as its argument.
 //
 // Returns false when:
 //   - to_domain_name carries a scheme rejected by IsSchemeSafeToMapTo,
 //   - the to-domain cannot be added via AddDomainHelper,
 //   - the to-domain is wildcarded (an error is sent to handler),
 //   - a from-domain is wildcarded while allow_wildcards is false,
 //   - any set_domain_fn call fails, or
 //   - no from-domain ended up being mapped at all.
 // Returns true only if every attempted mapping succeeded and at least one
 // mapping was made.
 //
 // From-domains that AddDomainHelper fails to add are skipped, and
 // from-domains sharing the to-domain's origin are ignored; neither case by
 // itself turns the result to false.  When the to-domain is not wildcarded,
 // host_header is stored on it after the from-domains have been processed.
 bool DomainLawyer::MapDomainHelper(
     const StringPiece& to_domain_name,
     const StringPiece& comma_separated_from_domains,
     const StringPiece& host_header,
     SetDomainFn set_domain_fn,
     bool allow_wildcards,
     bool allow_map_to_https,
     bool authorize_to_domain,
     MessageHandler* handler) {
   if (!IsSchemeSafeToMapTo(to_domain_name, allow_map_to_https)) {
     return false;
   }
   Domain* to_domain = AddDomainHelper(to_domain_name, false,
                                       authorize_to_domain, false, handler);
   if (to_domain == NULL) {
     return false;
   }

   bool ret = false;
   bool mapped_a_domain = false;
   if (to_domain->IsWildcarded()) {
     handler->Message(kError, "Cannot map to a wildcarded domain: %s",
                      to_domain_name.as_string().c_str());
   } else {
     GoogleUrl to_url(to_domain->name());
     StringPieceVector domains;
     SplitStringPieceToVector(comma_separated_from_domains, ",", &domains, true);
     ret = true;
     for (int i = 0, n = domains.size(); i < n; ++i) {
       const StringPiece& domain_name = domains[i];
       Domain* from_domain = AddDomainHelper(domain_name, false, true, false,
                                             handler);
       if (from_domain != NULL) {
         GoogleUrl from_url(from_domain->name());
         if (to_url.Origin() == from_url.Origin()) {
           // Ignore requests to map to the same scheme://hostname:port/.
         } else if (!allow_wildcards && from_domain->IsWildcarded()) {
           // Report the offending from-domain, not the mapping target, so
           // the message identifies the entry that actually has the wildcard.
           handler->Message(kError, "Cannot map from a wildcarded domain: %s",
                            domain_name.as_string().c_str());
           ret = false;
         } else {
           bool ok = (from_domain->*set_domain_fn)(to_domain, handler);
           ret &= ok;
           mapped_a_domain |= ok;
         }
       }
     }
     DCHECK(host_header.empty() || !to_domain->is_proxy())
         << "It makes no sense to specify a host header for a proxy:"
         << host_header << ", " << to_domain_name;
     to_domain->set_host_header(host_header);
   }
   // Succeed only if nothing failed and something was actually mapped.
   return (ret && mapped_a_domain);
 }

 // Adds to this lawyer a domain with the same name, authorization flag and
 // proxy flag as src, copies src's host header onto it, and returns it.
 // No message handler is supplied to AddDomainHelper.
 DomainLawyer::Domain* DomainLawyer::CloneAndAdd(const Domain* src) {
   const bool authorized = src->authorized();
   const bool is_proxy = src->is_proxy();
   Domain* clone = AddDomainHelper(src->name(), false, authorized, is_proxy,
                                   NULL);
   clone->set_host_header(src->host_header());
   return clone;
 }

 void DomainLawyer::Merge(const DomainLawyer& src) {
   int num_existing_wildcards = num_wildcarded_domains();
   for (DomainMap::const_iterator
            p = src.domain_map_.begin(),
            e = src.domain_map_.end();
        p != e; ++p) {
     Domain* src_domain = p->second;
     Domain* dst_domain = CloneAndAdd(src_domain);
     Domain* src_rewrite_domain = src_domain->rewrite_domain();
     if (src_rewrite_domain != NULL) {
       dst_domain->SetRewriteDomain(CloneAndAdd(src_rewrite_domain), NULL);
     }
     Domain* src_origin_domain = src_domain->origin_domain();
     if (src_origin_domain != NULL) {
       dst_domain->SetOriginDomain(CloneAndAdd(src_origin_domain), NULL);
     }
     for (int i = 0; i < src_domain->num_shards(); ++i) {
       Domain* src_shard = src_domain->shard(i);
       Domain* dst_shard = CloneAndAdd(src_shard);
       dst_shard->SetShardFrom(dst_domain, NULL);
     }
   }

   // Remove the wildcards we just added in map order, and instead add them
   // in the order they were in src.wildcarded_domains.
   wildcarded_domains_.resize(num_existing_wildcards);
   std::set<Domain*> dup_detector(wildcarded_domains_.begin(),
                                  wildcarded_domains_.end());
   for (int i = 0, n = src.wildcarded_domains_.size(); i < n; ++i) {
     Domain* src_domain = src.wildcarded_domains_[i];
     DomainMap::const_iterator p = domain_map_.find(src_domain->name());
     if (p == domain_map_.end()) {
       LOG(DFATAL) << "Domain " << src_domain->name() << " not found in dst";
     } else {
       Domain* dst_domain = p->second;
       if (dup_detector.find(dst_domain) == dup_detector.end()) {
         wildcarded_domains_.push_back(dst_domain);
       }
     }
   }

   can_rewrite_domains_ |= src.can_rewrite_domains_;
   authorize_all_domains_ |= src.authorize_all_domains_;
   if (!src.proxy_suffix_.empty()) {
     if (!proxy_suffix_.empty() && (proxy_suffix_ != src.proxy_suffix_)) {
       LOG(WARNING)
           << "Merging incompatible proxy suffixes " << proxy_suffix_ << " and "
           << src.proxy_suffix_;
     }
     proxy_suffix_ = src.proxy_suffix_;
   }
 }

 // Picks a shard for domain_name using hash.  If the (normalized) domain is
 // known and has shards, writes the name of shard number
 // (hash % num_shards) to *sharded_domain and returns true.  Otherwise
 // returns false and leaves *sharded_domain untouched.
 bool DomainLawyer::ShardDomain(const StringPiece& domain_name,
                                uint32 hash,
                                GoogleString* sharded_domain) const {
   GoogleUrl domain_gurl(NormalizeDomainName(domain_name));
   Domain* domain = FindDomain(domain_gurl);
   if ((domain == NULL) || (domain->num_shards() == 0)) {
     return false;
   }
   int shard_index = hash % domain->num_shards();
   *sharded_domain = domain->shard(shard_index)->name();
   return true;
 }

 bool DomainLawyer::WillDomainChange(const GoogleUrl& gurl) const {
   Domain* domain = FindDomain(gurl), *mapped_domain = domain;
   if (domain != NULL) {
     // First check a mapping based on AddRewriteDomainMapping.
     mapped_domain = domain->rewrite_domain();
     if (mapped_domain == NULL)  {
       // Even if there was no AddRewriteDomainMapping for this domain, there
       // may still have been shards.
       mapped_domain = domain;
     }

     // Now check mappings from the shard.
     if (mapped_domain->num_shards() != 0) {
       if (mapped_domain->num_shards() == 1) {
         // Usually we don't expect exactly one shard, but if there is,
         // we know exactly what it will be.
         mapped_domain = mapped_domain->shard(0);
       } else {
         // We don't have enough data in this function to determine what
         // the shard index will be, so we assume pessimistically that
         // the domain will change.
         //
         // TODO(jmarantz): rename this method to MayDomainChange, or
         // pass in the sharding index.
         mapped_domain = NULL;
       }
     }
   }
   return domain != mapped_domain;
 }

 // Returns true when gurl's domain is known and has an origin domain that
 // is flagged as a proxy.
 bool DomainLawyer::IsProxyMapped(const GoogleUrl& gurl) const {
   Domain* domain = FindDomain(gurl);
   if (domain == NULL) {
     return false;
   }
   Domain* origin = domain->origin_domain();
   return (origin != NULL) && origin->is_proxy();
 }

 bool DomainLawyer::DoDomainsServeSameContent(
     const StringPiece& domain1_name, const StringPiece& domain2_name) const {
   GoogleUrl domain1_gurl(NormalizeDomainName(domain1_name));
   Domain* domain1 = FindDomain(domain1_gurl);
   GoogleUrl domain2_gurl(NormalizeDomainName(domain2_name));
   Domain* domain2 = FindDomain(domain2_gurl);
   if ((domain1 == NULL) || (domain2 == NULL)) {
     return false;
   }
   if (domain1 == domain2) {
     return true;
   }
   Domain* rewrite1 = domain1->rewrite_domain();
   Domain* rewrite2 = domain2->rewrite_domain();
   if ((rewrite1 == domain2) || (rewrite2 == domain1)) {
     return true;
   }
   if ((rewrite1 != NULL) && (rewrite1 == rewrite2)) {
     return true;
   }
   return false;
 }

 // Builds a signature string: "D:<domain signature>-" for each entry of
 // domain_map_ in iteration order, followed by ",PS:<proxy suffix>" when a
 // proxy suffix is set.
 GoogleString DomainLawyer::Signature() const {
   GoogleString signature;

   const DomainMap::const_iterator end = domain_map_.end();
   for (DomainMap::const_iterator it = domain_map_.begin(); it != end; ++it) {
     Domain* domain = it->second;
     StrAppend(&signature, "D:", domain->Signature(), "-");
   }

   const bool has_proxy_suffix = !proxy_suffix_.empty();
   if (has_proxy_suffix) {
     StrAppend(&signature, ",PS:", proxy_suffix_);
   }
   return signature;
 }

 // Renders the lawyer as text: one line per entry of domain_map_, each
 // consisting of line_prefix followed by the domain's own ToString() and a
 // newline.  If a proxy suffix is set, "Proxy Suffix: <suffix>" is appended
 // at the end (without prefix or trailing newline).
 GoogleString DomainLawyer::ToString(StringPiece line_prefix) const {
   GoogleString output;

   const DomainMap::const_iterator end = domain_map_.end();
   for (DomainMap::const_iterator it = domain_map_.begin(); it != end; ++it) {
     Domain* domain = it->second;
     StrAppend(&output, line_prefix, domain->ToString(), "\n");
   }

   const bool has_proxy_suffix = !proxy_suffix_.empty();
   if (has_proxy_suffix) {
     StrAppend(&output, "Proxy Suffix: ", proxy_suffix_);
   }
   return output;
 }

 // Resets the lawyer: deletes the domains held in domain_map_, empties the
 // wildcard list and the proxy suffix, and turns both boolean options off.
 void DomainLawyer::Clear() {
   // Containers first.
   STLDeleteValues(&domain_map_);
   wildcarded_domains_.clear();
   proxy_suffix_.clear();

   // Then the flags.
   can_rewrite_domains_ = false;
   authorize_all_domains_ = false;
 }

 bool DomainLawyer::StripProxySuffix(const GoogleUrl& gurl, GoogleString* url,
                                     GoogleString* host) const {
   bool ret = false;
   if (gurl.IsWebValid() && !proxy_suffix_.empty()) {
     StringPiece host_and_port = gurl.HostAndPort();
     if (host_and_port.ends_with(proxy_suffix_)) {
       host_and_port.remove_suffix(proxy_suffix_.size());
       host_and_port.CopyToString(host);  // Remove any other port, I suppose.
       *url = StrCat(gurl.Scheme(), "://", host_and_port, gurl.PathAndLeaf());
       ret = true;
     }
   }
   return ret;
 }

 // Rewrites *href so that it points at the proxy-suffixed host, when
 // appropriate.  base_url is the URL of the page containing the link and
 // *href is the link as written (resolved against base_url here).
 //
 // Returns true and updates *href when a proxy suffix is configured,
 // base_url's host ends with it, both URLs are web-valid, and the link's
 // host is either the base host without the suffix or shares its minimal
 // private suffix.  Otherwise returns false and leaves *href untouched.
 bool DomainLawyer::AddProxySuffix(const GoogleUrl& base_url,
                                   GoogleString* href) const {
   // Let's say we have a proxy-suffix of ".suffix".  When we visit
   // http://www.example.com.suffix, we can leave relative URLs alone
   // in hyperlinks.  However, if we see an absolute link to
   // http://www.example.com/foo or http://foo.www.example.com/bar then
   // we want to add the suffix to the hyperlink attribute.
   StringPiece base_host = base_url.Host();
   if (!proxy_suffix_.empty() && base_host.ends_with(proxy_suffix_)) {
     // Remove the suffix from the host so we can find a-tag references to it.
     StringPiece base_host_no_suffix = base_host.substr(
         0, base_host.size() - proxy_suffix_.size());
     GoogleUrl href_gurl(base_url, *href);

     // Note that we purposefully do not check schemes here since we want to
     // permit redirects from http:// to https:// (and likewise inclusion of
     // resources).
     if (href_gurl.IsWebValid() && base_url.IsWebValid()) {
       StringPiece href_host = href_gurl.Host();
       if (href_host == base_host_no_suffix) {
         // Link to exactly the proxied host: reuse the suffixed base host.
         // TODO(jmarantz): handle alternate ports.
         *href = StrCat(href_gurl.Scheme(), "://", base_host,
                        href_gurl.PathAndLeaf());
         return true;
       } else if (domain_registry::MinimalPrivateSuffix(href_host) ==
                  domain_registry::MinimalPrivateSuffix(base_host_no_suffix)) {
         // Link to a different host under the same minimal private suffix:
         // append the proxy suffix to that host.
         *href = StrCat(href_gurl.Scheme(), "://",
                        href_host, proxy_suffix_,
                        href_gurl.PathAndLeaf());
         return true;
       }
     }
   }
   return false;
 }

 }  // namespace net_instaweb