blob: f610970d8c3ec331de0abb2d9e1a2e86f3a6e62c [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "net/instaweb/rewriter/public/domain_lawyer.h"
#include <map>
#include <set>
#include <utility> // for std::pair
#include <vector>
#include "base/logging.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/message_handler.h"
#include "pagespeed/kernel/base/stl_util.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/wildcard.h"
#include "pagespeed/kernel/http/domain_registry.h"
#include "pagespeed/kernel/http/google_url.h"
namespace net_instaweb {
class DomainLawyer::Domain {
public:
explicit Domain(const StringPiece& name)
: wildcard_(name),
name_(name.data(), name.size()),
rewrite_domain_(NULL),
origin_domain_(NULL),
authorized_(false),
cycle_breadcrumb_(false),
is_proxy_(false) {
}
bool IsWildcarded() const { return !wildcard_.IsSimple(); }
bool Match(const StringPiece& domain) { return wildcard_.Match(domain); }
Domain* rewrite_domain() const { return rewrite_domain_; }
Domain* origin_domain() const { return origin_domain_; }
const GoogleString& name() const { return name_; }
// When multiple domains are mapped to the same rewrite-domain, they
// should have consistent origins. If they don't, we print an error
// message but we keep rolling. This is because we don't want to
// introduce an incremental change that would invalidate existing
// pagespeed.conf files.
//
void MergeOrigin(Domain* origin_domain, MessageHandler* handler) {
if (cycle_breadcrumb_) {
// See DomainLawyerTest.RewriteOriginCycle
return;
}
cycle_breadcrumb_ = true;
if ((origin_domain != origin_domain_) && (origin_domain != NULL)) {
if (origin_domain_ != NULL) {
if (handler != NULL) {
handler->Message(kError,
"RewriteDomain %s has conflicting origins %s and "
"%s, overriding to %s",
name_.c_str(),
origin_domain_->name_.c_str(),
origin_domain->name_.c_str(),
origin_domain->name_.c_str());
}
}
origin_domain_ = origin_domain;
for (int i = 0; i < num_shards(); ++i) {
shards_[i]->MergeOrigin(origin_domain, handler);
}
if (rewrite_domain_ != NULL) {
rewrite_domain_->MergeOrigin(origin_domain, handler);
}
}
cycle_breadcrumb_ = false;
}
// handler==NULL means this is happening from a 'merge' so we will
// silently let the new rewrite_domain win.
bool SetRewriteDomain(Domain* rewrite_domain, MessageHandler* handler) {
if (rewrite_domain == rewrite_domain_) {
return true;
}
// Don't break old configs on this new consistency check
// for ModPagespeedMapRewriteDomain. However,
// ModPagespeedMapProxyDomain has no legacy configuration, and
// in that context it's a functional problem to have multiple
// proxy directories mapped to a single origin, so we must fail
// the configuration.
if (is_proxy_ && (rewrite_domain_ != NULL)) {
if (handler != NULL) {
handler->Message(kError,
"ProxyDomain %s has conflicting proxies %s and %s",
name_.c_str(),
rewrite_domain_->name_.c_str(),
rewrite_domain->name_.c_str());
}
return false;
}
rewrite_domain_ = rewrite_domain;
rewrite_domain->MergeOrigin(origin_domain_, handler);
return true; // don't break old configs on this new consistency check.
}
// handler==NULL means this is happening from a 'merge' so we will
// silently let the new origin_domain win.
bool SetOriginDomain(Domain* origin_domain, MessageHandler* handler) {
if (origin_domain == origin_domain_) {
return true;
}
// Don't break old configs on this new consistency check
// for ModPagespeedMapOriginDomain. However,
// ModPagespeedMapProxyDomain has no legacy configuration, and
// in that context it's a functional problem to have the same
// proxy directory mapped to multiple origins, so we must fail
// the configuration.
if ((origin_domain_ != NULL) &&
(origin_domain_->is_proxy_ || origin_domain->is_proxy_)) {
if (handler != NULL) {
handler->Message(kError,
"ProxyDomain %s has conflicting origins %s and %s",
name_.c_str(),
origin_domain_->name_.c_str(),
origin_domain->name_.c_str());
}
return false;
}
MergeOrigin(origin_domain, handler);
if (rewrite_domain_ != NULL) {
rewrite_domain_->MergeOrigin(origin_domain_, handler);
}
return true;
}
bool SetProxyDomain(Domain* origin_domain, MessageHandler* handler) {
origin_domain->is_proxy_ = true;
return (SetOriginDomain(origin_domain, handler) &&
origin_domain->SetRewriteDomain(this, handler));
}
// handler==NULL means this is happening from a 'merge' so we will
// silently let the new rewrite_domain win.
bool SetShardFrom(Domain* rewrite_domain, MessageHandler* handler) {
if ((rewrite_domain_ != rewrite_domain) && (rewrite_domain_ != NULL)) {
if (handler != NULL) {
// We only treat this as an error when the handler is non-null. We
// use a null handler during merges, and will do the best we can
// to get correct behavior.
handler->Message(kError,
"Shard %s has conflicting rewrite_domain %s and %s",
name_.c_str(),
rewrite_domain_->name_.c_str(),
rewrite_domain->name_.c_str());
return false;
}
}
MergeOrigin(rewrite_domain->origin_domain_, handler);
rewrite_domain->shards_.push_back(this);
rewrite_domain_ = rewrite_domain;
return true;
}
void set_authorized(bool authorized) { authorized_ = authorized; }
int num_shards() const { return shards_.size(); }
void set_host_header(StringPiece x) { x.CopyToString(&host_header_); }
const GoogleString& host_header() const { return host_header_; }
// Indicates whether this domain is authorized when found in URLs
// HTML files are as direct requests to the web server. Domains
// get authorized by mentioning them in ModPagespeedDomain,
// ModPagespeedMapRewriteDomain, ModPagespeedShardDomain, and as
// the from-list in ModPagespeedMapOriginDomain. However, the target
// of ModPagespeedMapOriginDomain is not implicitly authoried --
// that may be 'localhost'.
bool authorized() const { return authorized_; }
Domain* shard(int shard_index) const { return shards_[shard_index]; }
bool is_proxy() const { return is_proxy_; }
void set_is_proxy(bool is_proxy) { is_proxy_ = is_proxy; }
GoogleString Signature() const {
GoogleString signature;
StrAppend(&signature, name_, "_",
authorized_ ? "_a" : "_n", "_");
// Assuming that there will be no cycle of Domains like Domain A has a
// rewrite domain to domain B which in turn have the original domain as A.
if (rewrite_domain_ != NULL) {
StrAppend(&signature, "R:", rewrite_domain_->name(), "_");
}
if (!host_header_.empty()) {
StrAppend(&signature, "H:", host_header_, "|");
}
if (origin_domain_ != NULL) {
StrAppend(&signature,
origin_domain_->is_proxy_ ? "P:" : "O:",
origin_domain_->name(), "_");
}
for (int index = 0; index < num_shards(); ++index) {
if (shards_[index] != NULL) {
StrAppend(&signature, "S:", shards_[index]->name(), "_");
}
}
return signature;
}
GoogleString ToString() const {
GoogleString output = name_;
if (authorized_) {
StrAppend(&output, " Auth");
}
if (rewrite_domain_ != NULL) {
StrAppend(&output,
is_proxy_ ? " ProxyDomain:" : " RewriteDomain:",
rewrite_domain_->name());
}
if (origin_domain_ != NULL) {
StrAppend(&output,
(origin_domain_->is_proxy_
? " ProxyOriginDomain:" : " OriginDomain:"),
origin_domain_->name());
}
if (!shards_.empty()) {
StrAppend(&output, " Shards:{");
for (int i = 0, n = shards_.size(); i < n; ++i) {
StrAppend(&output, (i == 0 ? "" : ", "), shards_[i]->name());
}
StrAppend(&output, "}");
}
if (!host_header_.empty()) {
StrAppend(&output, " HostHeader:", host_header_);
}
return output;
}
private:
Wildcard wildcard_;
GoogleString name_;
// The rewrite_domain, if non-null, gives the location of where this
// Domain should be rewritten. This can be used to move resources onto
// a CDN or onto a cookieless domain. We also use this pointer to
// get from shards back to the domain they were sharded from.
Domain* rewrite_domain_;
// The origin_domain, if non-null, gives the location of where
// resources should be fetched from by mod_pagespeed, in lieu of how
// it is specified in the HTML. This allows, for example, a CDN to
// fetch content from an origin domain, or an origin server behind a
// load-balancer to specify localhost or an IP address of a host to
// go to directly, skipping DNS resolution and reducing outbound
// traffic.
Domain* origin_domain_;
// Explicitly specified Host header for use with MapOriginDomain. When
// empty, this indicates that the domain specified in the URL argument
// to MapOrigin and MapOriginUrl should be used as the host header.
GoogleString host_header_;
// A rewrite_domain keeps track of all its shards.
DomainVector shards_;
bool authorized_;
// This boolean helps us prevent spinning through a cycle in the
// graph that can be expressed between shards and rewrite domains, e.g.
// ModPagespeedMapOriginDomain a b
// ModPagespeedMapRewriteDomain b c
// ModPagespeedAddShard b c
bool cycle_breadcrumb_;
// Identifies origin-domains that have been been used in
// AddProxyDomainMapping, and thus should not require a modified
// Host header when fetching resources.
bool is_proxy_;
};
DomainLawyer::~DomainLawyer() {
Clear();
}
bool DomainLawyer::AddDomain(const StringPiece& domain_name,
MessageHandler* handler) {
return (AddDomainHelper(domain_name, true, true, false, handler) != NULL);
}
bool DomainLawyer::AddKnownDomain(const StringPiece& domain_name,
MessageHandler* handler) {
return (AddDomainHelper(domain_name, false, false, false, handler) != NULL);
}
GoogleString DomainLawyer::NormalizeDomainName(const StringPiece& domain_name) {
// Ensure that the following specifications are treated identically:
// www.google.com/abc
// http://www.google.com/abc
// www.google.com/abc
// http://www.google.com/abc
// WWW.GOOGLE.COM/abc
// all come out the same, but distinct from
// www.google.com/Abc
// As the path component is case-sensitive.
//
// Example: domain-mapping domain-mapping
// http://musicasacra.lemon42.com/DE/evoscripts/musica_sacra/returnBinaryImage
// We need to case-fold only "musicasacra.lemon42.com" and not
// "returnBinaryImage" or "DE".
GoogleString domain_name_str;
static const char kSchemeDelim[] = "://";
stringpiece_ssize_type scheme_delim_start = domain_name.find(kSchemeDelim);
if (scheme_delim_start == StringPiece::npos) {
domain_name_str = StrCat("http://", domain_name);
scheme_delim_start = 4;
} else {
domain_name.CopyToString(&domain_name_str);
}
EnsureEndsInSlash(&domain_name_str);
// Lower-case all characters in the string, up until the "/" that terminates
// the hostname. We pass origin_start into the find() call to avoid tripping
// on the "/" in "http://".
GoogleString::size_type origin_start = scheme_delim_start +
STATIC_STRLEN(kSchemeDelim);
GoogleString::size_type slash = domain_name_str.find('/', origin_start);
DCHECK_NE(GoogleString::npos, slash);
for (char* p = &(domain_name_str[0]), *e = p + slash; p < e; ++p) {
*p = LowerChar(*p);
}
// For "https", any ":443" in the host is redundant; ditto for :80 and http.
StringPiece scheme(domain_name_str.data(), scheme_delim_start);
StringPiece origin(domain_name_str.data() + origin_start,
slash - origin_start);
if ((scheme == "https") && origin.ends_with(":443")) {
domain_name_str.erase(slash - 4, 4);
} else if ((scheme == "http") && origin.ends_with(":80")) {
domain_name_str.erase(slash - 3, 3);
}
return domain_name_str;
}
DomainLawyer::Domain* DomainLawyer::AddDomainHelper(
const StringPiece& domain_name, bool warn_on_duplicate,
bool authorize, bool is_proxy, MessageHandler* handler) {
if (domain_name.empty()) {
// handler will be NULL only when called from Merge, which should
// only have pre-validated (non-empty) domains. So it should not
// be possible to get here from Merge.
if (handler != NULL) {
handler->MessageS(kWarning, "Empty domain passed to AddDomain");
}
return NULL;
}
if (authorize && domain_name == "*") {
authorize_all_domains_ = true;
}
// TODO(matterbury): need better data structures to eliminate the O(N) logic:
// 1) Use a trie for domain_map_ as we need to find the domain whose trie
// path matches the beginning of the given domain_name since we no longer
// match just the domain name.
// 2) Use a better lookup structure for wildcard searching.
GoogleString domain_name_str = NormalizeDomainName(domain_name);
Domain* domain = NULL;
std::pair<DomainMap::iterator, bool> p = domain_map_.insert(
DomainMap::value_type(domain_name_str, domain));
DomainMap::iterator iter = p.first;
if (p.second) {
domain = new Domain(domain_name_str);
iter->second = domain;
if (domain->IsWildcarded()) {
wildcarded_domains_.push_back(domain);
}
} else {
domain = iter->second;
if (warn_on_duplicate && (authorize == domain->authorized())) {
handler->Message(kWarning, "AddDomain of domain already in map: %s",
domain_name_str.c_str());
domain = NULL;
}
}
if (domain != NULL) {
if (authorize) {
domain->set_authorized(true);
}
if (is_proxy) {
domain->set_is_proxy(true);
}
}
return domain;
}
// Looks up the Domain* object by name. From the Domain object
// we can tell if it's wildcarded, in which case it cannot be
// the 'to' field for a map, and whether resources from it should
// be mapped to a different domain, either for rewriting or for
// fetching.
DomainLawyer::Domain* DomainLawyer::FindDomain(const GoogleUrl& gurl) const {
// First do a quick lookup on the domain name only, since that's the most
// common case. Failing that, try searching for domain + path.
// TODO(matterbury): see AddDomainHelper for speed issues.
Domain* domain = NULL;
// There may be multiple entries in the map with the same domain,
// but varying paths. We want to choose the entry with the longest
// domain that prefix-matches GURL. So do the lookup starting
// with the entire origin+path, then shorten the string removing
// path components, looking for an exact match till we get to the origin
// with no path.
//
// TODO(jmarantz): IMO the best data structure for this is an explicit
// tree. That would allow starting from the top and searching down,
// rather than starting at the bottom and searching up, with each search
// a lookup over the entire set of domains.
GoogleString domain_path;
gurl.AllExceptLeaf().CopyToString(&domain_path);
StringPieceVector components;
SplitStringPieceToVector(gurl.PathSansLeaf(), "/", &components, false);
// PathSansLeaf gives something like "/a/b/c/" so after splitting with
// omit_empty_strings==false, the first and last elements are always
// present and empty.
//
// Note that the GURL can be 'about:blank' so be paranoid about getting
// what we expect.
if ((2U <= components.size()) &&
components[0].empty() &&
components[components.size() - 1].empty()) {
int component_size = 0;
for (int i = components.size() - 1; (domain == NULL) && (i >= 1); --i) {
domain_path.resize(domain_path.size() - component_size);
DCHECK(StringPiece(domain_path).ends_with("/"));
DomainMap::const_iterator p = domain_map_.find(domain_path);
if (p != domain_map_.end()) {
domain = p->second;
} else {
// Remove the path component. Consider input
// "http://a.com/x/yy/zzz/w". We will split PathSansLeaf, which
// is "/x/yy/zzz/", so we will get StringPieceVector ["", "x",
// "yy", "zzz", ""]. In the first iteration we want to consider
// the entire path in the search, so we initialize
// component_size to 0 above the loop. In the next iteration we
// want to chop off "zzz/" so we increment the component size by
// one to get rid of the slash. Note that we passed 'false'
// into SplitStringPieceToVector so if there are double-slashes
// they will show up as distinct components and we will get rid
// of them one at a time.
component_size = components[i - 1].size() + 1;
}
}
}
if (domain == NULL) {
for (int i = 0, n = wildcarded_domains_.size(); i < n; ++i) {
domain = wildcarded_domains_[i];
if (domain->Match(domain_path)) {
break;
} else {
domain = NULL;
}
}
}
return domain;
}
void DomainLawyer::FindDomainsRewrittenTo(
const GoogleUrl& original_url,
ConstStringStarVector* from_domains) const {
// TODO(rahulbansal): Make this more efficient by maintaining the map of
// rewrite_domain -> from_domains.
if (!original_url.IsWebValid()) {
LOG(ERROR) << "Invalid url " << original_url.Spec();
return;
}
GoogleString domain_name;
original_url.Origin().CopyToString(&domain_name);
EnsureEndsInSlash(&domain_name);
for (DomainMap::const_iterator p = domain_map_.begin();
p != domain_map_.end(); ++p) {
Domain* src_domain = p->second;
if (!src_domain->IsWildcarded() && (src_domain->rewrite_domain() != NULL) &&
domain_name == src_domain->rewrite_domain()->name()) {
from_domains->push_back(&src_domain->name());
}
}
}
bool DomainLawyer::MapRequestToDomain(
const GoogleUrl& original_request,
const StringPiece& resource_url, // relative to original_request
GoogleString* mapped_domain_name,
GoogleUrl* resolved_request,
MessageHandler* handler) const {
CHECK(original_request.IsAnyValid());
GoogleUrl original_origin(original_request.Origin());
resolved_request->Reset(original_request, resource_url);
bool ret = false;
// We can map a request to/from http/https.
if (resolved_request->IsWebValid()) {
GoogleUrl resolved_origin(resolved_request->Origin());
// Looks at the resolved domain name + path from the original request
// and the resource_url (which might override the original request).
// Gets the Domain* object out of that.
Domain* resolved_domain = FindDomain(*resolved_request);
// The origin domain is authorized by default.
if (resolved_origin == original_origin) {
resolved_origin.Spec().CopyToString(mapped_domain_name);
ret = true;
} else if (resolved_domain != NULL && resolved_domain->authorized()) {
if (resolved_domain->IsWildcarded()) {
// This is a sharded domain. We do not do the sharding in this function.
resolved_origin.Spec().CopyToString(mapped_domain_name);
} else {
*mapped_domain_name = resolved_domain->name();
}
ret = true;
}
// If we actually got a Domain* out of the lookups so far, then a
// mapping to a different rewrite_domain may be contained there. This
// helps move resources to CDNs or cookieless domains.
//
// Note that at this point, we are not really caring where we fetch
// from. We are only concerned here with what URLs we will write into
// HTML files. See MapOrigin below which is used to redirect fetch
// requests to a different domain (e.g. localhost).
if (ret && resolved_domain != NULL) {
Domain* mapped_domain = resolved_domain->rewrite_domain();
if (mapped_domain != NULL) {
CHECK(!mapped_domain->IsWildcarded());
CHECK(mapped_domain != resolved_domain);
*mapped_domain_name = mapped_domain->name();
GoogleUrl mapped_request;
ret = MapUrlHelper(*resolved_domain, *mapped_domain,
*resolved_request, &mapped_request);
if (ret) {
resolved_request->Swap(&mapped_request);
}
}
}
}
return ret;
}
bool DomainLawyer::IsDomainAuthorized(const GoogleUrl& original_request,
const GoogleUrl& domain_to_check) const {
if (authorize_all_domains_) {
return true;
}
bool ret = false;
if (domain_to_check.IsWebValid()) {
if (original_request.IsWebValid() &&
(original_request.Origin() == domain_to_check.Origin())) {
ret = true;
} else {
Domain* path_domain = FindDomain(domain_to_check);
ret = (path_domain != NULL) && path_domain->authorized();
}
}
return ret;
}
bool DomainLawyer::IsOriginKnown(const GoogleUrl& domain_to_check) const {
if (domain_to_check.IsWebValid()) {
Domain* path_domain = FindDomain(domain_to_check);
return (path_domain != NULL);
}
return false;
}
bool DomainLawyer::MapOrigin(const StringPiece& in, GoogleString* out,
GoogleString* host_header, bool* is_proxy) const {
GoogleUrl gurl(in);
return gurl.IsWebValid() && MapOriginUrl(gurl, out, host_header, is_proxy);
}
bool DomainLawyer::MapOriginUrl(const GoogleUrl& gurl,
GoogleString* out, GoogleString* host_header,
bool* is_proxy) const {
bool ret = false;
*is_proxy = false;
host_header->clear();
// We can map an origin to/from http/https.
if (gurl.IsWebValid()) {
ret = true;
gurl.Spec().CopyToString(out);
Domain* domain = FindDomain(gurl);
if (domain != NULL) {
Domain* origin_domain = domain->origin_domain();
if (origin_domain != NULL) {
GoogleUrl mapped_gurl;
if (MapUrlHelper(*domain, *origin_domain, gurl, &mapped_gurl)) {
mapped_gurl.Spec().CopyToString(out);
}
*is_proxy = origin_domain->is_proxy();
const GoogleString& origin_header = origin_domain->host_header();
if (!origin_header.empty()) {
*host_header = origin_header;
}
}
}
if (host_header->empty()) {
gurl.HostAndPort().CopyToString(host_header);
}
}
return ret;
}
bool DomainLawyer::MapUrlHelper(const Domain& from_domain,
const Domain& to_domain,
const GoogleUrl& gurl,
GoogleUrl* mapped_gurl) const {
CHECK(!to_domain.IsWildcarded());
GoogleUrl from_domain_gurl(from_domain.name());
StringPiece from_domain_path(from_domain_gurl.PathSansLeaf());
StringPiece path_and_leaf(gurl.PathAndLeaf());
DCHECK(path_and_leaf.starts_with(from_domain_path));
// Trim the URL's domain we came from based on how it was specifed in the
// from_domain. E.g. if you write
// ModPagespeedMap*Domain localhost/foo cdn.com/bar
// and the URL being mapped is
// http://cdn.com/bar/x
// then we set path_and_leaf to "x". This testcase gets hit in
// DomainLawyerTest.OriginAndExternWithPaths.
//
// Even if the from_domain has no subdirectory, we need to remove
// the leading slash to make it a relative reference and retain any
// subdirectory in the to_domain.
//
// Note: We must prepend "./" to make sure the path_and_leaf is not an
// absolute URL, which will cause problems below. For example:
// "http://www.example.com/data:image/jpeg" should be converted to the
// relative URL "./data:image/jpeg", not the absolute URL "data:image/jpeg".
GoogleString rel_url =
StrCat("./", path_and_leaf.substr(from_domain_path.size()));
// Make sure this isn't a valid absolute URL.
DCHECK(!GoogleUrl(rel_url).IsWebValid())
<< "URL " << gurl.Spec() << " is being mapped to absolute URL "
<< rel_url << " which will break many things.";
GoogleUrl to_domain_gurl(to_domain.name());
mapped_gurl->Reset(to_domain_gurl, rel_url);
return mapped_gurl->IsWebValid();
}
bool DomainLawyer::AddRewriteDomainMapping(
const StringPiece& to_domain_name,
const StringPiece& comma_separated_from_domains,
MessageHandler* handler) {
bool result = MapDomainHelper(to_domain_name, comma_separated_from_domains,
"" /* host_header */,
&Domain::SetRewriteDomain,
true /* allow_wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
can_rewrite_domains_ |= result;
return result;
}
bool DomainLawyer::DomainNameToTwoProtocols(
const StringPiece& domain_name,
GoogleString* http_url, GoogleString* https_url) {
*http_url = NormalizeDomainName(domain_name);
StringPiece http_url_piece(*http_url);
if (!http_url_piece.starts_with("http:")) {
return false;
}
*https_url = StrCat("https", http_url_piece.substr(4));
return true;
}
bool DomainLawyer::TwoProtocolDomainHelper(
const StringPiece& to_domain_name,
const StringPiece& from_domain_name,
const StringPiece& host_header,
SetDomainFn set_domain_fn,
bool authorize,
MessageHandler* handler) {
GoogleString http_to_url, http_from_url, https_to_url, https_from_url;
if (!DomainNameToTwoProtocols(to_domain_name, &http_to_url, &https_to_url)) {
return false;
}
if (!DomainNameToTwoProtocols(from_domain_name,
&http_from_url, &https_from_url)) {
return false;
}
if (!MapDomainHelper(http_to_url, http_from_url,
host_header,
set_domain_fn,
false, /* allow_wildcards */
false, /* allow_map_to_https */
authorize, handler)) {
return false;
}
if (!MapDomainHelper(https_to_url, https_from_url,
host_header,
set_domain_fn,
false, /* allow_wildcards */
true, /* allow_map_to_https */
authorize, handler)) {
// Note that we still retain the http domain mapping in this case.
return false;
}
return true;
}
bool DomainLawyer::AddTwoProtocolRewriteDomainMapping(
const StringPiece& to_domain_name,
const StringPiece& from_domain_name,
MessageHandler* handler) {
bool result = TwoProtocolDomainHelper(to_domain_name, from_domain_name,
"" /* host_header */,
&Domain::SetRewriteDomain,
true /*authorize */, handler);
can_rewrite_domains_ |= result;
return result;
}
bool DomainLawyer::AddOriginDomainMapping(
const StringPiece& to_domain_name,
const StringPiece& comma_separated_from_domains,
const StringPiece& host_header,
MessageHandler* handler) {
return MapDomainHelper(to_domain_name, comma_separated_from_domains,
host_header,
&Domain::SetOriginDomain,
true /* allow_wildcards */,
true /* allow_map_to_https */,
false /* authorize */,
handler);
}
bool DomainLawyer::AddProxyDomainMapping(
const StringPiece& proxy_domain_name,
const StringPiece& origin_domain_name,
const StringPiece& to_domain_name,
MessageHandler* handler) {
bool result;
if (to_domain_name.empty()) {
// 1. Rewrite from origin_domain to proxy_domain.
// 2. Set origin_domain->is_proxy = true.
// 3. Map origin from proxy_domain to origin_domain.
result = MapDomainHelper(origin_domain_name, proxy_domain_name,
"" /* host_header */,
&Domain::SetProxyDomain,
false /* allow_wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
} else {
// 1. Rewrite from origin_domain to to_domain.
// 2. Set origin_domain->is_proxy = true.
// 3. Map origin from to_domain to origin_domain.
result = MapDomainHelper(origin_domain_name, to_domain_name,
"" /* host_header */,
&Domain::SetProxyDomain,
false /* allow_wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
// 4. Rewrite from proxy_domain to to_domain. This way when the CDN asks us
// for resources on proxy_domain it knows to use the CDN domain for the
// cache key.
result &= MapDomainHelper(to_domain_name, proxy_domain_name,
"" /* host_header */,
&Domain::SetRewriteDomain,
false /* allow_wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
// 5. Map origin from proxy_domain to origin_domain. This tells the proxy
// how to fetch files from the origin for reconstruction.
result &= MapDomainHelper(origin_domain_name, proxy_domain_name,
"" /* host_header */,
&Domain::SetOriginDomain,
false /* allow wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
}
return result;
}
bool DomainLawyer::AddTwoProtocolOriginDomainMapping(
const StringPiece& to_domain_name,
const StringPiece& from_domain_name,
const StringPiece& host_header,
MessageHandler* handler) {
return TwoProtocolDomainHelper(to_domain_name, from_domain_name,
host_header,
&Domain::SetOriginDomain,
false /*authorize */, handler);
}
bool DomainLawyer::AddShard(
const StringPiece& shard_domain_name,
const StringPiece& comma_separated_shards,
MessageHandler* handler) {
bool result = MapDomainHelper(shard_domain_name, comma_separated_shards,
"" /* host_header */,
&Domain::SetShardFrom,
false /* allow_wildcards */,
true /* allow_map_to_https */,
true /* authorize */,
handler);
can_rewrite_domains_ |= result;
return result;
}
bool DomainLawyer::IsSchemeSafeToMapTo(const StringPiece& domain_name,
bool allow_https_scheme) {
// The scheme defaults to http so that's the same as explicitly saying http.
return (domain_name.find("://") == GoogleString::npos ||
domain_name.starts_with("http://") ||
(allow_https_scheme && domain_name.starts_with("https://")));
}
bool DomainLawyer::MapDomainHelper(
const StringPiece& to_domain_name,
const StringPiece& comma_separated_from_domains,
const StringPiece& host_header,
SetDomainFn set_domain_fn,
bool allow_wildcards,
bool allow_map_to_https,
bool authorize_to_domain,
MessageHandler* handler) {
if (!IsSchemeSafeToMapTo(to_domain_name, allow_map_to_https)) {
return false;
}
Domain* to_domain = AddDomainHelper(to_domain_name, false,
authorize_to_domain, false, handler);
if (to_domain == NULL) {
return false;
}
bool ret = false;
bool mapped_a_domain = false;
if (to_domain->IsWildcarded()) {
handler->Message(kError, "Cannot map to a wildcarded domain: %s",
to_domain_name.as_string().c_str());
} else {
GoogleUrl to_url(to_domain->name());
StringPieceVector domains;
SplitStringPieceToVector(comma_separated_from_domains, ",", &domains, true);
ret = true;
for (int i = 0, n = domains.size(); i < n; ++i) {
const StringPiece& domain_name = domains[i];
Domain* from_domain = AddDomainHelper(domain_name, false, true, false,
handler);
if (from_domain != NULL) {
GoogleUrl from_url(from_domain->name());
if (to_url.Origin() == from_url.Origin()) {
// Ignore requests to map to the same scheme://hostname:port/.
} else if (!allow_wildcards && from_domain->IsWildcarded()) {
handler->Message(kError, "Cannot map from a wildcarded domain: %s",
to_domain_name.as_string().c_str());
ret = false;
} else {
bool ok = (from_domain->*set_domain_fn)(to_domain, handler);
ret &= ok;
mapped_a_domain |= ok;
}
}
}
DCHECK(host_header.empty() || !to_domain->is_proxy())
<< "It makes no sense to specify a host header for a proxy:"
<< host_header << ", " << to_domain_name;
to_domain->set_host_header(host_header);
}
return (ret && mapped_a_domain);
}
DomainLawyer::Domain* DomainLawyer::CloneAndAdd(const Domain* src) {
Domain* dst = AddDomainHelper(src->name(), false, src->authorized(),
src->is_proxy(), NULL);
dst->set_host_header(src->host_header());
return dst;
}
void DomainLawyer::Merge(const DomainLawyer& src) {
int num_existing_wildcards = num_wildcarded_domains();
for (DomainMap::const_iterator
p = src.domain_map_.begin(),
e = src.domain_map_.end();
p != e; ++p) {
Domain* src_domain = p->second;
Domain* dst_domain = CloneAndAdd(src_domain);
Domain* src_rewrite_domain = src_domain->rewrite_domain();
if (src_rewrite_domain != NULL) {
dst_domain->SetRewriteDomain(CloneAndAdd(src_rewrite_domain), NULL);
}
Domain* src_origin_domain = src_domain->origin_domain();
if (src_origin_domain != NULL) {
dst_domain->SetOriginDomain(CloneAndAdd(src_origin_domain), NULL);
}
for (int i = 0; i < src_domain->num_shards(); ++i) {
Domain* src_shard = src_domain->shard(i);
Domain* dst_shard = CloneAndAdd(src_shard);
dst_shard->SetShardFrom(dst_domain, NULL);
}
}
// Remove the wildcards we just added in map order, and instead add them
// in the order they were in src.wildcarded_domains.
wildcarded_domains_.resize(num_existing_wildcards);
std::set<Domain*> dup_detector(wildcarded_domains_.begin(),
wildcarded_domains_.end());
for (int i = 0, n = src.wildcarded_domains_.size(); i < n; ++i) {
Domain* src_domain = src.wildcarded_domains_[i];
DomainMap::const_iterator p = domain_map_.find(src_domain->name());
if (p == domain_map_.end()) {
LOG(DFATAL) << "Domain " << src_domain->name() << " not found in dst";
} else {
Domain* dst_domain = p->second;
if (dup_detector.find(dst_domain) == dup_detector.end()) {
wildcarded_domains_.push_back(dst_domain);
}
}
}
can_rewrite_domains_ |= src.can_rewrite_domains_;
authorize_all_domains_ |= src.authorize_all_domains_;
if (!src.proxy_suffix_.empty()) {
if (!proxy_suffix_.empty() && (proxy_suffix_ != src.proxy_suffix_)) {
LOG(WARNING)
<< "Merging incompatible proxy suffixes " << proxy_suffix_ << " and "
<< src.proxy_suffix_;
}
proxy_suffix_ = src.proxy_suffix_;
}
}
bool DomainLawyer::ShardDomain(const StringPiece& domain_name,
uint32 hash,
GoogleString* sharded_domain) const {
GoogleUrl domain_gurl(NormalizeDomainName(domain_name));
Domain* domain = FindDomain(domain_gurl);
bool sharded = false;
if (domain != NULL) {
if (domain->num_shards() != 0) {
int shard_index = hash % domain->num_shards();
domain = domain->shard(shard_index);
*sharded_domain = domain->name();
sharded = true;
}
}
return sharded;
}
bool DomainLawyer::WillDomainChange(const GoogleUrl& gurl) const {
Domain* domain = FindDomain(gurl), *mapped_domain = domain;
if (domain != NULL) {
// First check a mapping based on AddRewriteDomainMapping.
mapped_domain = domain->rewrite_domain();
if (mapped_domain == NULL) {
// Even if there was no AddRewriteDomainMapping for this domain, there
// may still have been shards.
mapped_domain = domain;
}
// Now check mappings from the shard.
if (mapped_domain->num_shards() != 0) {
if (mapped_domain->num_shards() == 1) {
// Usually we don't expect exactly one shard, but if there is,
// we know exactly what it will be.
mapped_domain = mapped_domain->shard(0);
} else {
// We don't have enough data in this function to determine what
// the shard index will be, so we assume pessimistically that
// the domain will change.
//
// TODO(jmarantz): rename this method to MayDomainChange, or
// pass in the sharding index.
mapped_domain = NULL;
}
}
}
return domain != mapped_domain;
}
bool DomainLawyer::IsProxyMapped(const GoogleUrl& gurl) const {
Domain* domain = FindDomain(gurl);
if (domain != NULL) {
Domain* origin = domain->origin_domain();
if ((origin != NULL) && origin->is_proxy()) {
return true;
}
}
return false;
}
bool DomainLawyer::DoDomainsServeSameContent(
const StringPiece& domain1_name, const StringPiece& domain2_name) const {
GoogleUrl domain1_gurl(NormalizeDomainName(domain1_name));
Domain* domain1 = FindDomain(domain1_gurl);
GoogleUrl domain2_gurl(NormalizeDomainName(domain2_name));
Domain* domain2 = FindDomain(domain2_gurl);
if ((domain1 == NULL) || (domain2 == NULL)) {
return false;
}
if (domain1 == domain2) {
return true;
}
Domain* rewrite1 = domain1->rewrite_domain();
Domain* rewrite2 = domain2->rewrite_domain();
if ((rewrite1 == domain2) || (rewrite2 == domain1)) {
return true;
}
if ((rewrite1 != NULL) && (rewrite1 == rewrite2)) {
return true;
}
return false;
}
GoogleString DomainLawyer::Signature() const {
GoogleString signature;
for (DomainMap::const_iterator iterator = domain_map_.begin();
iterator != domain_map_.end(); ++iterator) {
StrAppend(&signature, "D:", iterator->second->Signature(), "-");
}
if (!proxy_suffix_.empty()) {
StrAppend(&signature, ",PS:", proxy_suffix_);
}
return signature;
}
GoogleString DomainLawyer::ToString(StringPiece line_prefix) const {
GoogleString output;
for (DomainMap::const_iterator iterator = domain_map_.begin();
iterator != domain_map_.end(); ++iterator) {
StrAppend(&output, line_prefix, iterator->second->ToString(), "\n");
}
if (!proxy_suffix_.empty()) {
StrAppend(&output, "Proxy Suffix: ", proxy_suffix_);
}
return output;
}
void DomainLawyer::Clear() {
STLDeleteValues(&domain_map_);
can_rewrite_domains_ = false;
authorize_all_domains_ = false;
wildcarded_domains_.clear();
proxy_suffix_.clear();
}
bool DomainLawyer::StripProxySuffix(const GoogleUrl& gurl, GoogleString* url,
GoogleString* host) const {
bool ret = false;
if (gurl.IsWebValid() && !proxy_suffix_.empty()) {
StringPiece host_and_port = gurl.HostAndPort();
if (host_and_port.ends_with(proxy_suffix_)) {
host_and_port.remove_suffix(proxy_suffix_.size());
host_and_port.CopyToString(host); // Remove any other port, I suppose.
*url = StrCat(gurl.Scheme(), "://", host_and_port, gurl.PathAndLeaf());
ret = true;
}
}
return ret;
}
bool DomainLawyer::AddProxySuffix(const GoogleUrl& base_url,
GoogleString* href) const {
// Let's say we have a proxy-prefix of ".suffix". When we visit
// http://www.example.com.suffix, we can leave relative URLs alone
// in hyperlinkes. However, if we see an absolute link to
// http://www.example.com/foo or http://foo.www.example.com/bar then
// we want to add the suffix to the hyperlink attribute.
StringPiece base_host = base_url.Host();
if (!proxy_suffix_.empty() && base_host.ends_with(proxy_suffix_)) {
// Remove the suffix from the host so we can find a-tag references to it.
StringPiece base_host_no_suffix = base_host.substr(
0, base_host.size() - proxy_suffix_.size());
GoogleUrl href_gurl(base_url, *href);
// Note that we purposefully do not check schemes here since we want to
// permit redirects from http:// to https:// (and likewise inclusion of
// resources).
if (href_gurl.IsWebValid() && base_url.IsWebValid()) {
StringPiece href_domain, base_domain;
StringPiece href_host = href_gurl.Host();
if (href_host == base_host_no_suffix) {
// TODO(jmarantz): handle alternate ports.
*href = StrCat(href_gurl.Scheme(), "://", base_host,
href_gurl.PathAndLeaf());
return true;
} else if (domain_registry::MinimalPrivateSuffix(href_host) ==
domain_registry::MinimalPrivateSuffix(base_host_no_suffix)) {
*href = StrCat(href_gurl.Scheme(), "://",
href_host, proxy_suffix_,
href_gurl.PathAndLeaf());
return true;
}
}
}
return false;
}
} // namespace net_instaweb