// blob: 44846f06d75971fc4c882a8993f7b28d87212539 [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
#include "net/instaweb/automatic/public/proxy_interface.h"
#include "base/logging.h"
#include "net/instaweb/automatic/public/blink_flow_critical_line.h"
#include "net/instaweb/automatic/public/cache_html_flow.h"
#include "net/instaweb/automatic/public/flush_early_flow.h"
#include "net/instaweb/automatic/public/proxy_fetch.h"
#include "net/instaweb/http/public/async_fetch.h"
#include "net/instaweb/http/public/content_type.h"
#include "net/instaweb/http/public/log_record.h"
#include "net/instaweb/http/public/logging_proto_impl.h"
#include "net/instaweb/http/public/meta_data.h"
#include "net/instaweb/http/public/request_context.h"
#include "net/instaweb/http/public/request_headers.h"
#include "net/instaweb/http/public/response_headers.h"
#include "net/instaweb/http/public/user_agent_matcher.h"
#include "net/instaweb/rewriter/public/blink_util.h"
#include "net/instaweb/rewriter/public/furious_matcher.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "net/instaweb/rewriter/public/resource_fetch.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/url_namer.h"
#include "net/instaweb/util/public/basictypes.h"
#include "net/instaweb/util/public/google_url.h"
#include "net/instaweb/util/public/hostname_util.h"
#include "net/instaweb/util/public/property_cache.h"
#include "net/instaweb/util/public/ref_counted_ptr.h"
#include "net/instaweb/util/public/request_trace.h"
#include "net/instaweb/util/public/scoped_ptr.h"
#include "net/instaweb/util/public/statistics.h"
#include "net/instaweb/util/public/string.h"
#include "net/instaweb/util/public/string_util.h"
#include "net/instaweb/util/public/thread_system.h"
namespace net_instaweb {
// Forward declarations for types that this file only handles through
// pointers.
class AbstractMutex;
class MessageHandler;
// Names of the blink-related timed statistics variables.  They are registered
// in ProxyInterface::InitStats() and looked up by name in the ProxyInterface
// constructor.
const char ProxyInterface::kBlinkRequestCount[] = "blink-requests";
const char ProxyInterface::kBlinkCriticalLineRequestCount[] =
"blink-critical-line-requests";
namespace {
// Names for Statistics variables that are private to this file.  The
// blink-related names live on the class (ProxyInterface::kBlinkRequestCount
// and ProxyInterface::kBlinkCriticalLineRequestCount), so no file-local copy
// of them is kept here.
const char kTotalRequestCount[] = "all-requests";
const char kPagespeedRequestCount[] = "pagespeed-requests";
const char kRejectedRequestCount[] = "publisher-rejected-requests";

// Body written to the client when a request is rejected by the options.
const char kRejectedRequestHtmlResponse[] = "Unable to serve "
    "content as the content is blocked by the administrator of the domain.";
bool UrlMightHavePropertyCacheEntry(const GoogleUrl& url) {
const ContentType* type = NameExtensionToContentType(url.LeafSansQuery());
if (type == NULL) {
return true; // http://www.example.com/ -- no extension; could be HTML.
}
// Use a complete switch-statement rather than type()->IsHtmlLike()
// so that every time we add a new content-type we make an explicit
// decision about whether it should induce a pcache read.
//
// TODO(jmarantz): currently this returns false for ".txt". Thus we will
// do no optimizations relying on property-cache on HTML files ending with
// ".txt". We should determine whether this is the right thing or not.
switch (type->type()) {
case ContentType::kHtml:
case ContentType::kXhtml:
case ContentType::kCeHtml:
return true;
case ContentType::kJavascript:
case ContentType::kCss:
case ContentType::kText:
case ContentType::kXml:
case ContentType::kPng:
case ContentType::kGif:
case ContentType::kJpeg:
case ContentType::kSwf:
case ContentType::kWebp:
case ContentType::kIco:
case ContentType::kPdf:
case ContentType::kOther:
case ContentType::kJson:
case ContentType::kVideo:
case ContentType::kOctetStream:
return false;
}
LOG(DFATAL) << "URL " << url.Spec() << ": unexpected type:" << type->type()
<< "; " << type->mime_type() << "; " << type->file_extension();
return false;
}
bool HasRejectedHeader(const StringPiece& header_name,
const RequestHeaders* request_headers,
const RewriteOptions* options) {
ConstStringStarVector header_values;
if (request_headers->Lookup(header_name, &header_values)) {
for (int i = 0, n = header_values.size(); i < n; ++i) {
if (options->IsRejectedRequest(header_name, *header_values[i])) {
return true;
}
}
}
return false;
}
// Provides a callback whose Done() function is executed once we have
// rewrite options.
//
// The object only stores the arguments it was constructed with so that Done()
// can forward them, together with the options it receives, to
// ProxyInterface::ProxyRequestCallback().  It deletes itself at the end of
// Done(), so it must be heap-allocated and must not be touched afterwards.
class ProxyInterfaceUrlNamerCallback : public UrlNamer::Callback {
 public:
  // None of the pointers are dereferenced or deleted by this class; they are
  // handed on unchanged to ProxyRequestCallback().
  ProxyInterfaceUrlNamerCallback(
      bool is_resource_fetch,
      GoogleUrl* request_url,
      AsyncFetch* async_fetch,
      ProxyInterface* proxy_interface,
      RewriteOptions* query_options,
      MessageHandler* handler)
      : is_resource_fetch_(is_resource_fetch),
        request_url_(request_url),
        async_fetch_(async_fetch),
        handler_(handler),
        proxy_interface_(proxy_interface),
        query_options_(query_options) {
  }

  virtual ~ProxyInterfaceUrlNamerCallback() {}

  // Forwards the stored request state plus |rewrite_options| to the proxy
  // interface, then destroys this callback.
  virtual void Done(RewriteOptions* rewrite_options) {
    proxy_interface_->ProxyRequestCallback(
        is_resource_fetch_, request_url_, async_fetch_, rewrite_options,
        query_options_, handler_);
    delete this;
  }

 private:
  bool is_resource_fetch_;
  GoogleUrl* request_url_;
  AsyncFetch* async_fetch_;
  MessageHandler* handler_;
  ProxyInterface* proxy_interface_;
  RewriteOptions* query_options_;

  DISALLOW_COPY_AND_ASSIGN(ProxyInterfaceUrlNamerCallback);
};
} // namespace
// Builds a proxy interface for the server identified by |hostname| and |port|.
// The message handler is taken from |manager|, and the timed statistics
// variables are looked up in |stats| under the same names that InitStats()
// registers.  fetcher_ and timer_ start out as NULL.
ProxyInterface::ProxyInterface(const StringPiece& hostname, int port,
ServerContext* manager,
Statistics* stats)
: server_context_(manager),
fetcher_(NULL),
timer_(NULL),
handler_(manager->message_handler()),
hostname_(hostname.as_string()),
port_(port),
all_requests_(stats->GetTimedVariable(kTotalRequestCount)),
pagespeed_requests_(stats->GetTimedVariable(kPagespeedRequestCount)),
blink_requests_(stats->GetTimedVariable(kBlinkRequestCount)),
blink_critical_line_requests_(
stats->GetTimedVariable(kBlinkCriticalLineRequestCount)),
rejected_requests_(stats->GetTimedVariable(kRejectedRequestCount)) {
// The factory that creates proxy fetches shares the same server context.
proxy_fetch_factory_.reset(new ProxyFetchFactory(manager));
}
// Empty body: no explicit cleanup is performed here.
ProxyInterface::~ProxyInterface() {
}
// Registers every timed statistics variable used by ProxyInterface under
// ServerContext::kStatisticsGroup, then lets the flows started from this file
// register their own statistics.
void ProxyInterface::InitStats(Statistics* statistics) {
  // Registration order is significant only in that it is kept stable.
  const char* const timed_variable_names[] = {
    kTotalRequestCount,
    kPagespeedRequestCount,
    kBlinkRequestCount,
    kBlinkCriticalLineRequestCount,
    kRejectedRequestCount,
  };
  const int num_names = static_cast<int>(
      sizeof(timed_variable_names) / sizeof(timed_variable_names[0]));
  for (int i = 0; i < num_names; ++i) {
    statistics->AddTimedVariable(timed_variable_names[i],
                                 ServerContext::kStatisticsGroup);
  }

  BlinkFlowCriticalLine::InitStats(statistics);
  CacheHtmlFlow::InitStats(statistics);
  FlushEarlyFlow::InitStats(statistics);
}
bool ProxyInterface::IsWellFormedUrl(const GoogleUrl& url) {
bool ret = false;
if (url.is_valid()) {
if (url.has_path()) {
StringPiece path = url.PathAndLeaf();
GoogleString filename = url.ExtractFileName();
int path_len = path.size() - filename.size();
if (path_len >= 0) {
ret = true;
}
} else if (!url.has_scheme()) {
LOG(ERROR) << "URL has no scheme: " << url.Spec();
} else {
LOG(ERROR) << "URL has no path: " << url.Spec();
}
}
return ret;
}
bool ProxyInterface::UrlAndPortMatchThisServer(const GoogleUrl& url) {
bool ret = false;
if (url.is_valid() && (url.EffectiveIntPort() == port_)) {
// TODO(atulvasu): This should support matching the actual host this
// machine can receive requests from. Ideally some flag control would
// help. For example this server could be running multiple virtual
// servers, and we would like to know what server we are catering to for
// pagespeed only queries.
//
// Allow for exact hostname matches, as well as a URL typed into the
// browser window like "exeda.cam", which should match
// "exeda.cam.corp.google.com".
StringPiece host = url.Host();
if (IsLocalhost(host, hostname_) ||
StringPiece(hostname_).starts_with(StrCat(host, "."))) {
ret = true;
}
}
return ret;
}
void ProxyInterface::Fetch(const GoogleString& requested_url_string,
MessageHandler* handler,
AsyncFetch* async_fetch) {
const GoogleUrl requested_url(requested_url_string);
const bool is_get_or_head =
(async_fetch->request_headers()->method() == RequestHeaders::kGet) ||
(async_fetch->request_headers()->method() == RequestHeaders::kHead);
all_requests_->IncBy(1);
if (!(requested_url.is_valid() && IsWellFormedUrl(requested_url))) {
LOG(WARNING) << "Bad URL, failing request: " << requested_url_string;
async_fetch->response_headers()->SetStatusAndReason(HttpStatus::kNotFound);
async_fetch->Done(false);
} else {
// Try to handle this as a .pagespeed. resource.
if (server_context_->IsPagespeedResource(requested_url) &&
is_get_or_head) {
pagespeed_requests_->IncBy(1);
LOG(INFO) << "Serving URL as pagespeed resource: "
<< requested_url.Spec();
ProxyRequest(true, requested_url, async_fetch, handler);
} else if (UrlAndPortMatchThisServer(requested_url)) {
// Just respond with a 404 for now.
async_fetch->response_headers()->SetStatusAndReason(
HttpStatus::kNotFound);
LOG(INFO) << "Returning 404 for URL: " << requested_url.Spec();
async_fetch->Done(false);
} else {
// Otherwise we proxy it (rewriting if it is HTML).
LOG(INFO) << "Proxying URL normally: " << requested_url.Spec();
ProxyRequest(false, requested_url, async_fetch, handler);
}
}
}
void ProxyInterface::ProxyRequest(bool is_resource_fetch,
const GoogleUrl& request_url,
AsyncFetch* async_fetch,
MessageHandler* handler) {
scoped_ptr<GoogleUrl> gurl(new GoogleUrl);
gurl->Reset(request_url);
// Stripping ModPagespeed query params before the property cache lookup to
// make cache key consistent for both lookup and storing in cache.
ServerContext::OptionsBoolPair query_options_success =
server_context_->GetQueryOptions(gurl.get(),
async_fetch->request_headers(),
NULL);
if (!query_options_success.second) {
async_fetch->response_headers()->SetStatusAndReason(
HttpStatus::kMethodNotAllowed);
async_fetch->Write("Invalid PageSpeed query-params/request headers",
handler);
async_fetch->Done(false);
return;
}
// Owned by ProxyInterfaceUrlNamerCallback.
GoogleUrl* released_gurl(gurl.release());
ProxyInterfaceUrlNamerCallback* proxy_interface_url_namer_callback =
new ProxyInterfaceUrlNamerCallback(is_resource_fetch, released_gurl,
async_fetch, this,
query_options_success.first, handler);
server_context_->url_namer()->DecodeOptions(
*released_gurl, *async_fetch->request_headers(),
proxy_interface_url_namer_callback, handler);
}
// Starts the property-cache lookups that apply to this request.
//
// Page property cache lookups (one per device type) are issued only for
// non-resource GET requests whose URL might be HTML and only when the page
// property cache is enabled.  A client property cache lookup is issued when
// the request carries a client-id header and the client property cache is
// enabled.
//
// |options| may be NULL.  |async_fetch| must not be NULL.  If
// |added_page_property_callback| is non-NULL it is set to true when page
// property lookups were issued; it is never set to false here.
//
// Returns the collector that gathers the lookup results, released from its
// scoped_ptr, or NULL if no lookup was started.
ProxyFetchPropertyCallbackCollector*
    ProxyInterface::InitiatePropertyCacheLookup(
    bool is_resource_fetch,
    const GoogleUrl& request_url,
    RewriteOptions* options,
    AsyncFetch* async_fetch,
    bool* added_page_property_callback) {
  RequestContextPtr request_ctx = async_fetch->request_context();
  DCHECK(request_ctx.get() != NULL);
  if (request_ctx->root_trace_context() != NULL) {
    request_ctx->root_trace_context()->TracePrintf(
        "PropertyCache lookup start");
  }
  StringPiece user_agent =
      async_fetch->request_headers()->Lookup1(HttpAttributes::kUserAgent);
  scoped_ptr<ProxyFetchPropertyCallbackCollector> callback_collector(
      new ProxyFetchPropertyCallbackCollector(
          server_context_, request_url.Spec(), request_ctx, options,
          user_agent));
  bool added_callback = false;
  PropertyPageStarVector property_callbacks;
  ProxyFetchPropertyCallback* client_callback = NULL;
  PropertyCache* page_property_cache = server_context_->page_property_cache();
  PropertyCache* client_property_cache =
      server_context_->client_property_cache();

  if (!is_resource_fetch &&
      page_property_cache->enabled() &&
      UrlMightHavePropertyCacheEntry(request_url) &&
      async_fetch->request_headers()->method() == RequestHeaders::kGet) {
    if (options != NULL) {
      // The options are passed to GetPagePropertyCacheKey() below, so their
      // signature is computed first.
      server_context_->ComputeSignature(options);
    }
    // One page-property callback per device type.
    for (int i = 0;
         i < static_cast<int>(UserAgentMatcher::kEndOfDeviceType);
         ++i) {
      UserAgentMatcher::DeviceType device_type =
          static_cast<UserAgentMatcher::DeviceType>(i);
      AbstractMutex* mutex = server_context_->thread_system()->NewMutex();
      const StringPiece& device_type_suffix =
          UserAgentMatcher::DeviceTypeSuffix(device_type);
      GoogleString page_key = server_context_->GetPagePropertyCacheKey(
          request_url.Spec(), options, device_type_suffix);
      ProxyFetchPropertyCallback* property_callback =
          new ProxyFetchPropertyCallback(
              ProxyFetchPropertyCallback::kPagePropertyCache,
              *page_property_cache,
              page_key, device_type, callback_collector.get(), mutex);
      property_callbacks.push_back(property_callback);
      callback_collector->AddCallback(property_callback);
    }
    added_callback = true;
    if (added_page_property_callback != NULL) {
      *added_page_property_callback = true;
    }
  }

  // Initiate client property cache lookup.  |async_fetch| has already been
  // dereferenced above, so no NULL check is needed here.
  const char* client_id = async_fetch->request_headers()->Lookup1(
      HttpAttributes::kXGooglePagespeedClientId);
  if (client_id != NULL && client_property_cache->enabled()) {
    AbstractMutex* mutex = server_context_->thread_system()->NewMutex();
    client_callback = new ProxyFetchPropertyCallback(
        ProxyFetchPropertyCallback::kClientPropertyCache,
        *client_property_cache, client_id,
        UserAgentMatcher::kEndOfDeviceType,
        callback_collector.get(), mutex);
    callback_collector->AddCallback(client_callback);
    added_callback = true;
  }

  // All callbacks need to be registered before Reads to avoid race.
  if (!property_callbacks.empty()) {
    page_property_cache->MultiRead(&property_callbacks);
  }
  if (client_callback != NULL) {
    client_property_cache->Read(client_callback);
  }

  if (!added_callback) {
    // Nothing was looked up: drop the collector so that NULL is returned.
    callback_collector.reset(NULL);
  }
  return callback_collector.release();
}
void ProxyInterface::ProxyRequestCallback(
bool is_resource_fetch,
GoogleUrl* url,
AsyncFetch* async_fetch,
RewriteOptions* domain_options,
RewriteOptions* query_options,
MessageHandler* handler) {
scoped_ptr<GoogleUrl> request_url(url);
RewriteOptions* options = server_context_->GetCustomOptions(
async_fetch->request_headers(), domain_options, query_options);
GoogleString url_string;
RequestHeaders* request_headers = async_fetch->request_headers();
request_url->Spec().CopyToString(&url_string);
if ((options != NULL) &&
(options->IsRejectedUrl(url_string) ||
HasRejectedHeader(
HttpAttributes::kUserAgent, request_headers, options) ||
HasRejectedHeader(
HttpAttributes::kXForwardedFor, request_headers, options))) {
rejected_requests_->IncBy(1);
ResponseHeaders* response_headers = async_fetch->response_headers();
response_headers->SetStatusAndReason(HttpStatus::kProxyDeclinedRequest);
response_headers->Replace(HttpAttributes::kContentType,
kContentTypeText.mime_type());
response_headers->Replace(HttpAttributes::kCacheControl,
"private, max-age=0");
async_fetch->Write(kRejectedRequestHtmlResponse, handler);
async_fetch->Done(false);
delete options;
return;
}
scoped_ptr<ProxyFetchPropertyCallbackCollector> property_callback;
// Update request_headers.
// We deal with encodings. So strip the users Accept-Encoding headers.
async_fetch->request_headers()->RemoveAll(HttpAttributes::kAcceptEncoding);
// Note: We preserve the User-Agent and Cookies so that the origin servers
// send us the correct HTML. We will need to consider this for caching HTML.
// Start fetch and rewrite. If GetCustomOptions found options for us,
// the RewriteDriver created by StartNewProxyFetch will take ownership.
if (is_resource_fetch) {
// TODO(sligocki): Set using_spdy appropriately.
bool using_spdy = false;
// TODO(pulkitg): Set is_original_resource_cacheable to false if pagespeed
// resource is not cacheable.
ResourceFetch::Start(*request_url, options, using_spdy,
server_context_, async_fetch);
} else {
// TODO(nforman): If we are not running an experiment, remove the
// furious cookie.
// If we don't already have custom options, and the global options
// say we're running furious, then clone them into custom_options so we
// can manipulate custom options without affecting the global options.
if (options == NULL) {
RewriteOptions* global_options = server_context_->global_options();
if (global_options->running_furious()) {
options = global_options->Clone();
}
}
// TODO(anupama): Adapt the below furious experiment logic for
// FlushEarlyFlow as well.
bool need_to_store_experiment_data = false;
if (options != NULL && options->running_furious()) {
need_to_store_experiment_data = server_context_->furious_matcher()->
ClassifyIntoExperiment(*async_fetch->request_headers(), options);
options->set_need_to_store_experiment_data(need_to_store_experiment_data);
}
const char* user_agent = async_fetch->request_headers()->Lookup1(
HttpAttributes::kUserAgent);
bool is_blink_request = BlinkUtil::IsBlinkRequest(
*request_url, async_fetch, options, user_agent,
server_context_->user_agent_matcher(),
RewriteOptions::kPrioritizeVisibleContent);
bool apply_blink_critical_line =
BlinkUtil::ShouldApplyBlinkFlowCriticalLine(server_context_,
options);
bool page_callback_added = false;
property_callback.reset(InitiatePropertyCacheLookup(
is_resource_fetch, *request_url, options, async_fetch,
&page_callback_added));
if (options != NULL) {
server_context_->ComputeSignature(options);
LogRecord* log_record = async_fetch->request_context()->log_record();
{
ScopedMutex lock(log_record->mutex());
log_record->logging_info()->set_options_signature_hash(
server_context_->contents_hasher()->HashToUint64(
options->signature()));
}
}
if (is_blink_request && apply_blink_critical_line && page_callback_added) {
// In blink flow, we have to modify RewriteOptions after the
// property cache read is completed. Hence, we clear the signature to
// unfreeze RewriteOptions, which was frozen during signature computation
// for generating key for property cache read.
// Warning: Please note that using this method is extremely risky and
// should be avoided as much as possible. If you are planning to use
// this, please discuss this with your team-mates and ensure that you
// clearly understand its implications. Also, please do repeat this
// warning at every place you use this method.
options->ClearSignatureWithCaution();
// TODO(rahulbansal): Remove this LOG once we expect to have
// a lot of such requests.
LOG(INFO) << "Triggering Blink flow critical line for url "
<< url_string;
blink_critical_line_requests_->IncBy(1);
BlinkFlowCriticalLine::Start(url_string, async_fetch, options,
proxy_fetch_factory_.get(),
server_context_,
property_callback.release());
} else {
RewriteDriver* driver = NULL;
RequestContextPtr request_ctx = async_fetch->request_context();
DCHECK(request_ctx.get() != NULL) << "Async fetch must have a request"
<< "context but does not.";
if (options == NULL) {
driver = server_context_->NewRewriteDriver(request_ctx);
} else {
// NewCustomRewriteDriver takes ownership of custom_options_.
driver = server_context_->NewCustomRewriteDriver(options, request_ctx);
}
// TODO(mmohabey): Remove duplicate setting of user agent and
// request headers for different flows.
if (user_agent != NULL) {
VLOG(1) << "Setting user-agent to " << user_agent;
driver->SetUserAgent(user_agent);
} else {
VLOG(1) << "User-agent empty";
}
driver->set_request_headers(async_fetch->request_headers());
// TODO(mmohabey): Factor out the below checks so that they are not
// repeated in BlinkUtil::IsBlinkRequest().
if (driver->options() != NULL && driver->options()->enabled() &&
property_callback != NULL &&
driver->options()->IsAllowed(url_string)) {
bool is_cache_html_request = BlinkUtil::IsBlinkRequest(
*request_url, async_fetch, driver->options(),
driver->user_agent().c_str(),
server_context_->user_agent_matcher(), RewriteOptions::kCacheHtml);
if (is_cache_html_request) {
CacheHtmlFlow::Start(url_string, async_fetch, driver,
proxy_fetch_factory_.get(),
property_callback.release());
return;
}
if (driver->SupportsFlushEarly()) {
FlushEarlyFlow::Start(url_string, &async_fetch, driver,
proxy_fetch_factory_.get(),
property_callback.get());
}
}
proxy_fetch_factory_->StartNewProxyFetch(
url_string, async_fetch, driver, property_callback.release(), NULL);
}
}
if (property_callback.get() != NULL) {
// If management of the callback was not transferred to proxy fetch,
// then we must detach it so it deletes itself when complete.
property_callback.release()->Detach(HttpStatus::kUnknownStatusCode);
}
}
} // namespace net_instaweb