// Source blob: 21d23c52b2557236b997fd1bcf98975c4452a934
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
#include "pagespeed/automatic/proxy_interface.h"
#include "base/callback.h"
#include "base/logging.h"
#include "net/instaweb/config/rewrite_options_manager.h"
#include "net/instaweb/http/public/async_fetch.h"
#include "net/instaweb/http/public/log_record.h"
#include "net/instaweb/http/public/logging_proto_impl.h"
#include "net/instaweb/http/public/request_context.h"
#include "net/instaweb/http/public/request_timing_info.h"
#include "net/instaweb/rewriter/public/blink_util.h"
#include "net/instaweb/rewriter/public/experiment_matcher.h"
#include "net/instaweb/rewriter/public/resource_fetch.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/rewrite_query.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/automatic/cache_html_flow.h"
#include "pagespeed/automatic/flush_early_flow.h"
#include "pagespeed/automatic/proxy_fetch.h"
#include "pagespeed/kernel/base/abstract_mutex.h"
#include "pagespeed/kernel/base/hasher.h"
#include "pagespeed/kernel/base/hostname_util.h"
#include "pagespeed/kernel/base/ref_counted_ptr.h"
#include "pagespeed/kernel/base/scoped_ptr.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/content_type.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/http_names.h"
#include "pagespeed/kernel/http/query_params.h"
#include "pagespeed/kernel/http/request_headers.h"
#include "pagespeed/kernel/http/response_headers.h"
namespace net_instaweb {
class MessageHandler;
// Name of the timed statistics variable that is incremented each time a
// request is handed to CacheHtmlFlow (see GetRewriteOptionsDone). Defined as a
// class member, rather than in the anonymous namespace below, so that it is
// visible outside this translation unit.
const char ProxyInterface::kCacheHtmlRequestCount[] =
"cache-html-requests";
namespace {
// Names for Statistics variables. Each one is registered in
// ProxyInterface::InitStats() and looked up in the ProxyInterface constructor.
// Counts every call to Fetch().
const char kTotalRequestCount[] = "all-requests";
// Counts GET/HEAD requests that are served as .pagespeed. resources.
const char kPagespeedRequestCount[] = "pagespeed-requests";
// Counts requests that the rewrite options declined.
const char kRejectedRequestCount[] = "publisher-rejected-requests";
// Not a statistics name: this is the response body written to the client when
// a request is declined.
const char kRejectedRequestHtmlResponse[] = "Unable to serve "
"content as the content is blocked by the administrator of the domain.";
// Counts requests for which no domain-specific RewriteOptions were supplied.
const char kNoDomainConfigRequestCount[] = "without-domain-config-requests";
// Subset of the above that were resource fetches.
const char kNoDomainConfigResourceRequestCount[] =
"without-domain-config-resource-requests";
} // namespace
// Per-request state carried from ProxyRequest() to GetRewriteOptionsDone()
// across the asynchronous RewriteOptions lookup. An instance is heap-allocated
// in ProxyRequest() and deleted by GetRewriteOptionsDone().
struct ProxyInterface::RequestData {
  // Give the plain-old-data members a well-defined initial value so that an
  // instance created with "new RequestData" never holds indeterminate
  // pointers or flags, even if a field is not assigned before first use.
  RequestData()
      : is_resource_fetch(false),
        async_fetch(NULL),
        handler(NULL) {
  }

  // True when the request is for a .pagespeed. resource rather than a page
  // to be proxied.
  bool is_resource_fetch;
  // Private copy of the requested URL; owned by this struct.
  scoped_ptr<GoogleUrl> request_url;
  // Fetch on which the response is delivered. Not owned.
  AsyncFetch* async_fetch;
  // Handler passed to AsyncFetch::Write() for error responses. Not owned.
  MessageHandler* handler;
};
// Builds a proxy front-end for the server identified by |hostname|:|port|.
//
// |server_context| is stored and also used to create the ProxyFetchFactory.
// |stats| must already contain every timed variable registered by
// InitStats(); the constructor only looks them up.
ProxyInterface::ProxyInterface(const StringPiece& hostname, int port,
                               ServerContext* server_context,
                               Statistics* stats)
    : server_context_(server_context),
      hostname_(hostname.as_string()),
      port_(port),
      all_requests_(stats->GetTimedVariable(kTotalRequestCount)),
      pagespeed_requests_(stats->GetTimedVariable(kPagespeedRequestCount)),
      cache_html_flow_requests_(
          stats->GetTimedVariable(kCacheHtmlRequestCount)),
      rejected_requests_(stats->GetTimedVariable(kRejectedRequestCount)),
      requests_without_domain_config_(
          stats->GetTimedVariable(kNoDomainConfigRequestCount)),
      resource_requests_without_domain_config_(
          stats->GetTimedVariable(kNoDomainConfigResourceRequestCount)) {
  // The factory is created here rather than in the initializer list so that
  // it does not depend on the declaration order of the members.
  proxy_fetch_factory_.reset(new ProxyFetchFactory(server_context));
}
// Nothing to release explicitly: the body is empty and cleanup is left to the
// members' own destructors.
ProxyInterface::~ProxyInterface() {}
// Registers every timed variable that the ProxyInterface constructor later
// looks up, then lets the flows this class can start register their own
// statistics. Each variable is registered exactly once, in
// ServerContext::kStatisticsGroup.
void ProxyInterface::InitStats(Statistics* statistics) {
  statistics->AddTimedVariable(kTotalRequestCount,
                               ServerContext::kStatisticsGroup);
  statistics->AddTimedVariable(kPagespeedRequestCount,
                               ServerContext::kStatisticsGroup);
  statistics->AddTimedVariable(kCacheHtmlRequestCount,
                               ServerContext::kStatisticsGroup);
  statistics->AddTimedVariable(kRejectedRequestCount,
                               ServerContext::kStatisticsGroup);
  statistics->AddTimedVariable(kNoDomainConfigRequestCount,
                               ServerContext::kStatisticsGroup);
  statistics->AddTimedVariable(kNoDomainConfigResourceRequestCount,
                               ServerContext::kStatisticsGroup);
  CacheHtmlFlow::InitStats(statistics);
  FlushEarlyFlow::InitStats(statistics);
}
bool ProxyInterface::IsWellFormedUrl(const GoogleUrl& url) {
bool ret = false;
if (url.IsWebValid()) {
if (url.has_path()) {
StringPiece path = url.PathAndLeaf();
GoogleString filename = url.ExtractFileName();
int path_len = path.size() - filename.size();
if (path_len >= 0) {
ret = true;
}
} else {
LOG(ERROR) << "URL has no path: " << url.Spec();
}
}
return ret;
}
bool ProxyInterface::UrlAndPortMatchThisServer(const GoogleUrl& url) {
bool ret = false;
if (url.IsWebValid() && (url.EffectiveIntPort() == port_)) {
// TODO(atulvasu): This should support matching the actual host this
// machine can receive requests from. Ideally some flag control would
// help. For example this server could be running multiple virtual
// servers, and we would like to know what server we are catering to for
// pagespeed only queries.
//
// Allow for exact hostname matches, as well as a URL typed into the
// browser window like "exeda.cam", which should match
// "exeda.cam.corp.google.com".
StringPiece host = url.Host();
if (IsLocalhost(host, hostname_) ||
StringPiece(hostname_).starts_with(StrCat(host, "."))) {
ret = true;
}
}
return ret;
}
// Entry point for every incoming request.
//
// Dispatches |requested_url_string| to one of four outcomes:
//   1. Invalid or malformed URL: 404, fetch completes with failure.
//   2. GET/HEAD for a .pagespeed. resource: proxied as a resource fetch.
//   3. URL addressed to this server itself: 404, fetch completes with failure.
//   4. Anything else: proxied normally (rewriting if it is HTML).
// The response is delivered through |async_fetch|; |handler| is forwarded to
// ProxyRequest().
void ProxyInterface::Fetch(const GoogleString& requested_url_string,
                           MessageHandler* handler,
                           AsyncFetch* async_fetch) {
  GoogleUrl requested_url(requested_url_string);
  const bool is_get_or_head =
      (async_fetch->request_headers()->method() == RequestHeaders::kGet) ||
      (async_fetch->request_headers()->method() == RequestHeaders::kHead);
  all_requests_->IncBy(1);

  if (!requested_url.IsWebValid() || !IsWellFormedUrl(requested_url)) {
    LOG(WARNING) << "Bad URL, failing request: " << requested_url_string;
    async_fetch->response_headers()->SetStatusAndReason(HttpStatus::kNotFound);
    async_fetch->Done(false);
    return;
  }

  // Try to handle this as a .pagespeed. resource.
  if (is_get_or_head && server_context_->IsPagespeedResource(requested_url)) {
    pagespeed_requests_->IncBy(1);
    LOG(INFO) << "Serving URL as pagespeed resource: "
              << requested_url.Spec();
    ProxyRequest(true, requested_url, async_fetch, handler);
    return;
  }

  if (UrlAndPortMatchThisServer(requested_url)) {
    // Just respond with a 404 for now.
    async_fetch->response_headers()->SetStatusAndReason(
        HttpStatus::kNotFound);
    LOG(INFO) << "Returning 404 for URL: " << requested_url.Spec();
    async_fetch->Done(false);
    return;
  }

  // Otherwise we proxy it (rewriting if it is HTML).
  LOG(INFO) << "Proxying URL normally: " << requested_url.Spec();
  ProxyRequest(false, requested_url, async_fetch, handler);
}
void ProxyInterface::ProxyRequest(bool is_resource_fetch,
const GoogleUrl& request_url,
AsyncFetch* async_fetch,
MessageHandler* handler) {
RequestData* request_data = new RequestData;
request_data->is_resource_fetch = is_resource_fetch;
request_data->request_url.reset(new GoogleUrl);
request_data->request_url->Reset(request_url);
request_data->async_fetch = async_fetch;
request_data->handler = handler;
server_context_->rewrite_options_manager()->GetRewriteOptions(
request_url,
*async_fetch->request_headers(),
NewCallback(this, &ProxyInterface::GetRewriteOptionsDone, request_data));
}
// Forwards to ProxyFetchFactory::InitiatePropertyCacheLookup(), supplying this
// object's ServerContext, and returns the collector it produces.
ProxyFetchPropertyCallbackCollector*
ProxyInterface::InitiatePropertyCacheLookup(
    bool is_resource_fetch,
    const GoogleUrl& request_url,
    RewriteOptions* options,
    AsyncFetch* async_fetch,
    const bool requires_blink_cohort) {
  ProxyFetchPropertyCallbackCollector* collector =
      ProxyFetchFactory::InitiatePropertyCacheLookup(
          is_resource_fetch, request_url, server_context_, options,
          async_fetch, requires_blink_cohort);
  return collector;
}
// Callback invoked once the RewriteOptions lookup started by ProxyRequest()
// has finished. Continues processing of the request described by
// |request_data|.
//
// Takes ownership of |request_data| (always deleted on return) and of
// |domain_options|, which may be NULL when no domain-specific configuration
// exists.
//
// Outline:
//   1. Record statistics for requests without a domain configuration.
//   2. Parse PageSpeed query params/headers/cookies; on failure respond with
//      HttpStatus::kMethodNotAllowed and stop.
//   3. Merge domain and query options into |options| (may be NULL).
//   4. If the options decline the request, respond with
//      HttpStatus::kProxyDeclinedRequest and stop.
//   5. Resource fetches go to ResourceFetch::Start(); everything else gets a
//      RewriteDriver and is handed to CacheHtmlFlow, or to ProxyFetch
//      (optionally preceded by FlushEarlyFlow).
void ProxyInterface::GetRewriteOptionsDone(RequestData* request_data,
RewriteOptions* domain_options) {
// Both scoped pointers ensure cleanup on every early return below.
scoped_ptr<RequestData> request_data_deleter(request_data);
scoped_ptr<RewriteOptions> scoped_domain_options(domain_options);
bool is_resource_fetch = request_data->is_resource_fetch;
GoogleUrl* request_url = request_data->request_url.get();
AsyncFetch* async_fetch = request_data->async_fetch;
MessageHandler* handler = request_data->handler;
// Track how many requests (and resource requests) had no domain config.
if (domain_options == NULL) {
requests_without_domain_config_->IncBy(1);
if (is_resource_fetch) {
resource_requests_without_domain_config_->IncBy(1);
}
}
// Parse the query options, headers, and cookies.
RewriteQuery query;
if (!server_context_->GetQueryOptions(async_fetch->request_context(),
domain_options, request_url,
async_fetch->request_headers(),
NULL /* response_headers */, &query)) {
// Invalid PageSpeed options: fail the request. domain_options is still
// owned by scoped_domain_options here and is freed on return.
async_fetch->response_headers()->SetStatusAndReason(
HttpStatus::kMethodNotAllowed);
async_fetch->Write("Invalid PageSpeed query-params/request headers",
handler);
async_fetch->Done(false);
return;
}
// Ownership of the domain options and of the query options is released to
// GetCustomOptions; from here on this function is responsible for |options|
// (which may be NULL) until it is handed to another owner.
RewriteOptions* options = server_context_->GetCustomOptions(
async_fetch->request_headers(), scoped_domain_options.release(),
query.ReleaseOptions());
GoogleString url_string;
request_url->Spec().CopyToString(&url_string);
RequestHeaders* request_headers = async_fetch->request_headers();
if (options != NULL &&
options->IsRequestDeclined(url_string, request_headers)) {
// The configuration declines this request: send an uncacheable text
// response explaining why, and free the options ourselves since no driver
// will take them over.
rejected_requests_->IncBy(1);
ResponseHeaders* response_headers = async_fetch->response_headers();
response_headers->SetStatusAndReason(HttpStatus::kProxyDeclinedRequest);
response_headers->Replace(HttpAttributes::kContentType,
kContentTypeText.mime_type());
response_headers->Replace(HttpAttributes::kCacheControl,
"private, max-age=0");
async_fetch->Write(kRejectedRequestHtmlResponse, handler);
async_fetch->Done(false);
delete options;
return;
}
// ScanSplitHtmlRequest receives url_string by address; when it returns true
// the request URL is re-parsed from the (presumably rewritten) string.
if (ServerContext::ScanSplitHtmlRequest(
async_fetch->request_context(), options, &url_string)) {
request_url->Reset(url_string);
}
// Update request_headers.
// We deal with encodings. So strip the user's Accept-Encoding headers.
async_fetch->request_headers()->RemoveAll(HttpAttributes::kAcceptEncoding);
// Note: We preserve the User-Agent and Cookies so that the origin servers
// send us the correct HTML. We will need to consider this for caching HTML.
async_fetch->request_context()->mutable_timing_info()->ProcessingStarted();
AbstractLogRecord* log_record = async_fetch->request_context()->log_record();
{
// The log record is updated under its own mutex.
ScopedMutex lock(log_record->mutex());
log_record->logging_info()->set_is_pagespeed_resource(is_resource_fetch);
}
// Start fetch and rewrite. If GetCustomOptions found options for us,
// the RewriteDriver created by StartNewProxyFetch will take ownership.
if (is_resource_fetch) {
// TODO(sligocki): Set using_spdy appropriately.
bool using_spdy = false;
// TODO(pulkitg): Set is_original_resource_cacheable to false if pagespeed
// resource is not cacheable.
// Fall back to the global options only for computing the HTTP options;
// ResourceFetch::Start still receives the possibly-NULL |options|.
const RewriteOptions* these_options =
(options == NULL ? server_context_->global_options() : options);
// TODO(sligocki): Should we be setting default options and then overriding
// here? It seems like it would be better to only set once, but that
// involves a lot of complicated code changes.
async_fetch->request_context()->ResetOptions(
these_options->ComputeHttpOptions());
ResourceFetch::Start(*request_url, options, using_spdy,
server_context_, async_fetch);
} else {
// TODO(nforman): If we are not running an experiment, remove the
// experiment cookie.
// If we don't already have custom options, and the global options say we're
// running an experiment, then clone them into custom_options so we can
// manipulate custom options without affecting the global options.
if (options == NULL) {
RewriteOptions* global_options = server_context_->global_options();
if (global_options->running_experiment()) {
options = global_options->Clone();
}
}
// TODO(anupama): Adapt the experiment logic below for the FlushEarlyFlow as
// well.
bool need_to_store_experiment_data = false;
if (options != NULL && options->running_experiment()) {
need_to_store_experiment_data =
server_context_->experiment_matcher()->ClassifyIntoExperiment(
*async_fetch->request_headers(),
*server_context_->user_agent_matcher(), options);
options->set_need_to_store_experiment_data(need_to_store_experiment_data);
}
const char* user_agent = async_fetch->request_headers()->Lookup1(
HttpAttributes::kUserAgent);
// Whether it's a cache html request should not change despite the fact
// a new driver is created later on.
const bool is_cache_html_request = BlinkUtil::IsBlinkRequest(
*request_url, async_fetch, options,
user_agent, server_context_,
RewriteOptions::kCachePartialHtml);
// Only start a property cache lookup when rewriting can actually happen:
// either there are no custom options, or they are enabled and allow the URL.
ProxyFetchPropertyCallbackCollector* property_callback = NULL;
if (options == NULL ||
(options->enabled() && options->IsAllowed(request_url->Spec()))) {
// Ownership of "property_callback" is eventually assumed by either
// CacheHtmlFlow or ProxyFetch.
property_callback = InitiatePropertyCacheLookup(is_resource_fetch,
*request_url,
options,
async_fetch,
is_cache_html_request);
}
if (options != NULL) {
// Compute the options signature and record its hash in the log record.
server_context_->ComputeSignature(options);
{
ScopedMutex lock(log_record->mutex());
log_record->logging_info()->set_options_signature_hash(
server_context_->contents_hasher()->HashToUint64(
options->signature()));
}
}
RewriteDriver* driver = NULL;
RequestContextPtr request_ctx = async_fetch->request_context();
// NOTE(review): the two string pieces below are streamed back to back with
// no separating space, so the message reads "...a requestcontext but...".
DCHECK(request_ctx.get() != NULL) << "Async fetch must have a request"
<< "context but does not.";
if (options == NULL) {
driver = server_context_->NewRewriteDriver(request_ctx);
} else {
// NewCustomRewriteDriver takes ownership of options.
driver = server_context_->NewCustomRewriteDriver(options, request_ctx);
}
// TODO(sligocki): Should we be setting default options and then overriding
// here? It seems like it would be better to only set once, but that
// involves a lot of complicated code changes.
request_ctx->ResetOptions(driver->options()->ComputeHttpOptions());
driver->SetRequestHeaders(*async_fetch->request_headers());
// TODO(mmohabey): Factor out the below checks so that they are not
// repeated in BlinkUtil::IsBlinkRequest().
// Copy over any PageSpeed query parameters so we can re-add them if we
// receive a redirection response to our fetch request.
driver->set_pagespeed_query_params(
query.pagespeed_query_params().ToEscapedString());
// Copy over any PageSpeed cookies so we know which ones to clear in
// ProxyFetch::HandleHeadersComplete().
driver->set_pagespeed_option_cookies(
query.pagespeed_option_cookies().ToEscapedString());
// The special flows below are attempted only when rewriting is enabled for
// this URL and a property cache lookup was started above.
if (driver->options() != NULL && driver->options()->enabled() &&
property_callback != NULL &&
driver->options()->IsAllowed(url_string)) {
if (is_cache_html_request) {
cache_html_flow_requests_->IncBy(1);
CacheHtmlFlow::Start(url_string,
async_fetch,
driver,
proxy_fetch_factory_.get(),
// Takes ownership of property_callback.
property_callback);
return;
}
// NOTE: The FlushEarly flow will run in parallel with the ProxyFetch,
// but will not begin (FlushEarlyFlow::FlushEarly) until the
// PropertyCache lookup has completed.
// Also it does NOT take ownership of property_callback.
// FlushEarlyFlow might not start if the request is not GET or if the
// useragent is unsupported etc.
// async_fetch is passed by address, so TryStart can presumably substitute
// a different fetch for the ProxyFetch below -- verify in FlushEarlyFlow.
FlushEarlyFlow::TryStart(url_string, &async_fetch, driver,
proxy_fetch_factory_.get(),
property_callback);
}
// Takes ownership of property_callback.
proxy_fetch_factory_->StartNewProxyFetch(
url_string, async_fetch, driver, property_callback, NULL);
}
}
} // namespace net_instaweb