blob: 878d91d24fbe94f582913854292f6050f6a33aa8 [file] [log] [blame]
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jmarantz@google.com (Joshua Marantz)
// lsong@google.com (Libo Song)
#include "pagespeed/apache/instaweb_context.h"
#include "base/logging.h"
#include "pagespeed/apache/apache_server_context.h"
#include "pagespeed/apache/header_util.h"
#include "pagespeed/apache/mod_instaweb.h"
#include "net/instaweb/http/public/request_context.h"
#include "net/instaweb/rewriter/public/experiment_matcher.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "net/instaweb/util/public/property_cache.h"
#include "pagespeed/kernel/base/atomic_bool.h"
#include "pagespeed/kernel/base/stack_buffer.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/thread_system.h"
#include "pagespeed/kernel/base/timer.h"
#include "pagespeed/kernel/http/content_type.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/http_names.h"
#include "pagespeed/kernel/http/query_params.h"
#include "pagespeed/kernel/http/request_headers.h"
#include "pagespeed/kernel/http/response_headers.h"
#include "pagespeed/kernel/http/user_agent_matcher.h"
#include "pagespeed/kernel/util/gzip_inflater.h"
#include "apr_strings.h"
#include "http_config.h"
#include "http_core.h"
namespace net_instaweb {
// Number of times to go down the request->prev chain looking for an
// absolute url.
const int kRequestChainLimit = 5;
namespace {
// Tracks a single property-cache lookup.
class PropertyCallback : public PropertyPage {
public:
PropertyCallback(const StringPiece& url,
const StringPiece& options_signature_hash,
UserAgentMatcher::DeviceType device_type,
RewriteDriver* driver,
ThreadSystem* thread_system)
: PropertyPage(PropertyPage::kPropertyCachePage,
url,
options_signature_hash,
UserAgentMatcher::DeviceTypeSuffix(device_type),
driver->request_context(),
thread_system->NewMutex(),
driver->server_context()->page_property_cache()),
driver_(driver) {
}
bool done() const { return done_.value(); }
protected:
virtual void Done(bool success) {
driver_->set_property_page(this);
done_.set_value(true);
}
private:
RewriteDriver* driver_;
AtomicBool done_;
DISALLOW_COPY_AND_ASSIGN(PropertyCallback);
};
} // namespace
InstawebContext::InstawebContext(request_rec* request,
RequestHeaders* request_headers,
const ContentType& content_type,
ApacheServerContext* server_context,
const GoogleString& absolute_url,
const RequestContextPtr& request_context,
const QueryParams& pagespeed_query_params,
const QueryParams& pagespeed_option_cookies,
bool use_custom_options,
const RewriteOptions& options)
: content_encoding_(kNone),
content_type_(content_type),
server_context_(server_context),
string_writer_(&output_),
absolute_url_(absolute_url),
request_headers_(request_headers),
started_parse_(false),
sent_headers_(false),
populated_headers_(false) {
if (options.running_experiment()) {
// The experiment framework requires custom options because it has to make
// changes based on what ExperimentSpec the user should be seeing.
use_custom_options = true;
}
if (use_custom_options) {
// TODO(jmarantz): this is a temporary hack until we sort out better
// memory management of RewriteOptions. This will drag on performance.
// We need to do this because we are changing RewriteDriver to keep
// a reference to its options throughout its lifetime to refer to the
// domain lawyer and other options.
RewriteOptions* custom_options = options.Clone();
// If we're running an experiment, determine the state of this request and
// reset the options accordingly.
if (custom_options->running_experiment()) {
SetExperimentStateAndCookie(request, custom_options);
}
server_context_->ComputeSignature(custom_options);
rewrite_driver_ = server_context_->NewCustomRewriteDriver(
custom_options, request_context);
} else {
rewrite_driver_ = server_context_->NewRewriteDriver(request_context);
}
// Set or clear sticky option cookies as appropriate.
rewrite_driver_->set_pagespeed_query_params(
pagespeed_query_params.ToEscapedString());
rewrite_driver_->set_pagespeed_option_cookies(
pagespeed_option_cookies.ToEscapedString());
GoogleUrl gurl(absolute_url_);
// Temporary headers for our cookies.
ResponseHeaders resp_headers(
rewrite_driver_->options()->ComputeHttpOptions());
if (rewrite_driver_->SetOrClearPageSpeedOptionCookies(gurl, &resp_headers)) {
// TODO(matterbury): Rationalize how we add/update response headers.
// This context has a response_headers_ member that's barely used, although
// it is used by the related RewriteDriver; should we just add these to that
// and rely on them being converted to Apache headers later?
ResponseHeadersToApacheRequest(resp_headers, request);
}
BlockingPropertyCacheLookup();
rewrite_driver_->EnableBlockingRewrite(request_headers);
ComputeContentEncoding(request);
apr_pool_cleanup_register(request->pool,
this, apache_cleanup<InstawebContext>,
apr_pool_cleanup_null);
bucket_brigade_ = apr_brigade_create(request->pool,
request->connection->bucket_alloc);
if (content_encoding_ == kGzip || content_encoding_ == kDeflate) {
// TODO(jmarantz): consider keeping a pool of these if they are expensive
// to initialize.
if (content_encoding_ == kGzip) {
inflater_.reset(new GzipInflater(GzipInflater::kGzip));
} else {
inflater_.reset(new GzipInflater(GzipInflater::kDeflate));
}
inflater_->Init();
}
// Make the entire request headers available to filters.
rewrite_driver_->SetRequestHeaders(*request_headers_.get());
response_headers_.reset(
new ResponseHeaders(rewrite_driver_->options()->ComputeHttpOptions()));
rewrite_driver_->set_response_headers_ptr(response_headers_.get());
// TODO(lsong): Bypass the string buffer, write data directly to the next
// apache bucket.
rewrite_driver_->SetWriter(&string_writer_);
}
InstawebContext::~InstawebContext() {
}
void InstawebContext::Rewrite(const char* input, int size) {
if (inflater_.get() != NULL) {
char buf[kStackBufferSize];
inflater_->SetInput(input, size);
while (inflater_->HasUnconsumedInput()) {
int num_inflated_bytes = inflater_->InflateBytes(buf, kStackBufferSize);
DCHECK_LE(0, num_inflated_bytes) << "Corrupted zip inflation";
if (num_inflated_bytes > 0) {
ProcessBytes(buf, num_inflated_bytes);
}
}
} else {
DCHECK_LE(0, size) << "negatively sized buffer passed from apache";
if (size > 0) {
ProcessBytes(input, size);
}
}
}
void InstawebContext::Flush() {
if (html_detector_.already_decided() && started_parse_) {
rewrite_driver_->Flush();
}
}
void InstawebContext::Finish() {
if (!html_detector_.already_decided()) {
// We couldn't determine whether this is HTML or not till the very end,
// so serve it unmodified.
html_detector_.ReleaseBuffered(&output_);
}
if (started_parse_) {
rewrite_driver_->FinishParse();
} else {
rewrite_driver_->Cleanup();
}
}
void InstawebContext::PopulateHeaders(request_rec* request) {
if (!populated_headers_) {
ApacheRequestToResponseHeaders(*request, response_headers_.get(), NULL);
populated_headers_ = true;
}
}
void InstawebContext::ProcessBytes(const char* input, int size) {
CHECK_LT(0, size);
if (!html_detector_.already_decided()) {
if (html_detector_.ConsiderInput(StringPiece(input, size))) {
if (html_detector_.probable_html()) {
// Note that we use started_parse_ and not probable_html()
// in all other spots as an error fallback.
started_parse_ = rewrite_driver_->StartParseWithType(absolute_url_,
content_type_);
}
// If we buffered up any bytes in previous calls, make sure to
// release them.
GoogleString buffer;
html_detector_.ReleaseBuffered(&buffer);
if (!buffer.empty()) {
// Recurse on initial buffer of whitespace before processing
// this call's input below.
ProcessBytes(buffer.data(), buffer.size());
}
}
}
// Either as effect of above or initially at entry.
if (html_detector_.already_decided()) {
if (started_parse_) {
rewrite_driver_->ParseText(input, size);
} else {
// Looks like something that's not HTML. Send it directly to the
// output buffer.
output_.append(input, size);
}
}
}
void InstawebContext::ComputeContentEncoding(request_rec* request) {
// Check if the content is gzipped. Steal from mod_deflate.
const char* encoding = apr_table_get(request->headers_out,
HttpAttributes::kContentEncoding);
if (encoding != NULL) {
const char* err_enc = apr_table_get(request->err_headers_out,
HttpAttributes::kContentEncoding);
if (err_enc != NULL) {
// We don't properly handle stacked encodings now.
content_encoding_ = kOther;
}
} else {
encoding = apr_table_get(request->err_headers_out,
HttpAttributes::kContentEncoding);
}
if (encoding != NULL) {
if (StringCaseEqual(encoding, HttpAttributes::kGzip)) {
content_encoding_ = kGzip;
} else if (StringCaseEqual(encoding, HttpAttributes::kDeflate)) {
content_encoding_ = kDeflate;
} else {
content_encoding_ = kOther;
}
}
}
void InstawebContext::BlockingPropertyCacheLookup() {
PropertyCallback* property_callback = NULL;
if (server_context_->page_property_cache()->enabled()) {
const UserAgentMatcher* user_agent_matcher =
server_context_->user_agent_matcher();
UserAgentMatcher::DeviceType device_type =
user_agent_matcher->GetDeviceTypeForUA(rewrite_driver_->user_agent());
GoogleString options_signature_hash =
server_context_->GetRewriteOptionsSignatureHash(
rewrite_driver_->options());
property_callback = new PropertyCallback(
absolute_url_,
options_signature_hash,
device_type,
rewrite_driver_,
server_context_->thread_system());
server_context_->page_property_cache()->Read(property_callback);
DCHECK(property_callback->done());
}
}
ApacheServerContext* InstawebContext::ServerContextFromServerRec(
server_rec* server) {
return static_cast<ApacheServerContext*>
ap_get_module_config(server->module_config, &pagespeed_module);
}
// This function stores the request uri on the first call, and then
// uses that value for all future calls. This should prevent the url
// from changing due to changes to the reqeust from other modules.
// In some code paths, a new request is made that throws away the old
// url. Therefore, if we have not yet stored the url, check to see if
// there was a previous request in this chain, and use its url as the
// original.
const char* InstawebContext::MakeRequestUrl(
const RewriteOptions& global_options, request_rec* request) {
const char* url = apr_table_get(request->notes, kPagespeedOriginalUrl);
if (url == NULL) {
// Go down the prev chain to see if there this request was a rewrite
// from another one. We want to store the uri the user passed in,
// not what we re-wrote it to. We should not iterate down this
// chain more than once (MakeRequestUrl will already have been
// called for request->prev, before this request is created).
// However, max out at 5 iterations, just in case.
request_rec *prev = request->prev;
for (int i = 0; (url == NULL) && (prev != NULL) && (i < kRequestChainLimit);
++i, prev = prev->prev) {
url = apr_table_get(prev->notes, kPagespeedOriginalUrl);
}
// Chase 'main' chain as well, clamping at kRequestChainLimit loops.
// This will eliminate spurious 'index.html' noise we've seen from
// slurps. See 'make apache_debug_slurp_test' -- the attempt to
// slurp 'www.example.com'. The reason this is necessary is that
// mod_dir.c's fixup_dir() calls ap_internal_fast_redirect in
// http_request.c, which mutates the original requests's uri fields,
// leaving little trace of the url we actually need to resolve. Also
// note that http_request.c:ap_internal_fast_redirect 'overlays'
// the source r.notes onto the dest r.notes, which in this case would
// work against us if we don't first propagate the OriginalUrl.
request_rec *main = request->main;
for (int i = 0; (url == NULL) && (main != NULL) && (i < kRequestChainLimit);
++i, main = main->main) {
url = apr_table_get(main->notes, kPagespeedOriginalUrl);
}
// ap_construct_url() only works when request->unparsed_uri is relative.
// But sometimes (with mod_proxy/slurping) we see request->unparsed_uri
// as an absolute URL. So we check if request->unparsed_uri is already
// an absolute URL first. If so, use it as-is, otherwise ap_construct_url().
if (url == NULL) {
if (request->unparsed_uri == NULL) {
LOG(DFATAL) <<
"build_context_for_request should verify unparsed_uri is non-null.";
return NULL;
}
GoogleUrl gurl(request->unparsed_uri);
if (gurl.IsAnyValid()) {
url = apr_pstrdup(request->pool, request->unparsed_uri);
} else {
url = ap_construct_url(request->pool, request->unparsed_uri, request);
}
}
// Fix URL based on X-Forwarded-Proto.
// http://code.google.com/p/modpagespeed/issues/detail?id=546
// For example, if Apache gives us the URL "http://www.example.com/"
// and there is a header: "X-Forwarded-Proto: https", then we update
// this base URL to "https://www.example.com/".
if (global_options.respect_x_forwarded_proto()) {
const char* x_forwarded_proto =
apr_table_get(request->headers_in, HttpAttributes::kXForwardedProto);
if (x_forwarded_proto != NULL) {
if (StringCaseEqual(x_forwarded_proto, "http") ||
StringCaseEqual(x_forwarded_proto, "https")) {
StringPiece url_sp(url);
StringPiece::size_type colon_pos = url_sp.find(":");
if (colon_pos != StringPiece::npos) {
// Replace URL protocol with that specified in X-Forwarded-Proto.
GoogleString new_url =
StrCat(x_forwarded_proto, url_sp.substr(colon_pos));
url = apr_pstrdup(request->pool, new_url.c_str());
}
} else {
LOG(WARNING) << "Unsupported X-Forwarded-Proto: " << x_forwarded_proto
<< " for URL " << url << " protocol not changed.";
}
}
}
// Note: apr_table_setn does not take ownership of url, it is owned by
// the Apache pool.
apr_table_setn(request->notes, kPagespeedOriginalUrl, url);
}
return url;
}
void InstawebContext::SetExperimentStateAndCookie(request_rec* request,
RewriteOptions* options) {
// If we didn't get a valid (i.e. currently-running experiment) value from
// the cookie, determine which experiment this request should end up in
// and set the cookie accordingly.
bool need_cookie =
server_context_->experiment_matcher()->ClassifyIntoExperiment(
*request_headers_, *server_context_->user_agent_matcher(), options);
if (need_cookie) {
ResponseHeaders resp_headers(options->ComputeHttpOptions());
const char* url = apr_table_get(request->notes, kPagespeedOriginalUrl);
int experiment_value = options->experiment_id();
server_context_->experiment_matcher()->StoreExperimentData(
experiment_value, url,
(server_context_->timer()->NowMs() +
options->experiment_cookie_duration_ms()),
&resp_headers);
ResponseHeadersToApacheRequest(resp_headers, request);
}
}
} // namespace net_instaweb