blob: 05d28c0b590474d94cf5f51ad770fb40eeff9b3a [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "net/instaweb/rewriter/public/cache_extender.h"
#include <memory>
#include "base/logging.h"
#include "net/instaweb/http/public/http_cache.h"
#include "net/instaweb/http/public/log_record.h"
#include "net/instaweb/rewriter/cached_result.pb.h"
#include "net/instaweb/rewriter/public/domain_lawyer.h"
#include "net/instaweb/rewriter/public/javascript_code_block.h"
#include "net/instaweb/rewriter/public/output_resource.h"
#include "net/instaweb/rewriter/public/output_resource_kind.h"
#include "net/instaweb/rewriter/public/resource.h"
#include "net/instaweb/rewriter/public/resource_slot.h"
#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "net/instaweb/rewriter/public/single_rewrite_context.h"
#include "net/instaweb/rewriter/public/url_namer.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/statistics.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/string_writer.h"
#include "pagespeed/kernel/base/timer.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/http/content_type.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/response_headers.h"
#include "pagespeed/kernel/http/semantic_type.h"
#include "pagespeed/opt/logging/enums.pb.h"
namespace net_instaweb {
// Forward declarations; both types are only handled through pointers below.
class RewriteContext;
class MessageHandler;

// Statistics variable names, registered in InitStats() and looked up in the
// CacheExtender constructor.
const char CacheExtender::kCacheExtensions[] = "cache_extensions";
const char CacheExtender::kNotCacheable[] = "not_cacheable";

// Freshness threshold used by ShouldRewriteResource(): a resource that is
// already cacheable for a month or longer does not need a longer lifetime.
const int64 kMinThresholdMs = Timer::kMonthMs;
// Rewrite context for cache-extending a single resource.  It keeps a
// back-pointer to the CacheExtender that created it and forwards the actual
// rewriting (RewriteSingle) and the statistics update (Render) to it.
class CacheExtender::Context : public SingleRewriteContext {
 public:
  // Within this file, top-level contexts are constructed with a NULL |parent|
  // and nested contexts with a NULL |driver|.  No resource context is used.
  Context(CacheExtender* extender, RewriteDriver* driver,
          RewriteContext* parent)
      : SingleRewriteContext(driver, parent, NULL /* no resource context */),
        extender_(extender) {
  }
  virtual ~Context() {}

  // Reports the id of the owning filter.
  virtual const char* id() const { return extender_->id(); }

  // Cache-extended outputs are reported as on-the-fly resources.
  virtual OutputResourceKind kind() const { return kOnTheFlyResource; }

  virtual void RewriteSingle(const ResourcePtr& input,
                             const OutputResourcePtr& output);
  virtual void Render();

 private:
  CacheExtender* extender_;  // Never deleted by this class.

  DISALLOW_COPY_AND_ASSIGN(Context);
};
// Builds the filter for |driver| and looks up the two statistics variables
// whose names InitStats() registers.
CacheExtender::CacheExtender(RewriteDriver* driver)
    : RewriteFilter(driver) {
  Statistics* statistics = server_context()->statistics();
  extension_count_ = statistics->GetVariable(kCacheExtensions);
  not_cacheable_count_ = statistics->GetVariable(kNotCacheable);
}
// Nothing to release: the destructor body is intentionally empty.
CacheExtender::~CacheExtender() {
}
// Registers the statistics variables used by this filter: one counting
// successful cache extensions and one counting resources that were rejected
// because they are not cacheable.
void CacheExtender::InitStats(Statistics* statistics) {
  statistics->AddVariable(kCacheExtensions);
  statistics->AddVariable(kNotCacheable);
}
bool CacheExtender::ShouldRewriteResource(
const ResponseHeaders* headers, int64 now_ms,
const ResourcePtr& input_resource, const StringPiece& url,
CachedResult* result) const {
const ContentType* input_resource_type = input_resource->type();
if (input_resource_type == NULL) {
return false;
}
if (input_resource_type->type() == ContentType::kJavascript &&
driver()->options()->avoid_renaming_introspective_javascript() &&
JavascriptCodeBlock::UnsafeToRename(
input_resource->ExtractUncompressedContents())) {
CHECK(result != NULL);
result->add_debug_message(JavascriptCodeBlock::kIntrospectionComment);
return false;
}
if ((headers->CacheExpirationTimeMs() - now_ms) < kMinThresholdMs) {
// This also includes the case where a previous filter rewrote this.
return true;
}
UrlNamer* url_namer = driver()->server_context()->url_namer();
GoogleUrl origin_gurl(url);
// We won't initiate a CacheExtender::Context with a pagespeed
// resource URL. However, an upstream filter might have rewritten
// the resource after we queued the request, but before our
// context is asked to rewrite it. So we have to check again now
// that the resource URL is finalized.
if (server_context()->IsPagespeedResource(origin_gurl)) {
return false;
}
if (url_namer->ProxyMode()) {
return !url_namer->IsProxyEncoded(origin_gurl);
}
const DomainLawyer* lawyer = driver()->options()->domain_lawyer();
// We return true for IsProxyMapped because when reconstructing
// MAPPED_DOMAIN/file.pagespeed.ce.HASH.ext we won't be changing
// the domain (WillDomainChange==false) but we want this function
// to return true so that we can reconstruct the cache-extension and
// serve the result with long public caching. Without IsProxyMapped,
// we'd serve the result with cache-control:private,max-age=300.
return (lawyer->IsProxyMapped(origin_gurl) ||
lawyer->WillDomainChange(origin_gurl));
}
// Scans |element| for URL-valued attributes and starts a cache-extension
// rewrite for each one whose category is enabled on the driver (stylesheets,
// images, scripts, or -- for any other category -- URLs whose leaf ends in
// the PDF file extension).
void CacheExtender::StartElementImpl(HtmlElement* element) {
  resource_tag_scanner::UrlCategoryVector attributes;
  resource_tag_scanner::ScanElement(element, driver()->options(), &attributes);

  const int num_attributes = attributes.size();
  for (int idx = 0; idx < num_attributes; ++idx) {
    bool may_load = false;
    switch (attributes[idx].category) {
      case semantic_type::kScript:
        may_load = driver()->MayCacheExtendScripts();
        break;
      case semantic_type::kImage:
        may_load = driver()->MayCacheExtendImages();
        break;
      case semantic_type::kStylesheet:
        may_load = driver()->MayCacheExtendCss();
        break;
      default: {
        // Does the url in the attribute end in .pdf, ignoring query params?
        const char* decoded_url = attributes[idx].url->DecodedValueOrNull();
        if (decoded_url != NULL && driver()->MayCacheExtendPdfs()) {
          GoogleUrl url(driver()->base_url(), decoded_url);
          may_load = url.IsWebValid() &&
              StringCaseEndsWith(url.LeafSansQuery(),
                                 kContentTypePdf.file_extension());
        }
        break;
      }
    }

    // TODO(jmarantz): We ought to be able to domain-shard even if the
    // resources are non-cacheable or privately cacheable.
    if (!may_load || !driver()->IsRewritable(element)) {
      continue;
    }

    ResourcePtr input_resource(CreateInputResourceOrInsertDebugComment(
        attributes[idx].url->DecodedValueOrNull(), element));
    if (input_resource.get() == NULL) {
      continue;
    }

    // Never cache-extend something that already is a pagespeed resource.
    GoogleUrl input_gurl(input_resource->url());
    if (server_context()->IsPagespeedResource(input_gurl)) {
      continue;
    }

    ResourceSlotPtr slot(driver()->GetSlot(
        input_resource, element, attributes[idx].url));
    // NOTE(review): the context is handed to InitiateRewrite and never
    // deleted here; presumably the driver takes ownership -- confirm.
    Context* context = new Context(this, driver(), NULL /* not nested */);
    context->AddSlot(slot);
    driver()->InitiateRewrite(context);
  }
}
// Always true for this filter; this agrees with Context::kind(), which
// reports kOnTheFlyResource.
bool CacheExtender::ComputeOnTheFly() const {
  return true;
}
void CacheExtender::Context::RewriteSingle(
const ResourcePtr& input_resource,
const OutputResourcePtr& output_resource) {
// We only add link: rel = canonical to images and PDF; people don't normally
// use search engines to look for .css and .js files, so adding it
// there would just be a waste of bytes.
if (input_resource->type() != NULL &&
(input_resource->type()->IsImage() ||
input_resource->type()->type() == ContentType::kPdf)) {
AddLinkRelCanonical(input_resource, output_resource);
}
RewriteDone(
extender_->RewriteLoadedResource(
input_resource, output_resource, output_partition(0)), 0);
}
// Called when the rewrite result is rendered.  If the single output
// partition is optimizable, bumps the cache-extension counter and, when a
// driver is attached and the slot's resource has a known type, records the
// applied rewriter in the driver's log record.
void CacheExtender::Context::Render() {
  if (num_output_partitions() != 1 || !output_partition(0)->optimizable()) {
    return;
  }
  extender_->extension_count_->Add(1);

  // Log applied rewriter id.  Here, we care only about non-nested
  // cache extensions, and that too, those occurring in synchronous
  // flows only.
  if (Driver() == NULL) {
    return;
  }
  ResourceSlotPtr the_slot = slot(0);
  if (the_slot->resource().get() == NULL ||
      the_slot->resource()->type() == NULL) {
    return;
  }

  // Default to this filter's own id; refine it for the content types that
  // have a dedicated extend-cache filter id.
  const char* filter_id = id();
  const ContentType* type = the_slot->resource()->type();
  if (type->type() == ContentType::kCss) {
    filter_id = RewriteOptions::FilterId(RewriteOptions::kExtendCacheCss);
  } else if (type->type() == ContentType::kJavascript) {
    filter_id = RewriteOptions::FilterId(RewriteOptions::kExtendCacheScripts);
  } else if (type->IsImage()) {
    filter_id = RewriteOptions::FilterId(RewriteOptions::kExtendCacheImages);
  }

  // TODO(anupama): Log cache extension for pdfs etc.
  Driver()->log_record()->SetRewriterLoggingStatus(
      filter_id,
      the_slot->resource()->url(),
      RewriterApplication::APPLIED_OK);
}
// Cache-extends an already loaded |input_resource| into |output_resource|.
//
// Returns kRewriteFailed when the input is not proxy-cacheable (unless the
// HTTP cache forces caching), when ShouldRewriteResource() declines, when the
// content type is not on the whitelist below, or when resolving CSS URLs or
// writing the output fails.  Returns kRewriteOk once the output is written.
RewriteResult CacheExtender::RewriteLoadedResource(
    const ResourcePtr& input_resource,
    const OutputResourcePtr& output_resource,
    // TODO(jmaessen): does |result| belong in CacheExtender::Context rather
    // than being passed to this method and ShouldRewriteResource?
    CachedResult* result) {
  CHECK(input_resource->loaded());

  MessageHandler* message_handler = driver()->message_handler();
  const ResponseHeaders* headers = input_resource->response_headers();
  GoogleString url = input_resource->url();
  int64 now_ms = server_context()->timer()->NowMs();

  // See if the resource is cacheable; and if so whether there is any need
  // to cache extend it.
  if (!server_context()->http_cache()->force_caching() &&
      !headers->IsProxyCacheable()) {
    // Note: RewriteContextTest.PreserveNoCacheWithFailedRewrites
    // relies on CacheExtender failing rewrites in this case.
    // If you change this behavior that test MUST be updated as it covers
    // security.
    not_cacheable_count_->Add(1);
    return kRewriteFailed;
  }
  if (!ShouldRewriteResource(headers, now_ms, input_resource, url, result)) {
    return kRewriteFailed;
  }

  // We must be careful what Content-Types we allow to be cache extended.
  // Specifically, we do not want to cache extend any Content-Types that
  // could execute scripts when loaded in a browser because that could
  // open XSS vectors in case of system misconfiguration.
  //
  // We whitelist a set of safe Content-Types here.
  //
  // TODO(sligocki): Should we whitelist more Content-Types as well?
  // We would also have to find and rewrite the URLs to these resources
  // if we want to cache extend them.
  const ContentType* input_type = input_resource->type();
  const bool approved_type =
      input_type->IsImage() ||  // images get sniffed only to other images
      (input_type->type() == ContentType::kPdf &&
       driver()->MayCacheExtendPdfs()) ||  // Don't accept PDFs by default.
      input_type->type() == ContentType::kCss ||  // CSS + JS left as-is.
      input_type->type() == ContentType::kJavascript;
  if (!approved_type) {
    // Fail to cache extend a file that isn't an approved type.
    // If we decide not to fail to cache extend unapproved types, we
    // should convert their Content-Type to text/plain because as per
    // http://mimesniff.spec.whatwg.org/ it will never get turned into
    // anything dangerous.
    return kRewriteFailed;
  }
  // Approved types keep their Content-Type unchanged.
  const ContentType* output_type = input_type;

  StringPiece contents(input_resource->ExtractUncompressedContents());
  GoogleString transformed_contents;
  StringWriter writer(&transformed_contents);
  GoogleUrl input_resource_gurl(input_resource->url());
  if (output_type->type() == ContentType::kCss) {
    // The output resource's base may differ from the input's, so URLs
    // embedded in the CSS may have to be rewritten.
    switch (driver()->ResolveCssUrls(input_resource_gurl,
                                     output_resource->resolved_base(),
                                     contents, &writer, message_handler)) {
      case RewriteDriver::kSuccess:
        // TODO(jmarantz): find a mechanism to write this directly into
        // the HTTPValue so we can reduce the number of times that we
        // copy entire resources.
        contents = transformed_contents;
        break;
      case RewriteDriver::kWriteFailed:
        return kRewriteFailed;
      case RewriteDriver::kNoResolutionNeeded:
        break;
    }
  }

  server_context()->MergeNonCachingResponseHeaders(
      input_resource, output_resource);
  const bool written = driver()->Write(ResourceVector(1, input_resource),
                                       contents,
                                       output_type,
                                       input_resource->charset(),
                                       output_resource.get());
  return written ? kRewriteOk : kRewriteFailed;
}
// Creates a top-level (non-nested) context bound to this filter's driver.
// The context is heap-allocated and returned as a raw pointer.
RewriteContext* CacheExtender::MakeRewriteContext() {
  Context* top_level = new Context(this, driver(), NULL /*not nested*/);
  return top_level;
}
// Creates a context nested under |parent| that rewrites the resource in
// |slot|.  Nested contexts are constructed without a driver.  The context is
// heap-allocated and returned as a raw pointer.
RewriteContext* CacheExtender::MakeNestedContext(
    RewriteContext* parent, const ResourceSlotPtr& slot) {
  Context* nested = new Context(this, NULL /* driver*/, parent);
  nested->AddSlot(slot);
  return nested;
}
} // namespace net_instaweb