blob: 2a345bedf9ba56a3b756f54937adadbdb634604c [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_
#define NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_
#include <set>
#include <vector>
#include "net/instaweb/http/public/http_cache.h"
#include "net/instaweb/rewriter/cached_result.pb.h"
#include "net/instaweb/rewriter/public/output_resource_kind.h"
#include "net/instaweb/rewriter/public/resource.h"
#include "net/instaweb/rewriter/public/resource_slot.h"
#include "net/instaweb/rewriter/public/rewrite_result.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/scoped_ptr.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/response_headers.h"
#include "pagespeed/kernel/util/url_segment_encoder.h"
namespace net_instaweb {
class AsyncFetch;
class MessageHandler;
class NamedLock;
class RequestTrace;
class RewriteDriver;
class RewriteOptions;
class Statistics;
class Variable;
class FreshenMetadataUpdateManager;
// RewriteContext manages asynchronous rewriting of some n >= 1 resources (think
// CSS, JS, or images) into m >= 0 improved versions (typically, n = m = 1).
// It also helps update the references in the containing document (called
// slots), such as <img src=> in HTML, or background-image: url() in CSS,
// and make any other changes to it needed to commit the optimization.
//
// It is normally used as a base class, with its own code helping take care
// of caching, fetching, etc., while subclasses describe how to transform the
// resources, and how to update the document containing them with the new
// version by overriding some virtuals like Rewrite() and Render().
//
// Filters parsing HTML create their RewriteContext subclasses for every
// group of resources they think should be optimized together (such as one
// RewriteContext for every image for image re-compression, or one for a group
// of CSS files that have compatible HTML markup for CSS combining). The
// framework may also ask a filter to make its RewriteContext subclass via
// MakeRewriteContext() in case it needs to reconstruct an optimized resource
// that's not available in the cache.
//
// In the case of combining filters, a single RewriteContext may
// result in multiple rewritten resources that are partitioned based
// on data semantics. Most filters will just work on one resource,
// and those can inherit from SingleRewriteContext which is simpler
// to implement.
//
// The most basic transformation steps subclasses will want to implement are:
//
// Partition:
// Determines how many outputs, if any, will be created from all the inputs.
// For example, a spriter may create separate partitions for groups of images
// with similar colormaps. This step is also responsible for deciding what to
// do if some inputs were not loaded successfully. SingleRewriteContext
// provides the correct implementation for transformations that take in one
// file and optimize it.
//
// Rewrite:
// Takes inputs from one partition, and tries to produce an optimized output
// for it, as well as a CachedResult, which caches any auxiliary information
// that may be needed to update the container document. For example, the image
// filter will store image dimensions inside the CachedResult object.
//
// If a better version can be created, the subclass should call
// RewriteDriver::Write with its data, and then RewriteDone(kRewriteOk).
//
// If no improvement is possible, it should call RewriteDone(kRewriteFailed).
// Note that this does not mean that nothing can be done, just that no new
// resource has been created (for example an image filter might still insert
// dimensions into the <img> tag even if it can't compress the image better).
//
// Render:
// Updates the document based on information stored in CachedResult.
// This is the only step that can touch the HTML DOM. Note that you
// do not need to implement it if you just want to update the URL to the new
// version: the ResourceSlots will do it automatically.
//
// Which of the steps get invoked depends on how much information has been
// cached, as well as on timing of things (since the system tries not to
// hold up the web page noticeably to wait for an optimization). Common
// scenarios are:
//
// 1) New rewrite, finishes quickly:
// Partition -> Rewrite -> Render
// 2) New rewrite, but too slow to render:
// Partition -> Rewrite
// 3) Metadata cache hit:
// Render
// 4) Reconstructing output from a .pagespeed. URL:
// Rewrite
//
// Note in particular that (3) means that all rendering should be doable just
// from information inside the CachedResult.
//
// Top-level RewriteContexts are initialized from the HTML thread, by filters
// responding to parser events. In particular, from this thread they can be
// constructed, and AddSlot() and Initiate() can be called. Once Initiate is
// called, the RewriteContext runs purely in its two threads, until
// it completes. At that time it will self-delete in coordination with
// RewriteDriver.
//
// RewriteContexts can also be nested, in which case they are constructed,
// slotted, and Initiated all within the rewrite threads. However, they
// are Propagated and destructed by their parent, which was initiated by the
// RewriteDriver.
//
// RewriteContext utilizes two threads (via QueuedWorkerPool::Sequence)
// to do most of its work. The "high priority" thread is used to run the
// dataflow graph: queue up fetches and cache requests, partition inputs,
// render results, etc. The actual Rewrite() methods, however, are invoked
// in the "low priority" thread and can be canceled during extreme load
// or shutdown.
//
// TODO(jmarantz): add support for controlling TTL on failures.
class RewriteContext {
public:
typedef std::vector<InputInfo*> InputInfoStarVector;
static const char kNumRewritesAbandonedForLockContention[];
static const char kNumDeadlineAlarmInvocations[];
static const char kNumDistributedRewriteSuccesses[];
static const char kNumDistributedRewriteFailures[];
static const char kNumDistributedMetadataFailures[];
// The extension used for all distributed fetch URLs.
static const char kDistributedExt[];
// The hash value used for all distributed fetch URLs.
static const char kDistributedHash[];
static const char kHashMismatchMessage[];
// Used to pass the result of the metadata cache lookups. Recipient must
// take ownership.
struct CacheLookupResult {
  // Outcome flags of the lookup. The constructor starts every one of them
  // out as false.
  bool cache_ok;
  bool can_revalidate;
  bool useable_cache_content;
  bool is_stale_rewrite;
  // Raw InputInfo pointers; presumably the inputs that need revalidation
  // (confirm against the code that fills this in).
  InputInfoStarVector revalidate;
  // Held via scoped_ptr; the constructor seeds it with a default-constructed
  // OutputPartitions.
  scoped_ptr<OutputPartitions> partitions;

  // Builds a result with all flags cleared and fresh partitions storage.
  // The initializers follow the member declaration order above.
  CacheLookupResult()
      : cache_ok(false),
        can_revalidate(false),
        useable_cache_content(false),
        is_stale_rewrite(false),
        partitions(new OutputPartitions) {}
};
// Used for LookupMetadataForOutputResource.
// Abstract callback interface: implementers override Done() to receive the
// outcome of a metadata cache lookup.
class CacheLookupResultCallback {
public:
CacheLookupResultCallback() {}
// Virtual so that implementations can be destroyed through a base pointer;
// the definition is not in this header.
virtual ~CacheLookupResultCallback();
// Receives the cache key and the lookup result. Given the "Recipient must
// take ownership" note on CacheLookupResult, the implementation presumably
// becomes the owner of |result| -- NOTE(review): confirm against callers.
virtual void Done(const GoogleString& cache_key,
CacheLookupResult* result) = 0;
private:
// By its name, this macro disables copy construction and assignment.
DISALLOW_COPY_AND_ASSIGN(CacheLookupResultCallback);
};
// Takes ownership of resource_context, which must be NULL or
// allocated with 'new'.
RewriteContext(RewriteDriver* driver, // exactly one of driver & parent
RewriteContext* parent, // is non-null
ResourceContext* resource_context);
virtual ~RewriteContext();
// Random access to slots. This is not thread-safe. Prior to
// Initiate(), these can be called by the constructing thread.
// After Initiate(), these should only be called by the Rewrite
// thread.
int num_slots() const {
  // The container reports its size as an unsigned type; narrow it explicitly
  // to the int this accessor exposes.
  return static_cast<int>(slots_.size());
}
ResourceSlotPtr slot(int index) const {
  // |index| is forwarded to slots_ as-is; no range validation happens here.
  return slots_[index];
}
// Random access to outputs. These should only be accessed by
// the RewriteThread.
int num_outputs() const {
  // Explicitly narrow the container's size to the int this accessor exposes.
  return static_cast<int>(outputs_.size());
}
OutputResourcePtr output(int i) const {
  // |i| is forwarded to outputs_ as-is; no range validation happens here.
  return outputs_[i];
}
// These are generally accessed in the Rewrite thread,
// but may also be accessed in ::Render.
int num_output_partitions() const;
const CachedResult* output_partition(int i) const;
CachedResult* output_partition(int i);
// Returns true if this context is chained to some predecessors, and
// must therefore be started by a predecessor and not RewriteDriver.
bool chained() const {
  // Plain read of the chained_ flag.
  return chained_;
}
// Resource slots must be added to a Rewrite before Initiate() can
// be called. Starting the rewrite sets in motion a sequence
// of async cache-lookups &/or fetches.
void AddSlot(const ResourceSlotPtr& slot);
// Remove the last slot from the context's slot list. This
// context must be the last one attached to the slot.
void RemoveLastSlot();
// Adds a new nested RewriteContext. This RewriteContext will not
// be considered complete until all nested contexts have completed.
// This may be useful, for example for a CSS optimizer that also wants to
// optimize images referred to from CSS (in which case the image rewrite
// context will be nested inside the CSS context).
void AddNestedContext(RewriteContext* context);
void CallFetchInputs();
void CallLockFailed();
void CallStartFetchImpl();
// Starts a resource rewrite. Once Initiated, the Rewrite object
// should only be accessed from the Rewrite thread, until it
// Completes, at which point top-level Contexts will call
// RewriteComplete on their driver, and nested Contexts will call
// NestedRewriteDone on their parent. Nested rewrites will be
// Started directly from their parent context, and Initiate will not
// be called.
//
// Precondition: this rewrite isn't anyone's successor (i.e. chained() == false)
// and has not been started before.
void Initiate();
// Fetch the specified output resource by reconstructing it from
// its inputs, sending output into fetch.
//
// True is returned if an asynchronous fetch got queued up.
// If false, fetch->Done() will not be called.
bool Fetch(const OutputResourcePtr& output_resource,
AsyncFetch* fetch,
MessageHandler* message_handler);
// If true, we have determined that this job can't be rendered just
// from metadata cache (including all prerequisites).
bool slow() const {
  // Plain read of the slow_ flag.
  return slow_;
}
// This particular rewrite was a metadata cache miss.
bool is_metadata_cache_miss() const {
  // Plain read of the is_metadata_cache_miss_ flag.
  return is_metadata_cache_miss_;
}
// Returns true if this is a nested rewriter.
bool has_parent() const {
  // A context counts as nested exactly when its parent pointer is set.
  const bool is_nested = (parent_ != NULL);
  return is_nested;
}
// Returns true if this is a child rewriter and its parent has the given
// id.
bool IsNestedIn(StringPiece id) const;
// Allows a nested rewriter to walk up its parent hierarchy.
RewriteContext* parent() {
  // Mutable overload; yields NULL when no parent is set.
  return parent_;
}
const RewriteContext* parent() const {
  // Read-only overload; yields NULL when no parent is set.
  return parent_;
}
// Accessors for the nested rewrites.
int num_nested() const {
  // Explicitly narrow the container's size to the int this accessor exposes.
  return static_cast<int>(nested_.size());
}
RewriteContext* nested(int i) const {
  // |i| is forwarded to nested_ as-is; no range validation happens here.
  return nested_[i];
}
// Returns the driver_ pointer stored on this context.
RewriteDriver* Driver() const { return driver_; }
// If called with true, forces a rewrite and re-generates the output.
void set_force_rewrite(bool x) {
  // Stores the flag only; nothing else happens in this setter.
  force_rewrite_ = x;
}
bool rewrite_uncacheable() const {
  // Plain read of the rewrite_uncacheable_ flag.
  return rewrite_uncacheable_;
}
// Stores the rewrite_uncacheable_ flag read back by rewrite_uncacheable().
void set_rewrite_uncacheable(bool rewrite_uncacheable) { rewrite_uncacheable_ = rewrite_uncacheable; }
// Read-only raw pointer obtained from resource_context_.get(); the holder
// member keeps managing the object.
const ResourceContext* resource_context() const { return resource_context_.get(); }
// Returns debug information about this RewriteContext.
GoogleString ToString() const;
GoogleString ToStringWithPrefix(StringPiece prefix) const;
// Initializes statistics.
static void InitStats(Statistics* stats);
protected:
typedef std::vector<GoogleUrl*> GoogleUrlStarVector;
// -----------------------------------------------------------------------
// Resource transformation APIs. If you are implementing an optimization,
// you'll be dealing mainly with these.
// -----------------------------------------------------------------------
// Finds the ServerContext associated with this context. Note that
// this method might have to climb up the parent-tree, but it's typically
// not a deep tree. Same with Driver() and Options().
ServerContext* FindServerContext() const;
const RewriteOptions* Options() const;
OutputPartitions* partitions() {
  // Hands back the raw pointer obtained from partitions_.get().
  return partitions_.get();
}
// Add a dummy other_dependency that will force the rewrite's OutputPartitions
// to be rechecked after a modest TTL.
void AddRecheckDependency();
// If this returns true, running the rewriter isn't required for
// correctness of the page, so the engine will be permitted to drop
// the rewrite if needed to preserve system responsiveness.
virtual bool OptimizationOnly() const {
  // Base-class default is true; a subclass can override to return otherwise.
  return true;
}
// Partitions the input resources into one or more outputs. Return
// 'true' if the partitioning could complete (whether a rewrite was
// found or not), false if the attempt was abandoned and no
// conclusion can be drawn.
//
// Note that if partitioner finds that the resources are not
// rewritable, it will still return true; it will simply have
// an empty inputs-array in OutputPartitions and leave
// 'outputs' unmodified. 'false' is only returned if the subclass
// skipped the rewrite attempt due to a lock conflict.
//
// You must override one of Partition() or PartitionAsync(). Partition()
// is normally fine unless you need to do computations that can take a
// noticeable amount of time, since there are some scenarios under which
// page output may end up being held up for a partitioning step. If you
// do need to do something computationally expensive in partitioning steps,
// override PartitionAsync() instead.
virtual bool Partition(OutputPartitions* partitions,
OutputResourceVector* outputs);
// As above, but you report the result asynchronously by calling
// PartitionDone(), which must be done from the main rewrite
// sequence. One of Partition or PartitionAsync() must be overridden in
// the subclass. The default implementation is implemented in terms of
// Partition().
virtual void PartitionAsync(OutputPartitions* partitions,
OutputResourceVector* outputs);
// Call this from the main rewrite sequence to report results of
// PartitionAsync. If the client is not in the main rewrite sequence,
// use CrossThreadPartitionDone() instead.
void PartitionDone(RewriteResult result);
// Helper for queuing invocation of PartitionDone to run in the
// main rewrite sequence.
void CrossThreadPartitionDone(RewriteResult result);
// Takes a completed rewrite partition and rewrites it. When
// complete, implementations should call RewriteDone(kRewriteOk) if
// they successfully created an output resource using RewriteDriver::Write,
// and RewriteDone(kRewriteFailed) if they didn't. They may also call
// RewriteDone(kTooBusy) in case system load/resource usage makes it
// dangerous for the filter to do optimization at this time.
//
// Any information about the inputs or output that may be needed to update
// the containing document should be stored inside the CachedResult.
//
// If implementors wish to rewrite resources referred to from within the
// inputs (e.g. images in CSS), they may create nested rewrite contexts
// and call AddNestedContext() on each, and then StartNestedTasks()
// when all have been added.
//
// TODO(jmarantz): check for resource completion from a different
// thread (while we were waiting for resource fetches) when Rewrite
// gets called.
virtual void Rewrite(int partition_index,
CachedResult* partition,
const OutputResourcePtr& output) = 0;
// Called by subclasses when an individual rewrite partition is
// done. Note that RewriteDone may 'delete this' so no
// further references to 'this' should follow a call to RewriteDone.
// This method can run in any thread.
void RewriteDone(RewriteResult result, int partition_index);
// Sends a response to the client via the AsyncFetch, transforming
// output if needed (e.g. css absolutification) and controlling chunked
// encoding hints as needed.
//
// This is called in case a rewrite fails in the fetch path or a deadline
// is exceeded. Default implementation is just to write the input.
// But contexts may need to specialize this to actually absolutify
// subresources if the fetched resource is served on a different path
// than the input resource.
virtual bool SendFallbackResponse(StringPiece output_url_base,
StringPiece contents,
AsyncFetch* async_fetch,
MessageHandler* handler);
// Called on the parent to initiate all nested tasks. This is so
// that they can all be added before any of them are started.
// May be called from any thread.
void StartNestedTasks();
// Once any nested rewrites have completed, the results of these
// can be incorporated into the rewritten data. For contexts that
// do not require any nested RewriteContexts, it is OK to skip
// overriding this method -- the empty default implementation is fine.
virtual void Harvest();
// Performs rendering activities that span multiple HTML slots. For
// example, in a filter that combines N slots to 1, N-1 of the HTML
// elements might need to be removed. That can be performed in
// Render(). This method is optional; the base-class implementation
// is empty.
//
// Note that unlike Harvest(), this method runs in the HTML thread (for
// top-level rewrites), and only runs if the rewrite completes prior to
// the rewrite-deadline. If the rewrite does make it by the deadline,
// RewriteContext::Render() will be invoked regardless of whether any slots
// were actually optimized successfully.
virtual void Render();
// Notifies the subclass that the filter will not be able to render its
// output to the containing HTML document, because it wasn't ready in time.
// Note that neither Render() nor WillNotRender() may be called in case
// this rewrite got canceled due to disable_further_processing(), or in case
// Partition() failed. This is called from the HTML thread, but should only be
// used for read access, and subclass implementations are required to be
// reasonably quick since it's called with rewrite_mutex() held. It's called
// after any earlier contexts in filter order had completed their rendering,
// if any, but with no order guarantees with respect to other WillNotRender()
// invocations.
virtual void WillNotRender();
// This method is invoked (in Rewrite thread) if this context got canceled
// due to an earlier filter sharing a slot with it having called
// set_disable_further_processing. Default implementation does nothing.
virtual void Cancel();
// This final set of protected methods can be optionally overridden
// by subclasses.
// All RewriteContexts define how they encode URLs and other
// associated information needed for a rewrite into a URL.
// The default implementation handles a single URL with
// no extra data. The RewriteContext owns the encoder.
//
// TODO(jmarantz): remove the encoder from RewriteFilter.
virtual const UrlSegmentEncoder* encoder() const;
// Allows subclasses to add additional text to be appended to the
// metadata cache key. The default implementation returns "".
virtual GoogleString CacheKeySuffix() const;
// Indicates user agent capabilities that must be stored in the cache key.
//
// Note that the context may be NULL as it may not be set before this. Since
// it isn't going to be modified in the method, ResourceContext is passed
// as a const pointer.
// TODO(morlovich): This seems to overlap with CacheKeySuffix.
virtual GoogleString UserAgentCacheKey(const ResourceContext* context) const {
  // The base class contributes nothing to the key: |context| is ignored and
  // an empty string is returned.
  return GoogleString();
}
// Encodes User Agent into the ResourceContext.
// A subclass ResourceContext should normally call
// RewriteFilter::EncodeUserAgentIntoResourceContext if it has access to
// a RewriteFilter.
virtual void EncodeUserAgentIntoResourceContext(ResourceContext* context) {
  // Intentionally a no-op in the base class; |context| is left untouched.
}
// Returns the filter ID.
virtual const char* id() const = 0;
// Rewrites come in three flavors, as described in output_resource_kind.h,
// so this method must be defined by subclasses to indicate which it is.
//
// For example, we will avoid caching output_resource content in the HTTP
// cache for rewrites that are so quick to complete that it's fine to
// do the rewrite on every request. extend_cache is obviously in
// this category, and it's arguable we could treat js minification
// that way too (though we don't at the moment).
virtual OutputResourceKind kind() const = 0;
// -----------------------------------------------------------------------
// Tracing API.
// -----------------------------------------------------------------------
// Creates a new request trace associated with this context with a given
// |label|.
void AttachDependentRequestTrace(const StringPiece& label);
// Provides the dependent request trace associated with this context, if any.
// Note that this is distinct from the root user request trace, available
// in Driver().
RequestTrace* dependent_request_trace() {
  // Returns the stored pointer unchanged.
  return dependent_request_trace_;
}
// A convenience wrapper to log a trace annotation in both the request
// trace (if present) as well as the root user request trace (if present).
void TracePrintf(const char* fmt, ...);
// -----------------------------------------------------------------------
// Fetch state machine override APIs, as well as exports of some general
// state machine state for overriders to use. If you just want to write an
// optimization, you do not need these --- they are useful if you want to
// write a new state machine that's similar but not quite identical to
// what RewriteContext provides.
// -----------------------------------------------------------------------
// Called in fetch path if we have not found the resource available
// in HTTP cache under an alternate location suggested by metadata cache
// such as a different hash or the original, and thus need to fully
// reconstruct it.
//
// The base implementation will do an asynchronous locking attempt,
// scheduling to run FetchInputs when complete. Subclasses may override
// this method to preload inputs in a different manner, and may delay
// calling of base version until that is complete.
virtual void StartFetchReconstruction();
// Determines if the given rewrite should be distributed. This is based on
// whether distributed servers have been configured, if the current filter is
// configured to be distributed, where a filter is in a chain, if a
// distributed fetcher is in place, and if distribution has been explicitly
// disabled for this context.
bool ShouldDistributeRewrite() const;
// Determines if this rewrite-context is acting on behalf of a distributed
// rewrite request from an HTML rewrite. Verifies the distributed rewrite key.
bool IsDistributedRewriteForHtml() const;
// Dispatches the rewrite to another task with a distributed fetcher. Should
// not be called without first getting true from ShouldDistributeRewrite() as
// it has guards (such as checking the number of slots).
void DistributeRewrite();
// Makes the rest of a fetch run in background, not producing
// a result or invoking callbacks. Will arrange for appropriate
// memory management with the rewrite driver itself; but the caller
// is responsible for delivering results itself and invoking the
// callback.
void DetachFetch();
// Decodes the output resource to find the resources to be fetched. The
// default behavior decodes the output resource name into multiple paths and
// absolutifies them with respect to the output resource base. Returns true if
// the decoding is successful and false otherwise.
virtual bool DecodeFetchUrls(const OutputResourcePtr& output_resource,
MessageHandler* message_handler,
GoogleUrlStarVector* url_vector);
// Adjust headers sent out for a stale or in-place result. We may send out
// stale results in the fallback fetch pathway, but these results should not
// be cached much. By default we strip Set-Cookie* headers and Etags, and
// convert Cache-Control headers to private, max-age=300.
virtual void FixFetchFallbackHeaders(const CachedResult& cached_result,
ResponseHeaders* headers);
// Callback once the fetch is done. This calls Driver()->FetchComplete() if
// notify_driver_on_fetch_done is true.
virtual void FetchCallbackDone(bool success);
// Attempts to fetch a given URL from HTTP cache, and serves it
// (with shortened HTTP headers) if available. If not, fallback to normal
// full reconstruction path. Note that the hash can be an empty string if the
// url is not rewritten.
virtual void FetchTryFallback(const GoogleString& url,
const StringPiece& hash);
// Freshens resources proactively to avoid expiration in the near future.
void Freshen();
// Reads the notify_driver_on_fetch_done_ flag.
bool notify_driver_on_fetch_done() const { return notify_driver_on_fetch_done_; }
// Stores the notify_driver_on_fetch_done_ flag.
void set_notify_driver_on_fetch_done(bool value) { notify_driver_on_fetch_done_ = value; }
// Returns true if this context will prevent any attempt at distributing a
// rewrite (although its nested context still may be distributed). See
// ShouldDistributeRewrite for more detail on when a rewrite should be
// distributed.
bool block_distribute_rewrite() const {
  // Plain read of the block_distribute_rewrite_ flag.
  return block_distribute_rewrite_;
}
// Stores the block_distribute_rewrite_ flag.
void set_block_distribute_rewrite(const bool x) { block_distribute_rewrite_ = x; }
// Note that the following must only be called in the fetch flow.
AsyncFetch* async_fetch();
// Is fetch_ detached? Only call this in the fetch flow.
bool FetchContextDetached();
// The message handler for the fetch.
MessageHandler* fetch_message_handler();
// Indicates whether we are serving a stale rewrite.
bool stale_rewrite() const {
  // Plain read of the stale_rewrite_ flag.
  return stale_rewrite_;
}
// Returns an interval in milliseconds to wait when configuring the deadline
// alarm in FetchContext::SetupDeadlineAlarm(). Subclasses may configure the
// deadline based on rewrite type, e.g., IPRO vs. HTML-path.
virtual int64 GetRewriteDeadlineAlarmMs() const;
// Should the context call LockForCreation before checking the cache?
virtual bool CreationLockBeforeStartFetch() const;
// Should the context fail to serve the rewritten resource if the hash
// doesn't match user requested hash?
// By default, we do not fail and simply serve with limited Caching headers
// assuming that an out-of-date resource is better than none. But for
// resources like source maps, out-of-date versions are worse than nothing
// because they are complete nonsense if not associated with the exact
// expected contents.
virtual bool FailOnHashMismatch() const {
  // Base-class default is false; a subclass can override to return true.
  return false;
}
// Backend to RewriteDriver::LookupMetadataForOutputResource, with
// the RewriteContext of appropriate type and the OutputResource already
// created. Takes ownership of rewrite_context.
static bool LookupMetadataForOutputResourceImpl(
OutputResourcePtr output_resource,
const GoogleUrl& gurl,
RewriteContext* rewrite_context,
RewriteDriver* driver,
GoogleString* error_out,
CacheLookupResultCallback* callback);
private:
class DistributedRewriteCallback;
class DistributedRewriteFetch;
class OutputCacheCallback;
class WriteIfChanged;
class LookupMetadataForOutputResourceCallback;
class HTTPCacheCallback;
class ResourceCallbackUtils;
class ResourceFetchCallback;
class ResourceReconstructCallback;
class ResourceRevalidateCallback;
class InvokeRewriteFunction;
class RewriteFreshenCallback;
friend class RewriteDriver;
typedef std::set<RewriteContext*> ContextSet;
// This is passed to CanFetchFallbackToOriginal when trying to determine
// whether using the 0th input resource would be an acceptable substitute
// for output when:
enum FallbackCondition {
  // Trying to produce a result quicker to improve latency.
  kFallbackDiscretional = 0,
  // The rewrite failed and output would otherwise not be available.
  kFallbackEmergency = 1
};
// Callback helper functions.
void Start();
void SetPartitionKey();
void StartFetch();
void StartFetchImpl();
void CancelFetch();
void OutputCacheDone(CacheLookupResult* cache_result);
void OutputCacheHit(bool write_partitions);
void OutputCacheRevalidate(const InputInfoStarVector& to_revalidate);
void OutputCacheMiss();
void ResourceFetchDone(bool success, ResourcePtr resource, int slot_index);
void ResourceRevalidateDone(InputInfo* input_info, bool success);
void LogMetadataCacheInfo(bool cache_ok, bool can_revalidate);
// When a RewriteContext 'B' discovers that it's doing the exact same rewrite
// as a previous RewriteContext 'A', B adds itself to A->repeated_, and
// suspends its work, expecting 'A' to call B->RepeatedSuccess(A) or
// B->RepeatedFailure() to give it the result of the rewrite.
void RepeatedSuccess(const RewriteContext* primary);
void RepeatedFailure();
// After a Rewrite is complete, writes the metadata for the rewrite
// operation to the cache, and runs any further rewrites that are
// dependent on this one.
//
// If there are pending nested rewrites then this call has no
// effect. Once all the nested rewrites have been accounted for via
// NestedRewriteDone() then Finalize can queue up its render and
// enable successor rewrites to proceed.
void Finalize();
// Get reference to lock_, lazy-initializing if necessary.
NamedLock* Lock();
// Initiates an asynchronous fetch for the resources associated with
// each slot, calling ResourceFetchDone() when complete.
//
// To avoid concurrent fetches across multiple processes or threads, the
// caller must first lock each input by name, blocking or abandoning rewriting
// as necessary. Input fetches done on behalf of resource fetches must
// succeed to avoid sending 404s to clients, and so they will break locks.
// Input fetches done for async rewrite initiations should fail fast to help
// avoid having multiple concurrent processes attempt the same rewrite.
void FetchInputs();
// Called when we fail to acquire the lock for the output resource.
void LockFailed();
// Callback to a distributed rewrite fetch. Queued to run in the high-priority
// thread. Fetch path: If the fetch succeeded then the rest of the flow is
// skipped and that result is used, otherwise the original resource is fetched
// and returned, bypassing rewriting.
void DistributeRewriteDone(bool success);
// If the response_headers have metadata in them, strip the metadata from the
// headers, parse them and write them to cache_result. Returns true if
// the parse was successful otherwise false.
bool ParseAndRemoveMetadataFromResponseHeaders(
ResponseHeaders* response_headers, CacheLookupResult* cache_result);
// Create an OutputResource initialized from CachedResult, response headers,
// and content.
bool CreateOutputResourceFromContent(const CachedResult& cached_result,
const ResponseHeaders& response_headers,
StringPiece content,
OutputResourcePtr* output_resource);
// The distributed rewrite path for HTML rewrites works by converting the
// input URL on the ingress task into a .pagespeed. fetch for the distributed
// task to reconstruct using the corresponding filter id. This function maps
// the given input resource URL into a .pagespeed. URL for reconstruction. It
// uses a hash value of 0 and an extension of "distributed". Returns an empty
// string if the URL could not be constructed (e.g., was too long).
//
// Ex. input: http://www.example.com/a.png with an image compression context
// output: http://www.example.com/50x50xa.png.pagespeed.ic.0.distributed
GoogleString DistributedFetchUrl(StringPiece url);
// Returns true if this rewrite context was created to fetch a resource (e.g.,
// IPRO or .pagespeed. URLs) and false otherwise.
bool IsFetchRewrite() const {
  // True exactly when fetch_ currently holds a non-NULL pointer.
  const bool has_fetch = (fetch_.get() != NULL);
  return has_fetch;
}
// Called on the parent from a nested Rewrite when it is complete.
// Note that we don't track rewrite success/failure here. We only
// care whether the nested rewrites are complete, and whether there
// are any dependencies.
void NestedRewriteDone(const RewriteContext* context);
// Generally a RewriteContext is waiting for one or more
// asynchronous events to take place. Activate is called
// to run some action to help us advance to the next state.
void Activate();
// Runs after all Rewrites have been completed, and all nested
// RewriteContexts have completed and harvested.
//
// For top-level Rewrites, this must be called from the HTML thread.
// For nested Rewrites it runs from the Rewrite thread.
//
// If render_slots is true, then all the slots owned by this context
// will have Render() called on them. For top-level Rewrites, this
// should only be done if the rewrite completes before the rewrite
// deadline expires. After that, the HTML elements referred to by
// the slots have already been flushed to the network. For nested
// Rewrites it's done unconditionally.
//
// Rewriting and propagation continue even after this deadline, so
// that we may cache the rewritten results, allowing the deadline to
// be easier-to-hit next time the same resources need to be
// rewritten.
//
// And in all cases, the successor Rewrites are queued up in the
// Rewrite thread once any nested propagation is complete. And, in
// particular, each slot must be updated with any rewritten
// resources, before the successors can be run, independent of
// whether the slots can be rendered into HTML.
void Propagate(bool render_slots);
// With all resources loaded, the rewrite can now be done, writing:
// The metadata into the cache
// The output resource into the cache
// if the driver has not been detached,
// the url+data->rewritten_resource is written into the rewrite
// driver's map, for each of the URLs.
void StartRewriteForHtml();
// NOTE(review): no separate description is given for the fetch variant;
// presumably it is the fetch-path counterpart of StartRewriteForHtml() --
// confirm against the implementation file.
void StartRewriteForFetch();
// Determines whether the Context is in a state where it's ready to
// rewrite. This requires:
// - no preceding RewriteContexts in progress
// - no outstanding cache lookups
// - no outstanding fetches
// - rewriting not already complete.
bool ReadyToRewrite() const;
// Removes this RewriteContext from all slots. This is done normally when
// a RewriteContext is completed and we are ready to run the successors.
// It is also done when aborting a RewriteContext due to cache being
// unhealthy.
void DetachSlots();
// Activate any Rewrites that come after this one, for serializability
// of access to common slots.
void RunSuccessors();
// Writes out the partition-table into the metadata cache (checking
// ok_to_write_output_partitions_)
void WritePartition();
// Does all the bookkeeping needed after rewrite in HTML completes ---
// writes out cache data, notifies any repeated rewrites, queues up
// successors, cleans things up, etc.
//
// This method may call 'delete this' so it should be the last call at its
// call-site.
//
// It will *not* call 'delete this' if there is a live RewriteDriver,
// waiting for a convenient point to render the rewrites into HTML.
void FinalizeRewriteForHtml();
// Arranges for commit of all the state (if permit_render is true), and
// notification of parents, rewrite driver, etc., as well as running of
// successors if applicable. This is the tail portion of
// FinalizeRewriteForHtml that must be called even if we didn't
// actually get as far as computing a partition_key_.
void RetireRewriteForHtml(bool permit_render);
// Marks this job and any dependents slow as appropriate, notifying the
// RewriteDriver of any changes.
void MarkSlow();
// Notes that we dropped parts of this rewrite due to system load, so we
// should not cache it.
void MarkTooBusy();
// Collect all non-nested contexts that depend on this one (including
// itself). Note that this might exclude some repeated jobs that haven't
// gotten far enough to realize that yet.
void CollectDependentTopLevel(ContextSet* contexts);
// Actual implementation of RewriteDone that's queued to run in
// high-priority rewrite thread.
void RewriteDoneImpl(RewriteResult result, int partition_index);
// Actual implementation of StartNestedTasks that's queued to run in
// high-priority rewrite thread.
void StartNestedTasksImpl();
// Establishes that a slot has been rewritten. So when Propagate()
// is called, the resource update that has been written to this slot can
// be propagated to the DOM.
void RenderPartitionOnDetach(int partition_index);
// Sets up all the state needed for Fetch, but doesn't register this context
// or actually start the rewrite process.
bool PrepareFetch(
const OutputResourcePtr& output_resource,
AsyncFetch* fetch,
MessageHandler* message_handler);
// Creates an output resource that corresponds to a full URL stored in
// metadata cache.
bool CreateOutputResourceForCachedOutput(const CachedResult* cached_result,
OutputResourcePtr* output_resource);
// Callback for metadata lookup on fetch path.
void FetchCacheDone(CacheLookupResult* cache_result);
// Callback for HTTP lookup on fetch path where the metadata cache suggests
// we should try either serving a different path or the original.
void FetchFallbackCacheDone(HTTPCache::FindResult result,
HTTPCache::Callback* data);
// Returns true if we can attempt to serve the original file for a fetch
// request in case something goes wrong with rewriting (circumstance ==
// kFallbackEmergency) or the system thinks that would avoid a latency
// spike or overload (kFallbackDiscretional).
bool CanFetchFallbackToOriginal(FallbackCondition circumstance) const;
// Checks whether an other-dependency input info with the same data already
// exists in the partition. Used to de-dup the field.
bool HasDuplicateOtherDependency(const InputInfo& input);
// Check if there is a duplicate and if there is none, add to the other
// dependencies. Updates the internal other_dependency map that is used to
// de-dup the contents.
void CheckAndAddOtherDependency(const InputInfo& input);
// Perform checks and freshen the input resource. Also updates metadata if
// required.
void CheckAndFreshenResource(const InputInfo& input_info,
ResourcePtr resource, int partition_index,
int input_index,
FreshenMetadataUpdateManager* freshen_manager);
// Returns a ResourcePtr for input_url.
// NOTE(review): presumably creates a new URL-backed input resource; whether
// the result can be a null ResourcePtr on failure is not visible from this
// header -- confirm against the implementation file.
ResourcePtr CreateUrlResource(const StringPiece& input_url);
// To perform a rewrite, we need to have data for all of its input slots.
ResourceSlotVector slots_;
// Not all of the slots require rendering from this RewriteContext. If an
// optimization was deemed non-beneficial then we skip rendering the slot.
// So keep the slots requiring rendering in a bitvector.
std::vector<bool> render_slots_;
// It's feasible that callbacks for different resources will be delivered
// on different threads, thus we must protect these counters with a mutex
// or make them atomic integers.
//
// TODO(jmarantz): keep the outstanding fetches as a set so they can be
// terminated cleanly and immediately, allowing fast process shutdown.
// For example, if Apache notifies our process that it's being shut down
// then we should have a mechanism to cancel all pending fetches. This
// would require a new cancellation interface from both CacheInterface and
// UrlAsyncFetcher.
// NOTE(review): the threading comment above appears to concern the
// outstanding_* counters below rather than started_. started_ presumably
// records that this context has begun running -- confirm against the
// implementation file.
bool started_;
// NOTE(review): presumably the partition table that WritePartition() writes
// to the metadata cache -- confirm.
scoped_ptr<OutputPartitions> partitions_;
// NOTE(review): presumably the output resources corresponding to
// partitions_ -- confirm.
OutputResourceVector outputs_;
// NOTE(review): presumably the counters of in-flight work referred to by the
// threading comment above. ReadyToRewrite() is documented as requiring no
// outstanding fetches.
int outstanding_fetches_;
int outstanding_rewrites_;
scoped_ptr<ResourceContext> resource_context_;
// May not have been computed if the rewrite is retired early; see the
// comment on RetireRewriteForHtml().
GoogleString partition_key_;
// NOTE(review): presumably the encoder used when a subclass does not supply
// its own -- confirm.
UrlSegmentEncoder default_encoder_;
// Lock guarding output partitioning and rewriting. Lazily initialized by
// Lock(), unlocked on destruction or the end of Finish().
scoped_ptr<NamedLock> lock_;
// When this rewrite object is created on behalf of a fetch, we must
// keep the response_writer, request_headers, and callback in the
// FetchContext so they can be used once the inputs are available.
class FetchContext;
scoped_ptr<FetchContext> fetch_;
// Track the RewriteContexts that must be run after this one because they
// share a slot.
std::vector<RewriteContext*> successors_;
// Other places on the page (or CSS) that should be rewritten the same
// way 'this' is (e.g. because they refer to the same URL, filter and
// settings).
std::vector<RewriteContext*> repeated_;
// Track the number of nested contexts that must be completed before
// this one can be marked complete. Nested contexts are typically
// added during the Rewrite() phase.
int num_pending_nested_;
// The nested RewriteContexts themselves, stored as raw pointers.
// NOTE(review): presumably these are the contexts whose parent_ is this
// context; where they are deleted is not visible from this header -- confirm.
std::vector<RewriteContext*> nested_;
// If this context is nested, the parent is the context that 'owns' it.
RewriteContext* parent_;
// If this context was initiated from a RewriteDriver, either due to
// a Resource Fetch or an HTML Rewrite, then we keep track of the
// RewriteDriver, and notify it when the RewriteContext is complete.
// That way it can stay around and 'own' all the resources associated
// with all the resources it spawns, directly or indirectly.
//
// Nested RewriteContexts obtain their driver from their parent, but
// store it here to permit Driver() to be a simple getter.
RewriteDriver* driver_;
// Track the number of RewriteContexts that must be run before this one.
int num_predecessors_;
// If true, this context's execution must follow some other context's
// completion (which may have occurred already).
bool chained_;
// TODO(jmarantz): Refactor to replace a bunch of bool member variables with
// an explicit state_ member variable, with a set of possibilities that
// look something like this:
//
// enum State {
// kCluster, // Inputs are being clustered into RewriteContexts.
// kLookup, // Looking up partitions & rewritten URLs in the cache.
// // - If successful, skip to Render.
// kFetch, // Waiting for URL fetches to complete.
// kPartition, // Fetches complete; ready to partition into
// // OutputResources.
// kRewrite, // Partitioning complete, ready to Rewrite.
// kHarvest, // Nested RewriteContexts complete, ready to harvest
// // results.
// kRender, // Ready to render the rewrites into the DOM.
// kComplete // Ready to delete.
// };
// True if all the rewriting is done for this context.
bool rewrite_done_;
// True if it's valid to write the partition table to the metadata cache.
// We would *not* want to do that if one of the Rewrites completed
// with status kTooBusy or if we've just read these very partitions from
// the metadata cache.
//
// Because both failure (kTooBusy) and success (we just read this from cache)
// lead to ok_to_write_output_partitions_ being turned off, this is not copied
// from nested rewrite contexts. In the success case we want the parent to
// write iff it has made changes, which is what it will do if we copy nothing;
// in the failure case we also set was_too_busy_, which does get copied to the
// parent.
bool ok_to_write_output_partitions_;
// True if the rewrite was incomplete due to heavy load; if this is true
// ok_to_write_output_partitions_ must be false. This is copied from nested
// rewrite contexts because if one rewrite fails none should be saved.
bool was_too_busy_;
// We mark a job as "slow" when we cannot render it entirely from the
// metadata cache (including rendering its predecessors). We only do this
// for top-level jobs.
bool slow_;
// Starts at true, set to false if any content-change checks failed.
bool revalidate_ok_;
// Indicates that the context should call driver()->FetchComplete() once the
// fetch is done.
bool notify_driver_on_fetch_done_;
// Indicates whether we want to force a rewrite. If true, we skip reading
// from the metadata cache.
bool force_rewrite_;
// Indicates that the current rewrite involves at least one resource which
// is stale.
bool stale_rewrite_;
// Indicates whether we have a metadata miss (or an unsuccessful revalidation
// attempt) on the html path.
bool is_metadata_cache_miss_;
// If set to true, we'll try to rewrite un-cacheable resources.
// The flag is expected to be set to true only from IPRO context.
bool rewrite_uncacheable_;
// An optional request trace associated with this context. May be NULL.
// Always owned externally.
RequestTrace* dependent_request_trace_;
// Set true if this rewrite context should be blocked from distributing its
// rewrite.
bool block_distribute_rewrite_;
// Stores the resulting headers and content of a distributed rewrite.
scoped_ptr<DistributedRewriteFetch> distributed_fetch_;
// Map used to de-dup the partitions' other-dependency field.
StringIntMap other_dependency_map_;
// Statistics variables. The pointers themselves are const, so each must be
// bound in the constructor's initializer list and cannot be reseated later.
// NOTE(review): judging by the names these count rewrites abandoned due to
// lock contention and distributed-rewrite failures, successes, and metadata
// failures respectively -- confirm where they are incremented.
Variable* const num_rewrites_abandoned_for_lock_contention_;
Variable* const num_distributed_rewrite_failures_;
Variable* const num_distributed_rewrite_successes_;
Variable* const num_distributed_metadata_failures_;
DISALLOW_COPY_AND_ASSIGN(RewriteContext);
};
} // namespace net_instaweb
#endif // NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_