| /* |
| * Copyright 2011 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmarantz@google.com (Joshua Marantz) |
| |
| #ifndef NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_ |
| #define NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_ |
| |
| #include <set> |
| #include <vector> |
| |
| #include "net/instaweb/http/public/http_cache.h" |
| #include "net/instaweb/rewriter/cached_result.pb.h" |
| #include "net/instaweb/rewriter/public/output_resource_kind.h" |
| #include "net/instaweb/rewriter/public/resource.h" |
| #include "net/instaweb/rewriter/public/resource_slot.h" |
| #include "net/instaweb/rewriter/public/rewrite_result.h" |
| #include "net/instaweb/rewriter/public/server_context.h" |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/scoped_ptr.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/http/google_url.h" |
| #include "pagespeed/kernel/http/response_headers.h" |
| #include "pagespeed/kernel/util/url_segment_encoder.h" |
| |
| namespace net_instaweb { |
| |
| class AsyncFetch; |
| class MessageHandler; |
| class NamedLock; |
| class RequestTrace; |
| class RewriteDriver; |
| class RewriteOptions; |
| class Statistics; |
| class Variable; |
| class FreshenMetadataUpdateManager; |
| |
| // RewriteContext manages asynchronous rewriting of some n >= 1 resources (think |
| // CSS, JS, or images) into m >= 0 improved versions (typically, n = m = 1). |
| // It also helps update the references in the containing document (called |
| // slots), such as <img src=> in HTML, or background-image: url() in CSS, |
| // and make any other changes to it needed to commit the optimization. |
| // |
| // It is normally used as a base class, with its own code helping take care |
| // of caching, fetching, etc., while subclasses describe how to transform the |
| // resources, and how to update the document containing them with the new |
| // version by overriding some virtuals like Rewrite() and Render(). |
| // |
| // Filters parsing HTML create their RewriteContext subclasses for every |
| // group of resources they think should be optimized together (such as one |
| // RewriteContext for every image for image re-compression, or one for a group |
| // of CSS files that have compatible HTML markup for CSS combining). The |
| // framework may also ask a filter to make its RewriteContext subclass via |
| // MakeRewriteContext() in case it needs to reconstruct an optimized resource |
| // that's not available in the cache. |
| // |
| // In the case of combining filters, a single RewriteContext may |
| // result in multiple rewritten resources that are partitioned based |
| // on data semantics. Most filters will just work on one resource, |
| // and those can inherit from SingleRewriteContext which is simpler |
| // to implement. |
| // |
| // The most basic transformation steps subclasses will want to implement are: |
| // |
| // Partition: |
| // Determines how many outputs, if any, will be created from all the inputs. |
| // For example, a spriter may create separate partitions for groups of images |
| // with similar colormaps. This step is also responsible for deciding what to |
| // do if some inputs were not loaded successfully. SingleRewriteContext |
| // provides the correct implementation for transformations that take in one |
| // file and optimize it. |
| // |
| // Rewrite: |
| // Takes inputs from one partition, and tries to produce an optimized output |
| // for it, as well as a CachedResult, which caches any auxiliary information |
| // that may be needed to update the container document. For example, the image |
| // filter will store image dimensions inside the CachedResult object. |
| // |
| // If a better version can be created, the subclass should call |
| // RewriteDriver::Write with its data, and then RewriteDone(kRewriteOk). |
| // |
| // If no improvement is possible, it should call RewriteDone(kRewriteFailed). |
| // Note that this does not mean that nothing can be done, just that no new |
| // resource has been created (for example an image filter might still insert |
| // dimensions into the <img> tag even if it can't compress the image better). |
| // |
| // Render: |
| // Updates the document based on information stored in CachedResult. |
| // This is the only step that can touch the HTML DOM. Note that you |
| // do not need to implement it if you just want to update the URL to the new |
| // version: the ResourceSlots will do it automatically. |
| // |
| // Which of the steps get invoked depends on how much information has been |
| // cached, as well as on timing of things (since the system tries not to |
| // hold up the web page noticeably to wait for an optimization). Common |
| // scenarios are: |
| // |
| // 1) New rewrite, finishes quickly: |
| // Partition -> Rewrite -> Render |
| // 2) New rewrite, but too slow to render: |
| // Partition -> Rewrite |
| // 3) Metadata cache hit: |
| // Render |
| // 4) Reconstructing output from a .pagespeed. URL: |
| // Rewrite |
| // |
| // Note in particular that (3) means that all rendering should be doable just |
| // from information inside the CachedResult. |
| // |
| // Top-level RewriteContexts are initialized from the HTML thread, by filters |
| // responding to parser events. In particular, from this thread they can be |
| // constructed, and AddSlot() and Initiate() can be called. Once Initiate is |
| // called, the RewriteContext runs purely in its two threads, until |
| // it completes. At that time it will self-delete in coordination with |
| // RewriteDriver. |
| // |
| // RewriteContexts can also be nested, in which case they are constructed, |
| // slotted, and Initiated all within the rewrite threads. However, they |
| // are Propagated and destructed by their parent, which was initiated by the |
| // RewriteDriver. |
| // |
| // RewriteContext utilizes two threads (via QueuedWorkerPool::Sequence) |
| // to do most of its work. The "high priority" thread is used to run the |
| // dataflow graph: queue up fetches and cache requests, partition inputs, |
| // render results, etc. The actual Rewrite() methods, however, are invoked |
| // in the "low priority" thread and can be canceled during extreme load |
| // or shutdown. |
| // |
| // TODO(jmarantz): add support for controlling TTL on failures. |
| class RewriteContext { |
| public: |
| typedef std::vector<InputInfo*> InputInfoStarVector; |
| static const char kNumRewritesAbandonedForLockContention[]; |
| static const char kNumDeadlineAlarmInvocations[]; |
| static const char kNumDistributedRewriteSuccesses[]; |
| static const char kNumDistributedRewriteFailures[]; |
| static const char kNumDistributedMetadataFailures[]; |
| // The extension used for all distributed fetch URLs. |
| static const char kDistributedExt[]; |
| // The hash value used for all distributed fetch URLs. |
| static const char kDistributedHash[]; |
| static const char kHashMismatchMessage[]; |
| |
| // Carries the outcome of a metadata cache lookup. The recipient must take |
| // ownership. Every flag starts out false, and partitions starts out holding a newly allocated OutputPartitions. |
| struct CacheLookupResult { |
| CacheLookupResult() |
| : cache_ok(false), |
| can_revalidate(false), |
| useable_cache_content(false), |
| is_stale_rewrite(false), |
| partitions(new OutputPartitions) {} |
| |
| bool cache_ok; |
| bool can_revalidate; |
| bool useable_cache_content; |
| bool is_stale_rewrite; |
| InputInfoStarVector revalidate; |
| scoped_ptr<OutputPartitions> partitions; |
| }; |
| |
| // Callback interface used for LookupMetadataForOutputResource: implementations override Done(), which is handed the cache key and the lookup result. NOTE(review): presumably Done() must take ownership of result -- confirm. |
| class CacheLookupResultCallback { |
| public: |
| CacheLookupResultCallback() {} |
| virtual ~CacheLookupResultCallback(); |
| virtual void Done(const GoogleString& cache_key, |
| CacheLookupResult* result) = 0; |
| private: |
| DISALLOW_COPY_AND_ASSIGN(CacheLookupResultCallback); |
| }; |
| |
| // Takes ownership of resource_context, which must be NULL or |
| // allocated with 'new'. |
| RewriteContext(RewriteDriver* driver, // exactly one of driver & parent |
| RewriteContext* parent, // is non-null |
| ResourceContext* resource_context); |
| virtual ~RewriteContext(); |
| |
| // Indexed access to the slots attached to this context. Not thread-safe: |
| // before Initiate() only the constructing thread may call these, and |
| // after Initiate() only the Rewrite thread may. |
| int num_slots() const { |
| return static_cast<int>(slots_.size()); |
| } |
| ResourceSlotPtr slot(int index) const { |
| return slots_[index]; |
| } |
| |
| // Indexed access to the output resources. Only the Rewrite thread |
| // should call these. |
| int num_outputs() const { |
| return static_cast<int>(outputs_.size()); |
| } |
| OutputResourcePtr output(int i) const { |
| return outputs_[i]; |
| } |
| |
| // These are generally accessed in the Rewrite thread, |
| // but may also be accessed in ::Render. |
| int num_output_partitions() const; |
| const CachedResult* output_partition(int i) const; |
| CachedResult* output_partition(int i); |
| |
| // Returns true if this context is chained to some predecessors, and |
| // must therefore be started by a predecessor and not RewriteDriver. |
| bool chained() const { return chained_; } |
| |
| // Resource slots must be added to a Rewrite before Initiate() can |
| // be called. Starting the rewrite sets in motion a sequence |
| // of async cache-lookups &/or fetches. |
| void AddSlot(const ResourceSlotPtr& slot); |
| |
| // Remove the last slot from the context's slot list. This |
| // context must be the last one attached to the slot. |
| void RemoveLastSlot(); |
| |
| // Adds a new nested RewriteContext. This RewriteContext will not |
| // be considered complete until all nested contexts have completed. |
| // This may be useful, for example for a CSS optimizer that also wants to |
| // optimize images referred to from CSS (in which case the image rewrite |
| // context will be nested inside the CSS context). |
| void AddNestedContext(RewriteContext* context); |
| |
| void CallFetchInputs(); |
| void CallLockFailed(); |
| void CallStartFetchImpl(); |
| |
| // Starts a resource rewrite. Once Initiated, the Rewrite object |
| // should only be accessed from the Rewrite thread, until it |
| // Completes, at which point top-level Contexts will call |
| // RewriteComplete on their driver, and nested Contexts will call |
| // NestedRewriteComplete on their parent. Nested rewrites will be |
| // Started directly from their parent context, and Initiate will not |
| // be called. |
| // |
| // Precondition: this rewrite isn't anyone's successor (i.e. chained() == false) |
| // and has not been started before. |
| void Initiate(); |
| |
| // Fetch the specified output resource by reconstructing it from |
| // its inputs, sending output into fetch. |
| // |
| // True is returned if an asynchronous fetch got queued up. |
| // If false, fetch->Done() will not be called. |
| bool Fetch(const OutputResourcePtr& output_resource, |
| AsyncFetch* fetch, |
| MessageHandler* message_handler); |
| |
| // If true, we have determined that this job can't be rendered just |
| // from metadata cache (including all prerequisites). |
| bool slow() const { return slow_; } |
| |
| // This particular rewrite was a metadata cache miss. |
| bool is_metadata_cache_miss() const { return is_metadata_cache_miss_; } |
| |
| // Reports whether this rewriter is nested, i.e. has a non-NULL parent. |
| bool has_parent() const { |
| const bool is_nested = (parent_ != NULL); |
| return is_nested; |
| } |
| |
| // Returns true if this is a child rewriter and its parent has the given |
| // id. |
| bool IsNestedIn(StringPiece id) const; |
| |
| // Allows a nested rewriter to walk up its parent hierarchy. |
| RewriteContext* parent() { return parent_; } |
| const RewriteContext* parent() const { return parent_; } |
| |
| // Count of, and indexed access to, the nested rewrite contexts. |
| int num_nested() const { |
| return static_cast<int>(nested_.size()); |
| } |
| RewriteContext* nested(int i) const { |
| return nested_[i]; |
| } |
| |
| RewriteDriver* Driver() const { |
| return driver_; |
| } |
| |
| // If called with true, forces a rewrite and re-generates the output. |
| void set_force_rewrite(bool x) { force_rewrite_ = x; } |
| |
| bool rewrite_uncacheable() const { return rewrite_uncacheable_; } |
| void set_rewrite_uncacheable(bool rewrite_uncacheable) { |
| rewrite_uncacheable_ = rewrite_uncacheable; |
| } |
| |
| const ResourceContext* resource_context() const { |
| return resource_context_.get(); |
| } |
| |
| // Returns debug information about this RewriteContext. |
| GoogleString ToString() const; |
| GoogleString ToStringWithPrefix(StringPiece prefix) const; |
| |
| // Initializes statistics. |
| static void InitStats(Statistics* stats); |
| |
| protected: |
| typedef std::vector<GoogleUrl*> GoogleUrlStarVector; |
| |
| // ----------------------------------------------------------------------- |
| // Resource transformation APIs. If you are implementing an optimization, |
| // you'll be dealing mainly with these. |
| // ----------------------------------------------------------------------- |
| |
| // Finds the ServerContext associated with this context. Note that |
| // this method might have to climb up the parent-tree, but it's typically |
| // not a deep tree. Same with Driver() and Options(). |
| ServerContext* FindServerContext() const; |
| const RewriteOptions* Options() const; |
| |
| OutputPartitions* partitions() { return partitions_.get(); } |
| |
| // Add a dummy other_dependency that will force the rewrite's OutputPartitions |
| // to be rechecked after a modest TTL. |
| void AddRecheckDependency(); |
| |
| // If this returns true, running the rewriter isn't required for |
| // correctness of the page, so the engine will be permitted to drop |
| // the rewrite if needed to preserve system responsiveness. |
| virtual bool OptimizationOnly() const { return true; } |
| |
| // Partitions the input resources into one or more outputs. Return |
| // 'true' if the partitioning could complete (whether a rewrite was |
| // found or not), false if the attempt was abandoned and no |
| // conclusion can be drawn. |
| // |
| // Note that if partitioner finds that the resources are not |
| // rewritable, it will still return true; it will simply have |
| // an empty inputs-array in OutputPartitions and leave |
| // 'outputs' unmodified. 'false' is only returned if the subclass |
| // skipped the rewrite attempt due to a lock conflict. |
| // |
| // You must override one of Partition() or PartitionAsync(). Partition() |
| // is normally fine unless you need to do computations that can take a |
| // noticeable amount of time, since there are some scenarios under which |
| // page output may end up being held up for a partitioning step. If you |
| // do need to do something computationally expensive in partitioning steps, |
| // override PartitionAsync() instead. |
| virtual bool Partition(OutputPartitions* partitions, |
| OutputResourceVector* outputs); |
| |
| // As above, but you report the result asynchronously by calling |
| // PartitionDone(), which must be done from the main rewrite |
| // sequence. One of Partition or PartitionAsync() must be overridden in |
| // the subclass. The default implementation is implemented in terms of |
| // Partition(). |
| virtual void PartitionAsync(OutputPartitions* partitions, |
| OutputResourceVector* outputs); |
| |
| // Call this from the main rewrite sequence to report results of |
| // PartitionAsync. If the client is not in the main rewrite sequence, |
| // use CrossThreadPartitionDone() instead. |
| void PartitionDone(RewriteResult result); |
| |
| // Helper for queuing invocation of PartitionDone to run in the |
| // main rewrite sequence. |
| void CrossThreadPartitionDone(RewriteResult result); |
| |
| // Takes a completed rewrite partition and rewrites it. When |
| // complete, implementations should call RewriteDone(kRewriteOk) if |
| // they successfully created an output resource using RewriteDriver::Write, |
| // and RewriteDone(kRewriteFailed) if they didn't. They may also call |
| // RewriteDone(kTooBusy) in case system load/resource usage makes it |
| // dangerous for the filter to do optimization at this time. |
| // |
| // Any information about the inputs or output that may be needed to update |
| // the containing document should be stored inside the CachedResult. |
| // |
| // If implementors wish to rewrite resources referred to from within the |
| // inputs (e.g. images in CSS), they may create nested rewrite contexts |
| // and call AddNestedContext() on each, and then StartNestedTasks() |
| // when all have been added. |
| // |
| // TODO(jmarantz): check for resource completion from a different |
| // thread (while we were waiting for resource fetches) when Rewrite |
| // gets called. |
| virtual void Rewrite(int partition_index, |
| CachedResult* partition, |
| const OutputResourcePtr& output) = 0; |
| |
| // Called by subclasses when an individual rewrite partition is done; |
| // result carries the outcome and partition_index names the partition (presumably the index passed to Rewrite() -- confirm). |
| // Note that RewriteDone may 'delete this' so no further references to 'this' should follow a call to RewriteDone. |
| // This method can run in any thread. |
| void RewriteDone(RewriteResult result, int partition_index); |
| |
| // Sends a response to the client via the AsyncFetch, transforming |
| // output if needed (e.g. css absolutification) and controlling chunked |
| // encoding hints as needed. |
| // |
| // This is called in case a rewrite fails in the fetch path or a deadline |
| // is exceeded. The default implementation just writes the input. |
| // But contexts may need to specialize this to actually absolutify |
| // subresources if the fetched resource is served on a different path |
| // than the input resource. |
| virtual bool SendFallbackResponse(StringPiece output_url_base, |
| StringPiece contents, |
| AsyncFetch* async_fetch, |
| MessageHandler* handler); |
| |
| // Called on the parent to initiate all nested tasks. This is so |
| // that they can all be added before any of them are started. |
| // May be called from any thread. |
| void StartNestedTasks(); |
| |
| // Once any nested rewrites have completed, the results of these |
| // can be incorporated into the rewritten data. For contexts that |
| // do not require any nested RewriteContexts, it is OK to skip |
| // overriding this method -- the empty default implementation is fine. |
| virtual void Harvest(); |
| |
| // Performs rendering activities that span multiple HTML slots. For |
| // example, in a filter that combines N slots to 1, N-1 of the HTML |
| // elements might need to be removed. That can be performed in |
| // Render(). This method is optional; the base-class implementation |
| // is empty. |
| // |
| // Note that unlike Harvest(), this method runs in the HTML thread (for |
| // top-level rewrites), and only runs if the rewrite completes prior to |
| // the rewrite-deadline. If the rewrite does make it by the deadline, |
| // RewriteContext::Render() will be invoked regardless of whether any slots |
| // were actually optimized successfully. |
| virtual void Render(); |
| |
| // Notifies the subclass that the filter will not be able to render its |
| // output to the containing HTML document, because it wasn't ready in time. |
| // Note that neither Render() nor WillNotRender() may be called in case |
| // this rewrite got canceled due to disable_further_processing(), or in case |
| // Partition() failed. This is called from the HTML thread, but should only be |
| // used for read access, and subclass implementations are required to be |
| // reasonably quick since it's called with rewrite_mutex() held. It's called |
| // after any earlier contexts in filter order had completed their rendering, |
| // if any, but with no order guarantees with respect to other WillNotRender() |
| // invocations. |
| virtual void WillNotRender(); |
| |
| // This method is invoked (in Rewrite thread) if this context got canceled |
| // due to an earlier filter sharing a slot with it having called |
| // set_disable_further_processing. Default implementation does nothing. |
| virtual void Cancel(); |
| |
| // This final set of protected methods can be optionally overridden |
| // by subclasses. |
| |
| // All RewriteContexts define how they encode URLs and other |
| // associated information needed for a rewrite into a URL. |
| // The default implementation handles a single URL with |
| // no extra data. The RewriteContext owns the encoder. |
| // |
| // TODO(jmarantz): remove the encoder from RewriteFilter. |
| virtual const UrlSegmentEncoder* encoder() const; |
| |
| // Allows subclasses to add additional text to be appended to the |
| // metadata cache key. The default implementation returns "". |
| virtual GoogleString CacheKeySuffix() const; |
| |
| // Indicates user agent capabilities that must be stored in the cache key. |
| // |
| // Note that the context may be NULL as it may not be set before this. Since |
| // it isn't going to be modified in the method, ResourceContext is passed |
| // as a const pointer. |
| // TODO(morlovich): This seems to overlap with CacheKeySuffix. |
| virtual GoogleString UserAgentCacheKey( |
| const ResourceContext* context) const { |
| return ""; |
| } |
| |
| // Encodes User Agent into the ResourceContext. |
| // A subclass ResourceContext should normally call |
| // RewriteFilter::EncodeUserAgentIntoResourceContext if it has access to |
| // a RewriteFilter. |
| virtual void EncodeUserAgentIntoResourceContext(ResourceContext* context) {} |
| |
| // Returns the filter ID. |
| virtual const char* id() const = 0; |
| |
| // Rewrites come in three flavors, as described in output_resource_kind.h, |
| // so this method must be defined by subclasses to indicate which it is. |
| // |
| // For example, we will avoid caching output_resource content in the HTTP |
| // cache for rewrites that are so quick to complete that it's fine to |
| // do the rewrite on every request. extend_cache is obviously in |
| // this category, and it's arguable we could treat js minification |
| // that way too (though we don't at the moment). |
| virtual OutputResourceKind kind() const = 0; |
| |
| // ----------------------------------------------------------------------- |
| // Tracing API. |
| // ----------------------------------------------------------------------- |
| |
| // Creates a new request trace associated with this context with a given |
| // |label|. |
| void AttachDependentRequestTrace(const StringPiece& label); |
| |
| // Provides the dependent request trace associated with this context, if any. |
| // Note that this is distinct from the root user request trace, available |
| // in Driver(). |
| RequestTrace* dependent_request_trace() { return dependent_request_trace_; } |
| |
| // A convenience wrapper to log a trace annotation in both the request |
| // trace (if present) as well as the root user request trace (if present). |
| void TracePrintf(const char* fmt, ...); |
| |
| // ----------------------------------------------------------------------- |
| // Fetch state machine override APIs, as well as exports of some general |
| // state machine state for overriders to use. If you just want to write an |
| // optimization, you do not need these --- they are useful if you want to |
| // write a new state machine that's similar but not quite identical to |
| // what RewriteContext provides. |
| // ----------------------------------------------------------------------- |
| |
| // Called in fetch path if we have not found the resource available |
| // in HTTP cache under an alternate location suggested by metadata cache |
| // such as a different hash or the original, and thus need to fully |
| // reconstruct it. |
| // |
| // The base implementation will do an asynchronous locking attempt, |
| // scheduling to run FetchInputs when complete. Subclasses may override |
| // this method to preload inputs in a different manner, and may delay |
| // calling of base version until that is complete. |
| virtual void StartFetchReconstruction(); |
| |
| // Determines if the given rewrite should be distributed. This is based on |
| // whether distributed servers have been configured, if the current filter is |
| // configured to be distributed, where a filter is in a chain, if a |
| // distributed fetcher is in place, and if distribution has been explicitly |
| // disabled for this context. |
| bool ShouldDistributeRewrite() const; |
| |
| // Determines if this rewrite-context is acting on behalf of a distributed |
| // rewrite request from an HTML rewrite. Verifies the distributed rewrite key. |
| bool IsDistributedRewriteForHtml() const; |
| |
| // Dispatches the rewrite to another task with a distributed fetcher. Should |
| // not be called without first getting true from ShouldDistributeRewrite() as |
| // it has guards (such as checking the number of slots). |
| void DistributeRewrite(); |
| |
| // Makes the rest of a fetch run in background, not producing |
| // a result or invoking callbacks. Will arrange for appropriate |
| // memory management with the rewrite driver itself; but the caller |
| // is responsible for delivering results itself and invoking the |
| // callback. |
| void DetachFetch(); |
| |
| // Decodes the output resource to find the resources to be fetched. The |
| // default behavior decodes the output resource name into multiple paths and |
| // absolutifies them with respect to the output resource base. Returns true if |
| // the decoding is successful and false otherwise. |
| virtual bool DecodeFetchUrls(const OutputResourcePtr& output_resource, |
| MessageHandler* message_handler, |
| GoogleUrlStarVector* url_vector); |
| |
| // Adjust headers sent out for a stale or in-place result. We may send out |
| // stale results in the fallback fetch pathway, but these results should not |
| // be cached much. By default we strip Set-Cookie* headers and Etags, and |
| // convert Cache-Control headers to private, max-age=300. |
| virtual void FixFetchFallbackHeaders(const CachedResult& cached_result, |
| ResponseHeaders* headers); |
| |
| // Callback once the fetch is done. This calls Driver()->FetchComplete() if |
| // notify_driver_on_fetch_done is true. |
| virtual void FetchCallbackDone(bool success); |
| |
| // Attempts to fetch a given URL from HTTP cache, and serves it |
| // (with shortened HTTP headers) if available. If not, falls back to the normal |
| // full reconstruction path. Note that the hash can be an empty string if the |
| // url is not rewritten. |
| virtual void FetchTryFallback(const GoogleString& url, |
| const StringPiece& hash); |
| |
| // Freshens resources proactively to avoid expiration in the near future. |
| void Freshen(); |
| |
| bool notify_driver_on_fetch_done() const { |
| return notify_driver_on_fetch_done_; |
| } |
| void set_notify_driver_on_fetch_done(bool value) { |
| notify_driver_on_fetch_done_ = value; |
| } |
| |
| // Returns true if this context will prevent any attempt at distributing a |
| // rewrite (although its nested context still may be distributed). See |
| // ShouldDistributeRewrite for more detail on when a rewrite should be |
| // distributed. |
| bool block_distribute_rewrite() const { return block_distribute_rewrite_; } |
| void set_block_distribute_rewrite(const bool x) { |
| block_distribute_rewrite_ = x; |
| } |
| |
| // Note that the following must only be called in the fetch flow. |
| AsyncFetch* async_fetch(); |
| |
| // Is fetch_ detached? Only call this in the fetch flow. |
| bool FetchContextDetached(); |
| |
| // The message handler for the fetch. |
| MessageHandler* fetch_message_handler(); |
| |
| // Indicates whether we are serving a stale rewrite. |
| bool stale_rewrite() const { return stale_rewrite_; } |
| |
| // Returns an interval in milliseconds to wait when configuring the deadline |
| // alarm in FetchContext::SetupDeadlineAlarm(). Subclasses may configure the |
| // deadline based on rewrite type, e.g., IPRO vs. HTML-path. |
| virtual int64 GetRewriteDeadlineAlarmMs() const; |
| |
| // Should the context call LockForCreation before checking the cache? |
| virtual bool CreationLockBeforeStartFetch() const; |
| |
| // Should the context fail to serve the rewritten resource if the hash |
| // doesn't match the user-requested hash? |
| // By default (this base implementation returns false), we do not fail and simply serve with limited caching headers |
| // assuming that an out-of-date resource is better than none. But for |
| // resources like source maps, out-of-date versions are worse than nothing |
| // because they are complete nonsense if not associated with the exact |
| // expected contents. |
| virtual bool FailOnHashMismatch() const { return false; } |
| |
| // Backend to RewriteDriver::LookupMetadataForOutputResource, with |
| // the RewriteContext of appropriate type and the OutputResource already |
| // created. Takes ownership of rewrite_context. |
| static bool LookupMetadataForOutputResourceImpl( |
| OutputResourcePtr output_resource, |
| const GoogleUrl& gurl, |
| RewriteContext* rewrite_context, |
| RewriteDriver* driver, |
| GoogleString* error_out, |
| CacheLookupResultCallback* callback); |
| |
| private: |
| class DistributedRewriteCallback; |
| class DistributedRewriteFetch; |
| class OutputCacheCallback; |
| class WriteIfChanged; |
| class LookupMetadataForOutputResourceCallback; |
| class HTTPCacheCallback; |
| class ResourceCallbackUtils; |
| class ResourceFetchCallback; |
| class ResourceReconstructCallback; |
| class ResourceRevalidateCallback; |
| class InvokeRewriteFunction; |
| class RewriteFreshenCallback; |
| friend class RewriteDriver; |
| |
| typedef std::set<RewriteContext*> ContextSet; |
| |
| // This is passed to CanFetchFallbackToOriginal when trying to determine |
| // whether using the 0th input resource would be an acceptable substitute |
| // for output when: |
| enum FallbackCondition { |
| kFallbackDiscretional, // trying to produce result quicker to improve |
| // latency |
| kFallbackEmergency // rewrite failed and output would otherwise not |
| // be available |
| }; |
| |
| // Callback helper functions. |
| void Start(); |
| void SetPartitionKey(); |
| void StartFetch(); |
| void StartFetchImpl(); |
| void CancelFetch(); |
| void OutputCacheDone(CacheLookupResult* cache_result); |
| void OutputCacheHit(bool write_partitions); |
| void OutputCacheRevalidate(const InputInfoStarVector& to_revalidate); |
| void OutputCacheMiss(); |
| void ResourceFetchDone(bool success, ResourcePtr resource, int slot_index); |
| void ResourceRevalidateDone(InputInfo* input_info, bool success); |
| void LogMetadataCacheInfo(bool cache_ok, bool can_revalidate); |
| |
| // When a RewriteContext 'B' discovers that it's doing the exact same rewrite |
| // as a previous RewriteContext 'A', B adds itself to A->repeated_, and |
| // suspends its work, expecting 'A' to call B->RepeatedSuccess(A) or |
| // B->RepeatedFailure() to give it the result of the rewrite. |
| void RepeatedSuccess(const RewriteContext* primary); |
| void RepeatedFailure(); |
| |
| // After a Rewrite is complete, writes the metadata for the rewrite |
| // operation to the cache, and runs any further rewrites that are |
| // dependent on this one. |
| // |
| // If there are pending nested rewrites then this call has no |
| // effect. Once all the nested rewrites have been accounted for via |
| // NestedRewriteDone() (NOTE(review): the Initiate() comment calls this NestedRewriteComplete -- confirm the name) then Finalize can queue up its render and |
| // enable successor rewrites to proceed. |
| void Finalize(); |
| |
| // Returns a pointer to lock_, lazily initializing it if necessary. |
| NamedLock* Lock(); |
| |
| // Initiates an asynchronous fetch for the resource associated with |
| // each slot, calling ResourceFetchDone() on completion. |
| // |
| // To avoid concurrent fetches across multiple processes or threads, the |
| // caller must first lock each input by name, blocking or abandoning rewriting |
| // as necessary. Input fetches done on behalf of resource fetches must |
| // succeed (to avoid sending 404s to clients), so they will break locks. |
| // Input fetches done for async rewrite initiations should fail fast, to help |
| // avoid having multiple concurrent processes attempt the same rewrite. |
| void FetchInputs(); |
| |
| // Invoked when we fail to acquire the lock for the output resource. |
| void LockFailed(); |
| |
| // Callback for a distributed rewrite fetch. Queued to run in the high-priority |
| // thread. Fetch path: if the fetch succeeded then the rest of the flow is |
| // skipped and that result is used; otherwise the original resource is fetched |
| // and returned, bypassing rewriting. |
| void DistributeRewriteDone(bool success); |
| |
| // If response_headers contains metadata, strips that metadata from the |
| // headers, parses it, and writes it to cache_result. Returns true if |
| // the parse was successful and false otherwise. |
| bool ParseAndRemoveMetadataFromResponseHeaders( |
| ResponseHeaders* response_headers, CacheLookupResult* cache_result); |
| |
| // Creates an OutputResource initialized from cached_result, response_headers, |
| // and content. NOTE(review): the bool result is undocumented here. |
| bool CreateOutputResourceFromContent(const CachedResult& cached_result, |
| const ResponseHeaders& response_headers, |
| StringPiece content, |
| OutputResourcePtr* output_resource); |
| |
| // The distributed rewrite path for HTML rewrites works by converting the |
| // input URL on the ingress task into a .pagespeed. fetch that the distributed |
| // task reconstructs using the corresponding filter id. This function maps |
| // the given input resource URL into a .pagespeed. URL for reconstruction. It |
| // uses a hash value of 0 and an extension of "distributed". Returns an empty |
| // string if the URL could not be constructed (e.g., it was too long). |
| // |
| // Ex. input:  http://www.example.com/a.png with an image compression context |
| //     output: http://www.example.com/50x50xa.png.pagespeed.ic.0.distributed |
| GoogleString DistributedFetchUrl(StringPiece url); |
| |
| // Reports whether this rewrite context was created to fetch a resource (e.g., |
| // IPRO or .pagespeed. URLs); false means it was not. |
| bool IsFetchRewrite() const { |
|   if (fetch_.get() == NULL) { |
|     return false; |
|   } |
|   return true; |
| } |
| |
| // Called on the parent by a nested Rewrite when that nested Rewrite is |
| // complete. Note that we don't track rewrite success/failure here; we only |
| // care whether the nested rewrites are complete, and whether there |
| // are any dependencies. |
| void NestedRewriteDone(const RewriteContext* context); |
| |
| // Generally a RewriteContext is waiting for one or more |
| // asynchronous events to take place. Activate() is called |
| // to run some action that helps us advance to the next state. |
| void Activate(); |
| |
| // Runs after all Rewrites have been completed, and all nested |
| // RewriteContexts have completed and been harvested. |
| // |
| // For top-level Rewrites, this must be called from the HTML thread. |
| // For nested Rewrites it runs from the Rewrite thread. |
| // |
| // If render_slots is true, then all the slots owned by this context |
| // will have Render() called on them. For top-level Rewrites, this |
| // should only be done if the rewrite completes before the rewrite |
| // deadline expires. After that, the HTML elements referred to by |
| // the slots have already been flushed to the network. For nested |
| // Rewrites it's done unconditionally. |
| // |
| // Rewriting and propagation continue even after this deadline, so |
| // that we may cache the rewritten results, making the deadline |
| // easier to hit the next time the same resources need to be |
| // rewritten. |
| // |
| // In all cases, the successor Rewrites are queued up in the |
| // Rewrite thread once any nested propagation is complete. In |
| // particular, each slot must be updated with any rewritten |
| // resources before the successors can be run, independent of |
| // whether the slots can be rendered into HTML. |
| void Propagate(bool render_slots); |
| |
| // With all resources loaded, the rewrite can now be done, writing: |
| //   - the metadata into the cache, |
| //   - the output resource into the cache, and, |
| //   - if the driver has not been detached, |
| //     the url+data->rewritten_resource into the rewrite |
| //     driver's map, for each of the URLs. |
| void StartRewriteForHtml(); |
| void StartRewriteForFetch(); |
| |
| // Reports whether this context is in a state where it is ready to |
| // rewrite. All of the following must hold: |
| //   - no preceding RewriteContexts are in progress, |
| //   - no cache lookups are outstanding, |
| //   - no fetches are outstanding, |
| //   - rewriting is not already complete. |
| bool ReadyToRewrite() const; |
| |
| // Removes this RewriteContext from all slots. This is normally done when |
| // a RewriteContext is completed and we are ready to run the successors. |
| // It is also done when aborting a RewriteContext because the cache is |
| // unhealthy. |
| void DetachSlots(); |
| |
| // Activates any Rewrites that come after this one, for serializability |
| // of access to common slots. |
| void RunSuccessors(); |
| |
| // Writes out the partition table into the metadata cache (checking |
| // ok_to_write_output_partitions_). |
| void WritePartition(); |
| |
| // Does all the bookkeeping needed after a rewrite in HTML completes: |
| // writes out cache data, notifies any repeated rewrites, queues up |
| // successors, cleans things up, etc. |
| // |
| // This method may call 'delete this', so it should be the last call at its |
| // call-site. |
| // |
| // It will *not* call 'delete this' if there is a live RewriteDriver |
| // waiting for a convenient point to render the rewrites into HTML. |
| void FinalizeRewriteForHtml(); |
| |
| // Arranges for commit of all the state (if permit_render is true), and |
| // notification of parents, rewrite driver, etc., as well as running of |
| // successors if applicable. This is the tail portion of |
| // FinalizeRewriteForHtml() that must be called even if we didn't |
| // actually get as far as computing a partition_key_. |
| void RetireRewriteForHtml(bool permit_render); |
| |
| // Marks this job and any dependents as slow where appropriate, notifying the |
| // RewriteDriver of any changes. |
| void MarkSlow(); |
| |
| // Records that parts of this rewrite were dropped due to system load, so |
| // the result should not be cached. |
| void MarkTooBusy(); |
| |
| // Collects all non-nested contexts that depend on this one (including |
| // this context itself). Note that this might exclude some repeated jobs |
| // that haven't gotten far enough to realize that yet. |
| void CollectDependentTopLevel(ContextSet* contexts); |
| |
| // Actual implementation of RewriteDone() that's queued to run in the |
| // high-priority rewrite thread. |
| void RewriteDoneImpl(RewriteResult result, int partition_index); |
| |
| // Actual implementation of StartNestedTasks() that's queued to run in the |
| // high-priority rewrite thread. |
| void StartNestedTasksImpl(); |
| |
| // Establishes that a slot has been rewritten, so that when Propagate() |
| // is called, the resource update that has been written to this slot can |
| // be propagated to the DOM. |
| void RenderPartitionOnDetach(int partition_index); |
| |
| // Sets up all the state needed for Fetch, but neither registers this context |
| // nor starts the rewrite process. NOTE(review): bool result is undocumented. |
| bool PrepareFetch( |
| const OutputResourcePtr& output_resource, |
| AsyncFetch* fetch, |
| MessageHandler* message_handler); |
| |
| // Creates an output resource that corresponds to a full URL stored in |
| // the metadata cache. |
| bool CreateOutputResourceForCachedOutput(const CachedResult* cached_result, |
| OutputResourcePtr* output_resource); |
| |
| // Callback for the metadata lookup on the fetch path. |
| void FetchCacheDone(CacheLookupResult* cache_result); |
| |
| // Callback for the HTTP lookup on the fetch path, used when the metadata cache |
| // suggests we should try serving either a different path or the original. |
| void FetchFallbackCacheDone(HTTPCache::FindResult result, |
| HTTPCache::Callback* data); |
| |
| // Returns true if we can attempt to serve the original file for a fetch |
| // request, either because something went wrong with rewriting |
| // (circumstance == kFallbackEmergency) or because the system thinks that |
| // would avoid a latency spike or overload (kFallbackDiscretional). |
| bool CanFetchFallbackToOriginal(FallbackCondition circumstance) const; |
| |
| // Checks whether an "other dependency" input info with the same data |
| // already exists in the partition. Used to de-dup that field. |
| bool HasDuplicateOtherDependency(const InputInfo& input); |
| |
| // Checks whether there is a duplicate and, if there is none, adds input to the |
| // other dependencies. Updates the internal other_dependency_map_ that is used |
| // to de-dup the contents. |
| void CheckAndAddOtherDependency(const InputInfo& input); |
| |
| // Performs checks and freshens the input resource. Also updates metadata if |
| // required. |
| void CheckAndFreshenResource(const InputInfo& input_info, |
| ResourcePtr resource, int partition_index, |
| int input_index, |
| FreshenMetadataUpdateManager* freshen_manager); |
| ResourcePtr CreateUrlResource(const StringPiece& input_url); |
| |
| // The input slots; to perform a rewrite we need data for all of them. |
| ResourceSlotVector slots_; |
| |
| // Not all of the slots require rendering from this RewriteContext. If an |
| // optimization was deemed non-beneficial then we skip rendering the slot. |
| // So we record which slots require rendering in a bit vector. |
| std::vector<bool> render_slots_; |
| |
| // It's feasible that callbacks for different resources will be delivered |
| // on different threads, thus we must protect these counters with a mutex |
| // or make them atomic integers. NOTE(review): declared as plain int below. |
| // |
| // TODO(jmarantz): keep the outstanding fetches as a set so they can be |
| // terminated cleanly and immediately, allowing fast process shutdown. |
| // For example, if Apache notifies our process that it's being shut down |
| // then we should have a mechanism to cancel all pending fetches. This |
| // would require a new cancellation interface from both CacheInterface and |
| // UrlAsyncFetcher. |
| |
| bool started_; |
| scoped_ptr<OutputPartitions> partitions_; |
| OutputResourceVector outputs_; |
| int outstanding_fetches_; |
| int outstanding_rewrites_; |
| scoped_ptr<ResourceContext> resource_context_; |
| GoogleString partition_key_; |
| |
| UrlSegmentEncoder default_encoder_; |
| |
| // Lock guarding output partitioning and rewriting. Lazily initialized by |
| // Lock(); unlocked on destruction or at the end of Finish(). |
| scoped_ptr<NamedLock> lock_; |
| |
| // When this rewrite object is created on behalf of a fetch, we must keep the |
| // response_writer, request_headers, and callback in the FetchContext so they |
| // can be used once the inputs are available. See IsFetchRewrite(). |
| class FetchContext; |
| scoped_ptr<FetchContext> fetch_; |
| |
| // Tracks the RewriteContexts that must be run after this one because they |
| // share a slot. |
| std::vector<RewriteContext*> successors_; |
| |
| // Contexts for other places on the page (or CSS) that should be rewritten |
| // the same way 'this' is (e.g. because they refer to the same URL, filter |
| // and settings). See RepeatedSuccess() and RepeatedFailure(). |
| std::vector<RewriteContext*> repeated_; |
| |
| // Tracks the number of nested contexts that must be completed before |
| // this one can be marked complete. Nested contexts are typically |
| // added during the Rewrite() phase. |
| int num_pending_nested_; |
| std::vector<RewriteContext*> nested_; |
| |
| // If this context is nested, parent_ is the context that 'owns' it. |
| RewriteContext* parent_; |
| |
| // If this context was initiated from a RewriteDriver, either due to |
| // a Resource Fetch or an HTML Rewrite, then we keep track of the |
| // RewriteDriver, and notify it when the RewriteContext is complete. |
| // That way it can stay around and 'own' all the resources associated |
| // with all the resources it spawns, directly or indirectly. |
| // |
| // Nested RewriteContexts obtain their driver from their parent, but |
| // store it here to permit Driver() to be a simple getter. |
| RewriteDriver* driver_; |
| |
| // Tracks the number of RewriteContexts that must be run before this one. |
| int num_predecessors_; |
| |
| // If true, this context's execution must follow some other context's |
| // completion (which may already have occurred). |
| bool chained_; |
| |
| // TODO(jmarantz): Refactor to replace a bunch of bool member variables with |
| // an explicit state_ member variable, with a set of possibilities that |
| // look something like this: |
| // |
| // enum State { |
| // kCluster, // Inputs are being clustered into RewriteContexts. |
| // kLookup, // Looking up partitions & rewritten URLs in the cache. |
| // // - If successful, skip to Render. |
| // kFetch, // Waiting for URL fetches to complete. |
| // kPartition, // Fetches complete; ready to partition into |
| // // OutputResources. |
| // kRewrite, // Partitioning complete, ready to Rewrite. |
| // kHarvest, // Nested RewriteContexts complete, ready to harvest |
| // // results. |
| // kRender, // Ready to render the rewrites into the DOM. |
| // kComplete // Ready to delete. |
| // }; |
| |
| // True if all the rewriting is done for this context. |
| bool rewrite_done_; |
| |
| // True if it's valid to write the partition table to the metadata cache. |
| // We would *not* want to do that if one of the Rewrites completed |
| // with status kTooBusy, or if we've just read these very partitions from |
| // the metadata cache. |
| // |
| // Because both failure (kTooBusy) and success (we just read this from cache) |
| // lead to ok_to_write_output_partitions_ being turned off, this is not copied |
| // from nested rewrite contexts. In the success case we want the parent to |
| // write iff it has made changes, which is what it will do if we copy nothing; |
| // in the failure case we also set was_too_busy_, which does get copied to the |
| // parent. |
| bool ok_to_write_output_partitions_; |
| |
| // True if the rewrite was incomplete due to heavy load; if this is true, |
| // ok_to_write_output_partitions_ must be false. This is copied from nested |
| // rewrite contexts because if one rewrite fails, none should be saved. |
| bool was_too_busy_; |
| |
| // We mark a job as "slow" when we cannot render it entirely from the |
| // metadata cache (including rendering its predecessors). We only do this |
| // for top-level jobs. |
| bool slow_; |
| |
| // Starts at true; set to false if any content-change checks failed. |
| bool revalidate_ok_; |
| |
| // Indicates that the context should call driver()->FetchComplete() once the |
| // fetch is done. |
| bool notify_driver_on_fetch_done_; |
| |
| // Indicates whether we want to force a rewrite. If true, we skip reading |
| // from the metadata cache. |
| bool force_rewrite_; |
| |
| // Indicates that the current rewrite involves at least one resource that |
| // is stale. |
| bool stale_rewrite_; |
| |
| // Indicates whether we had a metadata miss (or an unsuccessful revalidation |
| // attempt) on the HTML path. |
| bool is_metadata_cache_miss_; |
| |
| // If set to true, we'll try to rewrite uncacheable resources. |
| // The flag is expected to be set to true only from an IPRO context. |
| bool rewrite_uncacheable_; |
| |
| // An optional request trace associated with this context. May be NULL. |
| // Always owned externally. |
| RequestTrace* dependent_request_trace_; |
| |
| // Set to true if this rewrite context should be blocked from distributing |
| // its rewrite. |
| bool block_distribute_rewrite_; |
| |
| // Stores the resulting headers and content of a distributed rewrite. |
| scoped_ptr<DistributedRewriteFetch> distributed_fetch_; |
| |
| // Map used to de-dup the partitions' "other dependency" field. |
| StringIntMap other_dependency_map_; |
| |
| Variable* const num_rewrites_abandoned_for_lock_contention_; |
| Variable* const num_distributed_rewrite_failures_; |
| Variable* const num_distributed_rewrite_successes_; |
| Variable* const num_distributed_metadata_failures_; |
| DISALLOW_COPY_AND_ASSIGN(RewriteContext); |
| }; |
| |
| } // namespace net_instaweb |
| |
| #endif // NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_CONTEXT_H_ |