// blob: c89958b92d39837aa36dc37bebfaafef5c10b724 [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
//
// NOTE: This interface is actively under development and may be
// changed extensively. Contact us at mod-pagespeed-discuss@googlegroups.com
// if you are interested in using it.
#ifndef PAGESPEED_AUTOMATIC_PROXY_FETCH_H_
#define PAGESPEED_AUTOMATIC_PROXY_FETCH_H_
#include <map>
#include <set>
#include <vector>
#include "net/instaweb/http/public/async_fetch.h"
#include "net/instaweb/http/public/request_context.h"
#include "net/instaweb/util/public/fallback_property_page.h"
#include "net/instaweb/util/public/property_cache.h"
#include "pagespeed/automatic/html_detector.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/gtest_prod.h"
#include "pagespeed/kernel/base/scoped_ptr.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/http_names.h"
#include "pagespeed/kernel/http/user_agent_matcher.h"
#include "pagespeed/kernel/thread/queued_worker_pool.h"
namespace net_instaweb {
// Forward declarations (alphabetical) for types that this header only uses
// through pointers, references, or scoped_ptr members.
class AbstractMutex;
class CacheUrlAsyncFetcher;
class Function;
class GoogleUrl;
class MessageHandler;
class ProxyFetch;
class ProxyFetchPropertyCallbackCollector;
class QueuedAlarm;
class ResponseHeaders;
class RewriteDriver;
class RewriteOptions;
class ServerContext;
class Timer;
// Factory for creating and starting ProxyFetches. Must outlive all
// ProxyFetches it creates.
class ProxyFetchFactory {
 public:
  explicit ProxyFetchFactory(ServerContext* server_context);
  ~ProxyFetchFactory();

  // Convenience method that calls CreateNewProxyFetch() and then StartFetch()
  // on the resulting fetch.
  void StartNewProxyFetch(
      const GoogleString& url,
      AsyncFetch* async_fetch,
      RewriteDriver* driver,
      ProxyFetchPropertyCallbackCollector* property_callback,
      AsyncFetch* original_content_fetch);

  // Creates a new proxy fetch and passes it to the fetcher to start it.  If the
  // UrlNamer doesn't authorize this url it calls CleanUp() on the driver,
  // Detach() on the property callback, Done() on the async_fetch and
  // original_content_fetch, and returns NULL.
  //
  // If you're using a fetcher for the original request content you should use
  // StartNewProxyFetch() instead.  CreateNewProxyFetch() is for callers who
  // will not be calling StartFetch() and instead will call HeadersComplete(),
  // Write(), Flush(), and Done() as they get data in from another source.
  ProxyFetch* CreateNewProxyFetch(
      const GoogleString& url,
      AsyncFetch* async_fetch,
      RewriteDriver* driver,
      ProxyFetchPropertyCallbackCollector* property_callback,
      AsyncFetch* original_content_fetch);

  // Initiates the PropertyCache lookup and returns the collector tracking it.
  // See ngx_pagespeed.cc or proxy_interface.cc for example usage.
  //
  // By-value parameters are declared without top-level const: it has no
  // effect on callers, and the out-of-line definition may still qualify its
  // own parameter as const without changing the function's signature.
  static ProxyFetchPropertyCallbackCollector* InitiatePropertyCacheLookup(
      bool is_resource_fetch,
      const GoogleUrl& request_url,
      ServerContext* server_context,
      RewriteOptions* options,
      AsyncFetch* async_fetch,
      bool requires_blink_cohort);

  // Returns the message handler held by this factory.
  MessageHandler* message_handler() const { return handler_; }

 private:
  // ProxyFetch calls the private Register*Fetch() bookkeeping methods below.
  friend class ProxyFetch;

  // Helps track the status of in-flight ProxyFetches.  These are intended for
  // use only by ProxyFetch.
  //
  // TODO(jmarantz): Enumerate outstanding fetches in server status page.
  void RegisterNewFetch(ProxyFetch* proxy_fetch);
  void RegisterFinishedFetch(ProxyFetch* proxy_fetch);

  ServerContext* server_context_;
  Timer* timer_;
  MessageHandler* handler_;

  // Set of fetches registered via RegisterNewFetch() and not yet finished.
  // NOTE(review): presumably outstanding_proxy_fetches_mutex_ guards this set;
  // confirm against the .cc file.
  scoped_ptr<AbstractMutex> outstanding_proxy_fetches_mutex_;
  std::set<ProxyFetch*> outstanding_proxy_fetches_;

  DISALLOW_COPY_AND_ASSIGN(ProxyFetchFactory);
};
// Tracks a single property-cache lookup. These lookups are initiated
// immediately upon handling the request, in parallel with determining
// domain-specific RewriteOptions and fetching the HTTP headers for the HTML.
//
// Request handling can proceed in parallel with the property-cache lookups,
// including RewriteOptions lookup and initiating the HTTP fetch. However,
// handling incoming bytes will be blocked waiting for property-cache lookups
// to complete.
class ProxyFetchPropertyCallback : public PropertyPage {
public:
// Creates the callback for one property-cache lookup of the given page_type.
// collector is the ProxyFetchPropertyCallbackCollector this callback is
// associated with (see IsCacheValid below).
// NOTE(review): ownership of mutex is not visible in this header; presumably
// it is handed to the PropertyPage base class -- confirm in the .cc file.
ProxyFetchPropertyCallback(PageType page_type,
PropertyCache* property_cache,
const StringPiece& url,
const StringPiece& options_signature_hash,
UserAgentMatcher::DeviceType device_type,
ProxyFetchPropertyCallbackCollector* collector,
AbstractMutex* mutex);
// Returns the page type supplied at construction.
PageType page_type() const { return page_type_; }
// Delegates to collector_'s IsCacheValid.
virtual bool IsCacheValid(int64 write_timestamp_ms) const;
// Invoked when this lookup finishes; success reports whether it succeeded.
// NOTE(review): presumably forwards to collector_->Done(this) -- confirm.
virtual void Done(bool success);
private:
PageType page_type_;
UserAgentMatcher::DeviceType device_type_;
// Collector tracking this lookup; a raw pointer, so presumably not owned.
ProxyFetchPropertyCallbackCollector* collector_;
GoogleString url_;
DISALLOW_COPY_AND_ASSIGN(ProxyFetchPropertyCallback);
};
// Tracks a collection of property-cache lookups occurring in parallel.
// A Sequence is used to execute the various functions in an orderly fashion,
// avoiding any kind of race between Done(), ConnectProxyFetch(), Detach() and
// AddPostLookupTask(). When any of these functions is called, it is added to
// the sequence; it is executed immediately if the sequence is free, and
// otherwise it waits for its turn.
//
// Order of events:
//   InitiatePropertyCacheLookup --> AddPostLookupTask --> Initiate Html Fetch
//            |                     (Added to Sequence)            |
//            |                                               Fetch Done
//      Lookup Done()                                              |
//   (Added to Sequence)                              ---------------------
//                                            is html |             !html |
//                                          ConnectProxyFetch()        Detach()
//                                                   (Added to Sequence)
//
// This will also wait for RequestHeadersComplete() to be called before
// invoking any post-completion callbacks (but not before canceling them
// due to Detach).
class ProxyFetchPropertyCallbackCollector {
 public:
  ProxyFetchPropertyCallbackCollector(ServerContext* server_context,
                                      const StringPiece& url,
                                      const RequestContextPtr& req_ctx,
                                      const RewriteOptions* options,
                                      UserAgentMatcher::DeviceType device_type);
  virtual ~ProxyFetchPropertyCallbackCollector();

  // Add a callback to be handled by this collector.
  // Transfers ownership of the callback to the collector.
  void AddCallback(ProxyFetchPropertyCallback* callback);

  // Must be called once request headers have been resolved from configuration.
  // Gates successful post-lookup callback invocation.
  void RequestHeadersComplete();

  // In our flow, we initiate the property-cache lookup prior to
  // creating a proxy-fetch, so that RewriteOptions lookup can proceed
  // in parallel.  If/when we determine that ProxyFetch is associated
  // with HTML content, we connect it to this callback.  Note that if
  // the property cache lookups have completed, this will result in
  // a direct call into proxy_fetch->PropertyCacheComplete.
  void ConnectProxyFetch(ProxyFetch* proxy_fetch);

  // If for any reason we decide *not* to initiate a ProxyFetch for a
  // request, then we need to 'detach' this request so that we can
  // delete it once it completes, rather than waiting for a
  // ProxyFetch to be inserted.  The status code of the response is passed from
  // ProxyFetch to the Collector.  In case the status code is unknown then pass
  // RewriteDriver::kStatusCodeUnknown.
  void Detach(HttpStatus::Code status_code);

  // Returns the actual property page, or NULL if there is no fallback
  // property page.  Ownership is not transferred.
  PropertyPage* property_page() {
    return fallback_property_page_ == NULL ?
        NULL : fallback_property_page_->actual_property_page();
  }

  // Returns the fallback property page (may be NULL).  Ownership is not
  // transferred.
  FallbackPropertyPage* fallback_property_page() {
    return fallback_property_page_.get();
  }

  // Returns the collected PropertyPage with the corresponding page_type.
  // Ownership of the object is transferred to the caller.
  PropertyPage* ReleasePropertyPage(
      ProxyFetchPropertyCallback::PageType page_type);

  // Releases the ownership of fallback property page.
  FallbackPropertyPage* ReleaseFallbackPropertyPage() {
    return fallback_property_page_.release();
  }

  // Releases the ownership of origin property page.
  PropertyPage* ReleaseOriginPropertyPage() {
    return origin_property_page_.release();
  }

  // In our flow, property-page will be available via RewriteDriver only after
  // ProxyFetch is set.  But there may be instances where the result may be
  // required even before proxy-fetch is created.  Any task that depends on the
  // PropertyCache result will be executed as soon as PropertyCache lookup is
  // done and RequestHeadersComplete() has been called.
  //
  // func is guaranteed to execute after PropertyCache lookup has completed, as
  // long as ProxyFetch is not set before PropertyCache lookup is done.  One
  // should use PropertyCache result via RewriteDriver if some other thread can
  // initiate ConnectProxyFetch() (called SetProxyFetch() in earlier revisions
  // of this comment).
  void AddPostLookupTask(Function* func);

  // If options_ is NULL returns true.  Else, returns true if (url_,
  // write_timestamp_ms) is valid as per URL cache invalidation entries in
  // options_.
  bool IsCacheValid(int64 write_timestamp_ms) const;

  // Called by a ProxyFetchPropertyCallback when the former is complete.
  void Done(ProxyFetchPropertyCallback* callback);

  // Returns the request context held by this collector.  Const-qualified to
  // match the file's other read-only getters.
  const RequestContextPtr& request_context() const { return request_context_; }

  // Returns the stored device type (device_type_).
  UserAgentMatcher::DeviceType device_type() const { return device_type_; }

 private:
  friend class ProxyFetchPropertyCallbackCollectorTest;

  // Execute*() counterparts of the public entry points above.
  // NOTE(review): presumably these are the bodies that run on sequence_ once
  // the corresponding public call has been queued -- confirm in the .cc file.
  void ExecuteDone(ProxyFetchPropertyCallback* callback);
  void ExecuteAddPostLookupTask(Function* func);
  void ExecuteConnectProxyFetch(ProxyFetch* proxy_fetch);
  void ExecuteDetach(HttpStatus::Code status_code);
  void ExecuteRequestHeadersComplete();
  void RunPostLookupsAndCleanupIfSafe();

  // Updates the status code of response in property cache.
  void UpdateStatusCodeInPropertyCache();

  std::set<ProxyFetchPropertyCallback*> pending_callbacks_;
  std::map<ProxyFetchPropertyCallback::PageType, PropertyPage*>
      property_pages_;
  scoped_ptr<AbstractMutex> mutex_;
  ServerContext* const server_context_;
  QueuedWorkerPool::Sequence* const sequence_;
  const GoogleString url_;
  const RequestContextPtr request_context_;
  const UserAgentMatcher::DeviceType device_type_;
  bool is_options_valid_;  // protected by mutex_.

  // Unless guarded by mutex_, the fields are only accessed by code serialized
  // via sequence_.
  bool detached_;
  bool done_;
  bool request_headers_ok_;
  ProxyFetch* proxy_fetch_;
  std::vector<Function*> post_lookup_task_vector_;
  const RewriteOptions* options_;  // protected by mutex_.
  HttpStatus::Code status_code_;  // status_code_ of the response.
  scoped_ptr<FallbackPropertyPage> fallback_property_page_;
  scoped_ptr<PropertyPage> origin_property_page_;

  DISALLOW_COPY_AND_ASSIGN(ProxyFetchPropertyCallbackCollector);
};
// Manages a single fetch of an HTML or resource file from the original server.
// If it is an HTML file, it is rewritten.
// Fetch is initialized by calling ProxyFetchFactory::StartNewProxyFetch().
// For fetching pagespeed rewritten resources, use ResourceFetch.
// This is only meant to be used by ProxyInterface.
//
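// Takes ownership of custom_options.
// NOTE(review): no custom_options parameter appears in the ProxyFetch
// constructor declared below; this sentence may be stale -- confirm.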
//
// The ProxyFetch passes through non-HTML directly to base_writer.
//
// For HTML, the sequence is this:
// 1. HeadersComplete is called, allowing us to establish we've got HTML.
// 2. Some number of calls to Write occur.
// 3. Optional: Flush is called, followed by more Writes. Repeat.
// 4. Done is called.
// These virtual methods are called from some arbitrary thread, e.g. a
// dedicated fetcher thread. We use a QueuedWorkerPool::Sequence to
// offload them to a worker-thread. This implementation bundles together
// multiple Writes, and depending on the timing, may move Flushes to
// follow Writes and collapse multiple Flushes into one.
class ProxyFetch : public SharedAsyncFetch {
public:
// These strings identify sync-points for reproducing races between
// PropertyCache lookup completion and Origin HTML Fetch completion.
static const char kCollectorConnectProxyFetchFinish[];
static const char kCollectorDetachFinish[];
static const char kCollectorDoneFinish[];
static const char kCollectorFinish[];
static const char kCollectorDetachStart[];
static const char kCollectorRequestHeadersCompleteFinish[];
// These strings identify sync-points for introducing races between
// PropertyCache lookup completion and HeadersComplete.
static const char kHeadersSetupRaceAlarmQueued[];
static const char kHeadersSetupRaceDone[];
static const char kHeadersSetupRaceFlush[];
static const char kHeadersSetupRacePrefix[];
static const char kHeadersSetupRaceWait[];
// Number of milliseconds to wait, in a test, for an event that we
// are hoping does not occur, specifically an inappropriate call to
// base_fetch()->HeadersComplete() while we are still mutating
// response headers in SetupForHtml.
//
// This is used only for testing.
static const int kTestSignalTimeoutMs = 200;
// Marks whether this fetch operates on trusted (non-proxied) content; see
// trusted_input_ below.
void set_trusted_input(bool trusted_input) { trusted_input_ = trusted_input; }
protected:
// protected interface from AsyncFetch.
virtual void HandleHeadersComplete();
virtual bool HandleWrite(const StringPiece& content, MessageHandler* handler);
virtual bool HandleFlush(MessageHandler* handler);
virtual void HandleDone(bool success);
virtual bool IsCachedResultValid(const ResponseHeaders& headers);
private:
// The factory constructs ProxyFetches (the constructor and destructor are
// private), and the collector calls the private PropertyCacheComplete().
friend class ProxyFetchFactory;
friend class ProxyFetchPropertyCallbackCollector;
friend class MockProxyFetch;
FRIEND_TEST(ProxyFetchTest, TestInhibitParsing);
// Called by ProxyFetchPropertyCallbackCollector when all property-cache
// fetches are complete. This function takes ownership of collector.
virtual void PropertyCacheComplete(
ProxyFetchPropertyCallbackCollector* collector);
// If cross_domain is true, we're requested under a domain different from
// the underlying host, using proxy mode in UrlNamer.
ProxyFetch(const GoogleString& url,
bool cross_domain,
ProxyFetchPropertyCallbackCollector* property_cache_callback,
AsyncFetch* async_fetch,
AsyncFetch* original_content_fetch,
RewriteDriver* driver,
ServerContext* server_context,
Timer* timer,
ProxyFetchFactory* factory);
virtual ~ProxyFetch();
// Returns the RewriteOptions used for this fetch.
// NOTE(review): presumably these are driver_'s options -- confirm in the .cc.
const RewriteOptions* Options();
// Once we have decided this is HTML, begin parsing and set headers.
void SetupForHtml();
// Adds a pagespeed header to response_headers if enabled.
void AddPagespeedHeader();
// Sets up driver_, registering the writer and start parsing url.
// Returns whether we started parsing successfully or not.
bool StartParse();
// Start the fetch which includes preparing the request.
void StartFetch();
// Actually do the fetch, called from callback of StartFetch.
// "prepare_success" represents whether the request was prepared successfully
// by the UrlNamer.
void DoFetch(bool prepare_success);
// Handles buffered HTML writes, flushes, and done calls
// in the QueuedWorkerPool::Sequence sequence_.
void ExecuteQueued();
// Schedules the task to run any buffered work, if needed. Assumes mutex
// held.
void ScheduleQueueExecutionIfNeeded();
// Frees up the RewriteDriver (via FinishParse or Cleanup),
// calls the callback (nulling out callback_ to ensure that we don't
// do it again), notifies the ProxyInterface that the fetch is
// complete, and deletes the ProxyFetch.
void Finish(bool success);
// Used to wrap up the FinishParseAsync invocation.
void CompleteFinishParse(bool success);
// Callback we give to ExecuteFlushIfRequestedAsync to notify us when
// it's done with its work.
void FlushDone();
// Management functions for idle_alarm_. Must only be called from
// within sequence_.
// Cancels any previous alarm.
void CancelIdleAlarm();
// Cancels previous alarm and starts next one.
void QueueIdleAlarm();
// Handler for the alarm; run in sequence_.
void HandleIdleAlarm();
// URL being fetched, plus collaborators supplied to the constructor.
GoogleString url_;
ServerContext* server_context_;
Timer* timer_;
// NOTE(review): how cache_fetcher_ is created and used is not visible in this
// header -- see the .cc file.
scoped_ptr<CacheUrlAsyncFetcher> cache_fetcher_;
// True if we're handling a cross-domain request in proxy mode, which
// should do some additional checking.
bool cross_domain_;
// Does page claim to be "Content-Type: text/html"? (It may be lying)
bool claims_html_;
// Has a call to StartParse succeeded? We'll only do this if we actually
// decide it is HTML.
bool started_parse_;
// Has a call to RewriteDriver::ParseText been made yet.
bool parse_text_called_;
// Tracks whether Done() has been called.
bool done_called_;
HtmlDetector html_detector_;
// Tracks a set of outstanding property-cache lookups. This is NULLed
// when the property-cache completes or when we detach it. We use
// this to detach the callback if we decide we don't care about the
// property-caches because we discovered we are not working with HTML.
ProxyFetchPropertyCallbackCollector* property_cache_callback_;
// Fetch where raw original headers and contents are sent.
// To contrast, base_fetch() is sent rewritten contents and headers.
// If NULL, original_content_fetch_ is ignored.
AsyncFetch* original_content_fetch_;
// ProxyFetch is responsible for getting RewriteDrivers from the pool and
// putting them back.
RewriteDriver* driver_;
// True if we have queued up ExecuteQueued but did not
// execute it yet.
bool queue_run_job_created_;
// As the UrlAsyncFetcher calls our Write & Flush methods, we collect
// the text in text_queue_, and note the Flush call in
// network_flush_outstanding_, returning control to the fetcher as quickly
// as possible so it can continue to process incoming network traffic.
//
// We offload the handling of the incoming text events to a
// QueuedWorkerPool::Sequence. Note that we may receive a new chunk
// of text while we are still processing an old chunk. The sequentiality
// is preserved by QueuedWorkerPool::Sequence.
//
// The Done callback is also indirected through this Sequence.
scoped_ptr<AbstractMutex> mutex_;
StringStarVector text_queue_;
bool network_flush_outstanding_;
QueuedWorkerPool::Sequence* sequence_;
// done_outstanding_ will be true if we got called with ::Done but didn't
// invoke Finish yet.
bool done_outstanding_;
// finishing_ is true if we started Finish, perhaps doing FinishParseAsync.
// Accessed only from within context of sequence_.
bool finishing_;
// done_result_ is used to store the result of ::Done if we're deferring
// handling it until the driver finishes handling a Flush.
bool done_result_;
// We may also end up receiving new events in between calling FlushAsync
// and getting the callback called. In that case, we want to hold off
// on actually dispatching things queued up above.
bool waiting_for_flush_to_finish_;
// Alarm used to keep track of inactivity, in order to help issue
// flushes. Must only be accessed from the thread context of sequence_
QueuedAlarm* idle_alarm_;
// Factory passed to the constructor; a raw pointer, so presumably not owned.
ProxyFetchFactory* factory_;
// Set to true if this proxy_fetch is the result of a distributed fetch.
bool distributed_fetch_;
// Set to true if this proxy_fetch is actually operating on trusted
// (non-proxied) content.
bool trusted_input_;
DISALLOW_COPY_AND_ASSIGN(ProxyFetch);
};
} // namespace net_instaweb
#endif // PAGESPEED_AUTOMATIC_PROXY_FETCH_H_