blob: 897db286c51f8720b2d0d258b9ccc67b88540df4 [file] [log] [blame]
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jmarantz@google.com (Joshua Marantz)
// lsong@google.com (Libo Song)
#ifndef PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_
#define PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_
#include <vector>
#include "net/instaweb/http/public/url_async_fetcher.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/gtest_prod.h"
#include "pagespeed/kernel/base/pool.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/thread_system.h"
#include "pagespeed/kernel/http/response_headers_parser.h"
#include "third_party/serf/src/serf.h"
// To enable HTTPS fetching with serf, we must link against OpenSSL,
// which is a large library with licensing restrictions not known to
// be wholly in line with the Apache license. To disable HTTPS fetching:
// 1. Set SERF_HTTPS_FETCHING to 0 here
// 2. Comment out the references to openssl.gyp and ssl_buckets.c in
// src/third_party/serf/serf.gyp.
// 3. Comment out all references to openssl in src/DEPS.
//
// If this is enabled, then the HTTPS fetching can be tested with
// install/apache_https_fetch_test.sh
//
// The #ifndef guard means a definition of SERF_HTTPS_FETCHING supplied before
// this point (e.g. by the build) takes precedence over the default of 1.
#ifndef SERF_HTTPS_FETCHING
#define SERF_HTTPS_FETCHING 1
#endif
struct apr_pool_t;
struct apr_uri_t;
struct serf_context_t;
namespace net_instaweb {
class AsyncFetch;
class MessageHandler;
class Statistics;
class SerfFetch;
class SerfThreadedFetcher;
class Timer;
class UpDownCounter;
class Variable;
// String constants associated with the Serf fetcher's statistics. Only the
// declarations live here; the values are defined outside this header.
// NOTE(review): presumably these are the names under which
// SerfUrlAsyncFetcher::InitStats registers its variables -- confirm in the
// implementation file.
struct SerfStats {
static const char kSerfFetchRequestCount[];
static const char kSerfFetchByteCount[];
static const char kSerfFetchTimeDurationMs[];
static const char kSerfFetchCancelCount[];
static const char kSerfFetchActiveCount[];
static const char kSerfFetchTimeoutCount[];
static const char kSerfFetchFailureCount[];
static const char kSerfFetchCertErrors[];
};
// Identifies the set of HTTPS option keywords (the same list that is
// documented on SerfUrlAsyncFetcher::SetHttpsOptions). This is used in error
// messages emitted both from the config parser in this module, and in the
// directives table in mod_instaweb.cc which must be statically constructed
// using a compile-time concatenation. Hence this must be a literal string and
// not a const char*.
#define SERF_HTTPS_KEYWORDS \
"enable,disable,allow_self_signed," \
"allow_unknown_certificate_authority,allow_certificate_not_yet_valid"
// UrlAsyncFetcher implementation declared on top of the Serf library (see the
// serf.h include and the serf_context_ member). Fetches are represented by
// SerfFetch objects, which are kept in active_fetches_ while in flight and
// moved to completed_fetches_ by FetchComplete() for later cleanup.
//
// TODO(sligocki): Serf does not seem to act appropriately in IPv6
// environments; fix and test this.
// Specifically:
// (1) It does not attempt to fall back to IPv4 if IPv6 connection fails;
// (2) It may not correctly signal failure, which causes the incoming
// connection to hang.
class SerfUrlAsyncFetcher : public UrlAsyncFetcher {
public:
// Selects which fetches WaitForActiveFetches() waits for.
// NOTE(review): presumably "threaded" means fetches handled via
// threaded_fetcher_ and "mainline" means fetches driven directly by this
// object -- confirm against the implementation.
enum WaitChoice {
kThreadedOnly,
kMainlineOnly,
kThreadedAndMainline
};
// NOTE(review): ownership of the pointer arguments is not visible from this
// header; timeout_ms is presumably the value later reported by timeout_ms()
// -- confirm.
SerfUrlAsyncFetcher(const char* proxy, apr_pool_t* pool,
ThreadSystem* thread_system,
Statistics* statistics, Timer* timer, int64 timeout_ms,
MessageHandler* handler);
// NOTE(review): presumably builds a fetcher that reuses parent's
// configuration but a different proxy -- confirm.
SerfUrlAsyncFetcher(SerfUrlAsyncFetcher* parent, const char* proxy);
virtual ~SerfUrlAsyncFetcher();
// NOTE(review): presumably registers the SerfStats variables with
// statistics -- confirm.
static void InitStats(Statistics* statistics);
// Stops all active fetches and prevents further fetches from starting
// (they will instead quickly call back to ->Done(false)).
virtual void ShutDown();
// NOTE(review): the result presumably depends on SERF_HTTPS_FETCHING and on
// the configured https options -- confirm.
virtual bool SupportsHttps() const;
// Requests a fetch of url, reporting through callback.
// NOTE(review): whether callback may be invoked before Fetch() returns is
// not visible from this header.
virtual void Fetch(const GoogleString& url,
MessageHandler* message_handler,
AsyncFetch* callback);
// TODO(morlovich): Make private once non-thread mode concept removed.
// NOTE(review): meaning of the return value is not documented here; confirm
// in the implementation.
int Poll(int64 max_wait_ms);
// Waits up to max_milliseconds for the fetches selected by wait_choice.
// NOTE(review): the meaning of the bool result is not visible here.
bool WaitForActiveFetches(int64 max_milliseconds,
MessageHandler* message_handler,
WaitChoice wait_choice);
// Remove the completed fetch from the active fetch set, and put it into a
// completed fetch list to be cleaned up.
void FetchComplete(SerfFetch* fetch);
// Update the statistics object with results of the (completed) fetch.
void ReportCompletedFetchStats(SerfFetch* fetch);
// Plain accessors. Note that, per the member comment below, serf_context_
// is one of the fields protected by mutex_.
apr_pool_t* pool() const { return pool_; }
serf_context_t* serf_context() const { return serf_context_; }
void PrintActiveFetches(MessageHandler* handler) const;
// Returns timeout_ms_, which is const and therefore fixed at construction.
virtual int64 timeout_ms() { return timeout_ms_; }
ThreadSystem* thread_system() { return thread_system_; }
// Indicates that Serf should enumerate failing URLs whenever the underlying
// Serf library reports an error.
void set_list_outstanding_urls_on_error(bool x);
// Indicates that Serf should track the original content length for
// fetched resources.
bool track_original_content_length() const {
return track_original_content_length_;
}
void set_track_original_content_length(bool x);
// Indicates that direct HTTPS fetching should be allowed, and how picky
// to be about certificates. The directive is a comma separated list of
// these keywords:
// enable
// disable
// allow_self_signed
// allow_unknown_certificate_authority
// allow_certificate_not_yet_valid
// Returns 'false' if the directive does not parse properly.
bool SetHttpsOptions(StringPiece directive);
// Validates the correctness of an https directive. Exposed as a static
// method for early exit on mis-specified pagespeed.conf.
// The parsed option bits are discarded; only the success flag and
// error_message are passed back to the caller.
static bool ValidateHttpsOptions(StringPiece directive,
GoogleString* error_message) {
uint32 options;
return ParseHttpsOptions(directive, &options, error_message);
}
void SetSslCertificatesDir(StringPiece dir);
const GoogleString& ssl_certificates_dir() const {
return ssl_certificates_dir_;
}
void SetSslCertificatesFile(StringPiece file);
const GoogleString& ssl_certificates_file() const {
return ssl_certificates_file_;
}
protected:
typedef Pool<SerfFetch> SerfFetchPool;
// Determines whether https is allowed in the current configuration.
// NOTE(review): these four are declared inline but not defined in this
// header, so every translation unit that calls them must also see their
// definitions -- confirm where they are defined.
inline bool allow_https() const;
inline bool allow_self_signed() const;
inline bool allow_unknown_certificate_authority() const;
inline bool allow_certificate_not_yet_valid() const;
// Overwrites the whole option bitmask (see https_options_ below).
void set_https_options(uint32 https_options) {
https_options_ = https_options;
}
void Init(apr_pool_t* parent_pool, const char* proxy);
bool SetupProxy(const char* proxy);
// Start a SerfFetch. Takes ownership of fetch and makes sure callback is
// called even if fetch fails to start.
//
// mutex_ must be held before calling StartFetch.
bool StartFetch(SerfFetch* fetch);
// AnyPendingFetches is accurate only at the time of call; this is
// used conservatively during shutdown. It counts fetches that have been
// requested by some thread, and can include fetches for which no action
// has yet been taken (ie fetches that are not active).
virtual bool AnyPendingFetches();
// ApproximateNumActiveFetches can under- or over-count and is used only for
// error reporting.
int ApproximateNumActiveFetches();
// NOTE(review): judging by the names, the MutexHeld variant expects the
// caller to already hold mutex_ while the plain one does not -- confirm.
void CancelActiveFetches();
void CancelActiveFetchesMutexHeld();
bool WaitForActiveFetchesHelper(int64 max_ms,
MessageHandler* message_handler);
// This cleans up the serf resources for fetches that errored out.
// Must be called only immediately after running the serf event loop.
// Must be called with mutex_ held.
void CleanupFetchesWithErrors();
// These must be accessed with mutex_ held.
bool shutdown() const { return shutdown_; }
void set_shutdown(bool s) { shutdown_ = s; }
apr_pool_t* pool_;
ThreadSystem* thread_system_;
Timer* timer_;
// mutex_ protects serf_context_ and active_fetches_.
ThreadSystem::CondvarCapableMutex* mutex_;
serf_context_t* serf_context_;
SerfFetchPool active_fetches_;
// NOTE(review): FetchVector is not referenced anywhere else in this header;
// check whether subclasses or the implementation still use it.
typedef std::vector<SerfFetch*> FetchVector;
SerfFetchPool completed_fetches_;
SerfThreadedFetcher* threaded_fetcher_;
// This is protected because it's updated along with active_fetches_,
// which happens in subclass SerfThreadedFetcher as well as this class.
UpDownCounter* active_count_;
private:
friend class SerfFetch; // To access stats variables below.
// Note: the returned string is a substring of memory in the pool.
static const char* ExtractHostHeader(const apr_uri_t& uri,
apr_pool_t* pool);
FRIEND_TEST(SerfUrlAsyncFetcherTest, TestHostConstruction);
// Transforms Host: header into SNI host name by dropping the port.
// Exposed for testability
static GoogleString RemovePortFromHostHeader(const GoogleString& in);
FRIEND_TEST(SerfUrlAsyncFetcherTest, TestPortRemoval);
// Parses an https directive into *options, returning the success flag that
// ValidateHttpsOptions passes on to its caller.
// NOTE(review): presumably fills *error_message on failure -- confirm.
static bool ParseHttpsOptions(StringPiece directive, uint32* options,
GoogleString* error_message);
// Statistics variables (accessed by SerfFetch through the friend
// declaration above).
Variable* request_count_;
Variable* byte_count_;
Variable* time_duration_ms_;
Variable* cancel_count_;
Variable* timeout_count_;
Variable* failure_count_;
Variable* cert_errors_;
const int64 timeout_ms_;
bool shutdown_;
bool list_outstanding_urls_on_error_;
bool track_original_content_length_;
uint32 https_options_; // Composed of HttpsOptions ORed together.
MessageHandler* message_handler_;
GoogleString ssl_certificates_dir_;
GoogleString ssl_certificates_file_;
DISALLOW_COPY_AND_ASSIGN(SerfUrlAsyncFetcher);
};
// Represents a single URL fetch performed through Serf. It derives from
// PoolElement<SerfFetch> so that instances can be held in the
// Pool<SerfFetch> containers declared by SerfUrlAsyncFetcher.
//
// TODO(lsong): Move this to a separate file. Necessary?
class SerfFetch : public PoolElement<SerfFetch> {
public:
// TODO(lsong): make use of request_headers.
// NOTE(review): ownership of async_fetch, message_handler and timer is not
// visible from this header.
SerfFetch(const GoogleString& url,
AsyncFetch* async_fetch,
MessageHandler* message_handler,
Timer* timer);
~SerfFetch();
// Start the fetch. It returns immediately. This can only be run when
// locked with fetcher->mutex_.
bool Start(SerfUrlAsyncFetcher* fetcher);
// Returns a human-readable description of this fetch.
// NOTE(review): exact contents are defined in the implementation.
GoogleString DebugInfo();
// This must be called while holding SerfUrlAsyncFetcher's mutex_.
void Cancel();
// Calls the callback supplied by the user. This needs to happen
// exactly once. In some error cases it appears that Serf calls
// HandleResponse multiple times on the same object.
//
// This must be called while holding SerfUrlAsyncFetcher's mutex_.
//
// Note that when there are SSL error messages, we immediately call
// CallCallback, which is robust against duplicate calls in that case.
void CallCallback(bool success);
// NOTE(review): how this relates to CallCallback (and which locks must be
// held) is not visible from this header -- confirm in the implementation.
void CallbackDone(bool success);
// If last poll of this fetch's connection resulted in an error, clean it up.
// Must be called after serf_context_run, with fetcher's mutex_ held.
void CleanupIfError();
// For use only by unit tests. Calls ParseUrl(), then makes things available
// for checking.
void ParseUrlForTesting(bool* status,
apr_uri_t** url,
const char** host_header,
const char** sni_host);
void SetFetcherForTesting(SerfUrlAsyncFetcher* fetcher);
// NOTE(review): presumably derived from fetch_start_ms_ and fetch_end_ms_,
// in milliseconds -- confirm.
int64 TimeDuration() const;
int64 fetch_start_ms() const { return fetch_start_ms_; }
size_t bytes_received() const { return bytes_received_; }
MessageHandler* message_handler() { return message_handler_; }
private:
// Static functions used in callbacks.
// The code under SERF_HTTPS_FETCHING was contributed by Devin Anderson
// (surfacepatterns@gmail.com).
//
// Note this must be ifdef'd because calling serf_bucket_ssl_decrypt_create
// requires ssl_buckets.c in the link. ssl_buckets.c requires openssl.
#if SERF_HTTPS_FETCHING
static apr_status_t SSLCertValidate(void *data, int failures,
const serf_ssl_certificate_t *cert);
static apr_status_t SSLCertChainValidate(
void *data, int failures, int error_depth,
const serf_ssl_certificate_t * const *certs,
apr_size_t certs_count);
#endif
static apr_status_t ConnectionSetup(
apr_socket_t* socket, serf_bucket_t **read_bkt, serf_bucket_t **write_bkt,
void* setup_baton, apr_pool_t* pool);
static void ClosedConnection(serf_connection_t* conn,
void* closed_baton,
apr_status_t why,
apr_pool_t* pool);
static serf_bucket_t* AcceptResponse(serf_request_t* request,
serf_bucket_t* stream,
void* acceptor_baton,
apr_pool_t* pool);
static apr_status_t HandleResponse(serf_request_t* request,
serf_bucket_t* response,
void* handler_baton,
apr_pool_t* pool);
static bool MoreDataAvailable(apr_status_t status);
static bool IsStatusOk(apr_status_t status);
#if SERF_HTTPS_FETCHING
// Called to report whether SSL certificate errors have been detected.
// The function returns SUCCESS in all cases, but sets ssl_error_message_
// non-null for errors as a signal to ReadHeaders that we should not let
// any output through.
//
// Interpretation of two of the error conditions is configurable:
// 'allow_unknown_certificate_authority' and 'allow_self_signed'.
//
// If there is a cert that should be checked for a hostname match, that should
// go in cert. Otherwise cert should be null.
apr_status_t HandleSSLCertValidation(
int errors, int failure_depth, const serf_ssl_certificate_t *cert);
#endif
// Member overload of the static HandleResponse callback declared above.
// NOTE(review): presumably the static version forwards here via its baton
// -- confirm.
apr_status_t HandleResponse(serf_bucket_t* response);
apr_status_t ReadStatusLine(serf_bucket_t* response);
// Know what's weird? You have to do a body-read to get access to the
// headers. You need to read 1 byte of body to force an FSM inside
// Serf to parse the headers. Then you can parse the headers and
// finally read the rest of the body. I know, right?
//
// The simpler approach, and likely what the Serf designers intended,
// is that you read the entire body first, and then read the headers.
// But if you are trying to stream the data as it's fetched through some
// kind of function that needs to know the content-type, then it's
// really a drag to have to wait till the end of the body to get the
// content type.
apr_status_t ReadOneByteFromBody(serf_bucket_t* response);
// Once that one byte is read from the body, we can go ahead and
// parse the headers. The dynamics of this appear that for N
// headers we'll get 2N calls to serf_bucket_read: one each for
// attribute names & values.
apr_status_t ReadHeaders(serf_bucket_t* response);
// Once headers are complete we can get the body. The dynamics of this
// are likely dependent on everything on the network between the client
// and server, but for a 10k buffer I seem to frequently get 8k chunks.
apr_status_t ReadBody(serf_bucket_t* response);
// Ensures that a user-agent string is included, and that the mod_pagespeed
// version is appended.
void FixUserAgent();
static apr_status_t SetupRequest(serf_request_t* request,
void* setup_baton,
serf_bucket_t** req_bkt,
serf_response_acceptor_t* acceptor,
void** acceptor_baton,
serf_response_handler_t* handler,
void** handler_baton,
apr_pool_t* pool);
// NOTE(review): presumably parses str_url_ into url_, host_header_ and
// sni_host_ (the values ParseUrlForTesting exposes) -- confirm.
bool ParseUrl();
SerfUrlAsyncFetcher* fetcher_;
Timer* timer_;
const GoogleString str_url_;
AsyncFetch* async_fetch_;
ResponseHeadersParser parser_;
// Progress flags for the response-reading steps declared above.
// NOTE(review): saved_byte_ presumably holds the byte consumed by
// ReadOneByteFromBody until the body is delivered -- confirm.
bool status_line_read_;
bool one_byte_read_;
bool has_saved_byte_;
char saved_byte_;
MessageHandler* message_handler_;
apr_pool_t* pool_;
serf_bucket_alloc_t* bucket_alloc_;
apr_uri_t url_;
const char* host_header_; // in pool_
const char* sni_host_; // in pool_
serf_connection_t* connection_;
size_t bytes_received_;
int64 fetch_start_ms_;
int64 fetch_end_ms_;
// Variables used for HTTPS connection handling
// (declared unconditionally, i.e. also when SERF_HTTPS_FETCHING is 0).
bool using_https_;
serf_ssl_context_t* ssl_context_;
const char* ssl_error_message_;
DISALLOW_COPY_AND_ASSIGN(SerfFetch);
};
} // namespace net_instaweb
#endif // PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_