| // Copyright 2010 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Author: jmarantz@google.com (Joshua Marantz) |
| // lsong@google.com (Libo Song) |
| |
| #ifndef PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_ |
| #define PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_ |
| |
| #include <vector> |
| |
| #include "net/instaweb/http/public/url_async_fetcher.h" |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/gtest_prod.h" |
| #include "pagespeed/kernel/base/pool.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/base/thread_system.h" |
| #include "pagespeed/kernel/http/response_headers_parser.h" |
| |
| #include "third_party/serf/src/serf.h" |
| |
| // To enable HTTPS fetching with serf, we must link against OpenSSL, |
| // which is a large library with licensing restrictions not known to |
| // be wholly in line with the Apache license. To disable HTTPS fetching: |
| // 1. Set SERF_HTTPS_FETCHING to 0 here |
| // 2. Comment out the references to openssl.gyp and ssl_buckets.c in |
| // src/third_party/serf/serf.gyp. |
| // 3. Comment out all references to openssl in src/DEPS. |
| // |
| // If this is enabled, then the HTTPS fetching can be tested with |
| // install/apache_https_fetch_test.sh |
| #ifndef SERF_HTTPS_FETCHING |
| #define SERF_HTTPS_FETCHING 1 |
| #endif |
| |
| struct apr_pool_t; |
| struct apr_uri_t; |
| struct serf_context_t; |
| |
| namespace net_instaweb { |
| |
| class AsyncFetch; |
| class MessageHandler; |
| class Statistics; |
| class SerfFetch; |
| class SerfThreadedFetcher; |
| class Timer; |
| class UpDownCounter; |
| class Variable; |
| |
| // String constants, one per Serf fetch statistic (requests, bytes, duration, |
| // cancellations, active fetches, timeouts, failures, certificate errors). |
| // Only declared here; the definitions live outside this header. |
| // NOTE(review): presumably these are the names under which the counters are |
| // registered in SerfUrlAsyncFetcher::InitStats() -- confirm in the .cc file. |
| struct SerfStats { |
|   static const char kSerfFetchRequestCount[]; |
|   static const char kSerfFetchByteCount[]; |
|   static const char kSerfFetchTimeDurationMs[]; |
|   static const char kSerfFetchCancelCount[]; |
|   static const char kSerfFetchActiveCount[]; |
|   static const char kSerfFetchTimeoutCount[]; |
|   static const char kSerfFetchFailureCount[]; |
|   static const char kSerfFetchCertErrors[]; |
| }; |
| |
| // Identifies the set of HTTPS keywords. This is used in error messages emitted |
| // both from the config parser in this module, and in the directives table in |
| // mod_instaweb.cc which must be statically constructed using a compile-time |
| // concatenation. Hence this must be a literal string and not a const char*. |
| #define SERF_HTTPS_KEYWORDS \ |
| "enable,disable,allow_self_signed," \ |
| "allow_unknown_certificate_authority,allow_certificate_not_yet_valid" |
| |
| // UrlAsyncFetcher implementation declared on top of the Serf library (see the |
| // serf.h include in this header). Individual requests are represented by |
| // SerfFetch objects, which are tracked in the pools declared below. |
| // |
| // TODO(sligocki): Serf does not seem to act appropriately in IPv6 |
| // environments, fix and test this. |
| // Specifically: |
| //   (1) It does not attempt to fall back to IPv4 if IPv6 connection fails; |
| //   (2) It may not correctly signal failure, which causes the incoming |
| //       connection to hang. |
| class SerfUrlAsyncFetcher : public UrlAsyncFetcher { |
|  public: |
|   // Argument to WaitForActiveFetches(). |
|   // NOTE(review): presumably selects whether fetches owned by the threaded |
|   // fetcher, by this (mainline) fetcher, or by both are waited on -- confirm |
|   // against the implementation. |
|   enum WaitChoice { |
|     kThreadedOnly, |
|     kMainlineOnly, |
|     kThreadedAndMainline |
|   }; |
| |
|   SerfUrlAsyncFetcher(const char* proxy, apr_pool_t* pool, |
|                       ThreadSystem* thread_system, |
|                       Statistics* statistics, Timer* timer, int64 timeout_ms, |
|                       MessageHandler* handler); |
|   SerfUrlAsyncFetcher(SerfUrlAsyncFetcher* parent, const char* proxy); |
|   virtual ~SerfUrlAsyncFetcher(); |
| |
|   static void InitStats(Statistics* statistics); |
| |
|   // Stops all active fetches and prevents further fetches from starting |
|   // (they will instead quickly call back to ->Done(false)). |
|   virtual void ShutDown(); |
| |
|   virtual bool SupportsHttps() const; |
| |
|   virtual void Fetch(const GoogleString& url, |
|                      MessageHandler* message_handler, |
|                      AsyncFetch* callback); |
|   // TODO(morlovich): Make private once non-thread mode concept removed. |
|   int Poll(int64 max_wait_ms); |
| |
|   bool WaitForActiveFetches(int64 max_milliseconds, |
|                             MessageHandler* message_handler, |
|                             WaitChoice wait_choice); |
| |
|   // Remove the completed fetch from the active fetch set, and put it into a |
|   // completed fetch list to be cleaned up. |
|   void FetchComplete(SerfFetch* fetch); |
| |
|   // Update the statistics object with results of the (completed) fetch. |
|   void ReportCompletedFetchStats(SerfFetch* fetch); |
| |
|   apr_pool_t* pool() const { return pool_; } |
|   serf_context_t* serf_context() const { return serf_context_; } |
| |
|   void PrintActiveFetches(MessageHandler* handler) const; |
|   virtual int64 timeout_ms() { return timeout_ms_; } |
|   ThreadSystem* thread_system() { return thread_system_; } |
| |
|   // Indicates that Serf should enumerate failing URLs whenever the underlying |
|   // Serf library reports an error. |
|   void set_list_outstanding_urls_on_error(bool x); |
| |
|   // Indicates that Serf should track the original content length for |
|   // fetched resources. |
|   bool track_original_content_length() const { |
|     return track_original_content_length_; |
|   } |
|   void set_track_original_content_length(bool x); |
| |
|   // Indicates that direct HTTPS fetching should be allowed, and how picky |
|   // to be about certificates. The directive is a comma separated list of |
|   // these keywords: |
|   //   enable |
|   //   disable |
|   //   allow_self_signed |
|   //   allow_unknown_certificate_authority |
|   //   allow_certificate_not_yet_valid |
|   // Returns 'false' if the directive does not parse properly. |
|   bool SetHttpsOptions(StringPiece directive); |
| |
|   // Validates the correctness of an https directive. Exposed as a static |
|   // method for early exit on mis-specified pagespeed.conf. |
|   // |
|   // Returns the result of ParseHttpsOptions(); error_message is handed |
|   // through to it unchanged. |
|   static bool ValidateHttpsOptions(StringPiece directive, |
|                                    GoogleString* error_message) { |
|     // The parsed option bits are discarded; only the parse status is used. |
|     // Zero-initialized so this scratch value is never indeterminate, even if |
|     // ParseHttpsOptions() returns without writing to it. |
|     uint32 options = 0; |
|     return ParseHttpsOptions(directive, &options, error_message); |
|   } |
| |
|   void SetSslCertificatesDir(StringPiece dir); |
|   const GoogleString& ssl_certificates_dir() const { |
|     return ssl_certificates_dir_; |
|   } |
| |
|   void SetSslCertificatesFile(StringPiece file); |
|   const GoogleString& ssl_certificates_file() const { |
|     return ssl_certificates_file_; |
|   } |
| |
|  protected: |
|   typedef Pool<SerfFetch> SerfFetchPool; |
| |
|   // Determines whether https is allowed in the current configuration. |
|   inline bool allow_https() const; |
|   inline bool allow_self_signed() const; |
|   inline bool allow_unknown_certificate_authority() const; |
|   inline bool allow_certificate_not_yet_valid() const; |
| |
|   void set_https_options(uint32 https_options) { |
|     https_options_ = https_options; |
|   } |
| |
|   void Init(apr_pool_t* parent_pool, const char* proxy); |
|   bool SetupProxy(const char* proxy); |
| |
|   // Start a SerfFetch. Takes ownership of fetch and makes sure callback is |
|   // called even if fetch fails to start. |
|   // |
|   // mutex_ must be held before calling StartFetch. |
|   bool StartFetch(SerfFetch* fetch); |
| |
|   // AnyPendingFetches is accurate only at the time of call; this is |
|   // used conservatively during shutdown. It counts fetches that have been |
|   // requested by some thread, and can include fetches for which no action |
|   // has yet been taken (ie fetches that are not active). |
|   virtual bool AnyPendingFetches(); |
|   // ApproximateNumActiveFetches can under- or over-count and is used only for |
|   // error reporting. |
|   int ApproximateNumActiveFetches(); |
| |
|   void CancelActiveFetches(); |
|   void CancelActiveFetchesMutexHeld(); |
|   bool WaitForActiveFetchesHelper(int64 max_ms, |
|                                   MessageHandler* message_handler); |
| |
|   // This cleans up the serf resources for fetches that errored out. |
|   // Must be called only immediately after running the serf event loop. |
|   // Must be called with mutex_ held. |
|   void CleanupFetchesWithErrors(); |
| |
|   // These must be accessed with mutex_ held. |
|   bool shutdown() const { return shutdown_; } |
|   void set_shutdown(bool s) { shutdown_ = s; } |
| |
|   apr_pool_t* pool_; |
|   ThreadSystem* thread_system_; |
|   Timer* timer_; |
| |
|   // mutex_ protects serf_context_ and active_fetches_. |
|   ThreadSystem::CondvarCapableMutex* mutex_; |
|   serf_context_t* serf_context_; |
|   SerfFetchPool active_fetches_; |
| |
|   typedef std::vector<SerfFetch*> FetchVector; |
|   SerfFetchPool completed_fetches_; |
|   SerfThreadedFetcher* threaded_fetcher_; |
| |
|   // This is protected because it's updated along with active_fetches_, |
|   // which happens in subclass SerfThreadedFetcher as well as this class. |
|   UpDownCounter* active_count_; |
| |
|  private: |
|   friend class SerfFetch;  // To access stats variables below. |
| |
|   // Note: the returned string points into memory owned by the pool. |
|   static const char* ExtractHostHeader(const apr_uri_t& uri, |
|                                        apr_pool_t* pool); |
|   FRIEND_TEST(SerfUrlAsyncFetcherTest, TestHostConstruction); |
| |
|   // Transforms Host: header into SNI host name by dropping the port. |
|   // Exposed for testability. |
|   static GoogleString RemovePortFromHostHeader(const GoogleString& in); |
|   FRIEND_TEST(SerfUrlAsyncFetcherTest, TestPortRemoval); |
| |
|   // Parser shared by SetHttpsOptions() and ValidateHttpsOptions(). |
|   // NOTE(review): the exact contract for *options and *error_message on |
|   // failure is not visible in this header -- see the implementation. |
|   static bool ParseHttpsOptions(StringPiece directive, uint32* options, |
|                                 GoogleString* error_message); |
| |
|   Variable* request_count_; |
|   Variable* byte_count_; |
|   Variable* time_duration_ms_; |
|   Variable* cancel_count_; |
|   Variable* timeout_count_; |
|   Variable* failure_count_; |
|   Variable* cert_errors_; |
|   const int64 timeout_ms_; |
|   bool shutdown_; |
|   bool list_outstanding_urls_on_error_; |
|   bool track_original_content_length_; |
|   uint32 https_options_;  // Composed of HttpsOptions ORed together. |
|   MessageHandler* message_handler_; |
|   GoogleString ssl_certificates_dir_; |
|   GoogleString ssl_certificates_file_; |
| |
|   DISALLOW_COPY_AND_ASSIGN(SerfUrlAsyncFetcher); |
| }; |
| |
| // A single fetch, stored as an element of the Pool<SerfFetch> containers |
| // kept by SerfUrlAsyncFetcher. |
| // |
| // TODO(lsong): Move this to a separate file. Necessary? |
| class SerfFetch : public PoolElement<SerfFetch> { |
|  public: |
|   // TODO(lsong): make use of request_headers. |
|   SerfFetch(const GoogleString& url, AsyncFetch* async_fetch, |
|             MessageHandler* message_handler, Timer* timer); |
|   ~SerfFetch(); |
| |
|   // Kicks off the fetch and returns immediately. May only be invoked while |
|   // fetcher->mutex_ is locked. |
|   bool Start(SerfUrlAsyncFetcher* fetcher); |
| |
|   GoogleString DebugInfo(); |
| |
|   // SerfUrlAsyncFetcher's mutex_ must be held when calling this. |
|   void Cancel(); |
| |
|   // Invokes the user-supplied callback, which has to happen exactly once. |
|   // In some error cases Serf appears to call HandleResponse multiple times |
|   // on the same object. |
|   // |
|   // SerfUrlAsyncFetcher's mutex_ must be held when calling this. |
|   // |
|   // When there are SSL error messages we call CallCallback immediately; |
|   // it is robust against duplicate calls in that case. |
|   void CallCallback(bool success); |
|   void CallbackDone(bool success); |
| |
|   // Cleans up this fetch if the last poll of its connection resulted in an |
|   // error. Call after serf_context_run, with the fetcher's mutex_ held. |
|   void CleanupIfError(); |
| |
|   // Unit-test hook only: calls ParseUrl(), then makes the results available |
|   // for checking through the out-parameters. |
|   void ParseUrlForTesting(bool* status, apr_uri_t** url, |
|                           const char** host_header, const char** sni_host); |
| |
|   void SetFetcherForTesting(SerfUrlAsyncFetcher* fetcher); |
| |
|   int64 TimeDuration() const; |
| |
|   int64 fetch_start_ms() const { return fetch_start_ms_; } |
| |
|   size_t bytes_received() const { return bytes_received_; } |
|   MessageHandler* message_handler() { return message_handler_; } |
| |
|  private: |
|   // Static functions used in callbacks. |
| |
|   // The code under SERF_HTTPS_FETCHING was contributed by Devin Anderson |
|   // (surfacepatterns@gmail.com). |
|   // |
|   // This has to be ifdef'd: calling serf_bucket_ssl_decrypt_create requires |
|   // ssl_buckets.c in the link, and ssl_buckets.c requires openssl. |
| #if SERF_HTTPS_FETCHING |
|   static apr_status_t SSLCertValidate(void* data, int failures, |
|                                       const serf_ssl_certificate_t* cert); |
| |
|   static apr_status_t SSLCertChainValidate( |
|       void* data, int failures, int error_depth, |
|       const serf_ssl_certificate_t* const* certs, apr_size_t certs_count); |
| #endif |
| |
|   static apr_status_t ConnectionSetup(apr_socket_t* socket, |
|                                       serf_bucket_t** read_bkt, |
|                                       serf_bucket_t** write_bkt, |
|                                       void* setup_baton, |
|                                       apr_pool_t* pool); |
|   static void ClosedConnection(serf_connection_t* conn, void* closed_baton, |
|                                apr_status_t why, apr_pool_t* pool); |
|   static serf_bucket_t* AcceptResponse(serf_request_t* request, |
|                                        serf_bucket_t* stream, |
|                                        void* acceptor_baton, |
|                                        apr_pool_t* pool); |
|   static apr_status_t HandleResponse(serf_request_t* request, |
|                                      serf_bucket_t* response, |
|                                      void* handler_baton, |
|                                      apr_pool_t* pool); |
|   static bool MoreDataAvailable(apr_status_t status); |
|   static bool IsStatusOk(apr_status_t status); |
| |
| #if SERF_HTTPS_FETCHING |
|   // Called to report whether SSL certificate errors have been detected. |
|   // Returns SUCCESS in all cases, but sets ssl_error_message_ non-null for |
|   // errors, as a signal to ReadHeaders that we should not let any output |
|   // through. |
|   // |
|   // Interpretation of two of the error conditions is configurable: |
|   // 'allow_unknown_certificate_authority' and 'allow_self_signed'. |
|   // |
|   // If there is a cert that should be checked for a hostname match, pass it |
|   // as cert. Otherwise cert should be null. |
|   apr_status_t HandleSSLCertValidation(int errors, int failure_depth, |
|                                        const serf_ssl_certificate_t* cert); |
| #endif |
| |
|   apr_status_t HandleResponse(serf_bucket_t* response); |
| |
|   apr_status_t ReadStatusLine(serf_bucket_t* response); |
| |
|   // Know what's weird? You have to do a body-read to get access to the |
|   // headers: reading 1 byte of body forces an FSM inside Serf to parse the |
|   // headers. Then you can parse the headers and finally read the rest of |
|   // the body. I know, right? |
|   // |
|   // The simpler approach, and likely what the Serf designers intended, is |
|   // to read the entire body first and then read the headers. But if you are |
|   // trying to stream the data as it's fetched through some kind of function |
|   // that needs to know the content-type, then it's really a drag to have to |
|   // wait till the end of the body to get the content type. |
|   apr_status_t ReadOneByteFromBody(serf_bucket_t* response); |
| |
|   // With that one byte read from the body, we can go ahead and parse the |
|   // headers. The dynamics of this appear to be that for N headers we'll get |
|   // 2N calls to serf_bucket_read: one each for attribute names & values. |
|   apr_status_t ReadHeaders(serf_bucket_t* response); |
| |
|   // With the headers complete we can get the body. The dynamics of this |
|   // are likely dependent on everything on the network between the client |
|   // and server, but for a 10k buffer I seem to frequently get 8k chunks. |
|   apr_status_t ReadBody(serf_bucket_t* response); |
| |
|   // Makes sure a user-agent string is included, and that the mod_pagespeed |
|   // version is appended to it. |
|   void FixUserAgent(); |
|   static apr_status_t SetupRequest(serf_request_t* request, |
|                                    void* setup_baton, |
|                                    serf_bucket_t** req_bkt, |
|                                    serf_response_acceptor_t* acceptor, |
|                                    void** acceptor_baton, |
|                                    serf_response_handler_t* handler, |
|                                    void** handler_baton, |
|                                    apr_pool_t* pool); |
|   bool ParseUrl(); |
| |
|   SerfUrlAsyncFetcher* fetcher_; |
|   Timer* timer_; |
|   const GoogleString str_url_; |
|   AsyncFetch* async_fetch_; |
|   ResponseHeadersParser parser_; |
|   bool status_line_read_; |
|   bool one_byte_read_; |
|   bool has_saved_byte_; |
|   char saved_byte_; |
|   MessageHandler* message_handler_; |
| |
|   apr_pool_t* pool_; |
|   serf_bucket_alloc_t* bucket_alloc_; |
|   apr_uri_t url_; |
|   const char* host_header_;  // in pool_ |
|   const char* sni_host_;  // in pool_ |
|   serf_connection_t* connection_; |
|   size_t bytes_received_; |
|   int64 fetch_start_ms_; |
|   int64 fetch_end_ms_; |
| |
|   // State for HTTPS connection handling. |
|   bool using_https_; |
|   serf_ssl_context_t* ssl_context_; |
|   const char* ssl_error_message_; |
| |
|   DISALLOW_COPY_AND_ASSIGN(SerfFetch); |
| }; |
| |
| } // namespace net_instaweb |
| |
| #endif // PAGESPEED_SYSTEM_SERF_URL_ASYNC_FETCHER_H_ |