src/net/instaweb/apache/apache_slurp.cc - incubator-pagespeed-mod - Git at Google

 // Copyright 2010 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Author: jmarantz@google.com (Joshua Marantz)

 // Must precede any Apache includes, for now, due to a conflict in
 // the use of 'OK' as an Instaweb enum and as an Apache #define.
 #include "base/string_util.h"
 #include "net/instaweb/apache/header_util.h"

 // TODO(jmarantz): serf_url_async_fetcher evidently sets
 // 'gzip' unconditionally, and the response includes the 'gzip'
 // encoding header, but serf unzips the response itself.
 //
 // I think the correct behavior is that our async fetcher should
 // transmit the 'gzip' request header if it was specified in the call
 // to StreamingFetch.  This would be easy to fix.
 //
 // Unfortunately, serf 0.31 appears to unzip the content for us, which
 // is not what we are asking for.  And it leaves the 'gzip' response
 // header in despite having unzipped it itself.  I have tried later
 // versions of serf, but the API is not compatible (easy to deal with)
 // but the resulting binary has unresolved symbols.  I am wondering
 // whether we will have to go to libcurl.
 //
 // For now use wget when slurping additional files.

 #include "net/instaweb/apache/apache_rewrite_driver_factory.h"
 #include "net/instaweb/util/public/message_handler.h"
 #include "net/instaweb/util/public/query_params.h"
 #include "net/instaweb/util/public/simple_meta_data.h"
 #include "net/instaweb/util/public/string_writer.h"
 #include "net/instaweb/util/public/url_fetcher.h"

 // The Apache headers must be after instaweb headers.  Otherwise, the
 // compiler will complain
 //   "strtoul_is_not_a_portable_function_use_strtol_instead".
 #include "apr_strings.h"  // for apr_pstrdup
 #include "http_protocol.h"

 namespace net_instaweb {

 namespace {

 // Default handler when the file is not found.
 //
 // Marks the response as 404 and emits a small HTML error page naming the
 // requested host and URI.  Both values come from the client request, so
 // they are HTML-escaped before being echoed back.
 void SlurpDefaultHandler(request_rec* r) {
   // Set the status before producing any body, so the response line does not
   // depend on the body still being buffered when this function returns.
   r->status = HttpStatus::kNotFound;
   // status_line must carry the numeric code as well as the reason phrase
   // (e.g. "404 Not Found"); let Apache supply the canonical text.
   r->status_line = ap_get_status_line(r->status);
   ap_set_content_type(r, "text/html; charset=utf-8");

   // Either field may be NULL; passing NULL to a %s conversion is undefined
   // behavior, so substitute an empty string.
   const char* host = (r->hostname != NULL) ? r->hostname : "";
   const char* uri = (r->unparsed_uri != NULL) ? r->unparsed_uri : "";

   std::string buf = StringPrintf(
       "<html><head><title>Slurp Error</title></head>"
       "<body><h1>Slurp failed</h1>\n"
       "<p>host=%s\n"
       "<p>uri=%s\n"
       "</body></html>",
       ap_escape_html(r->pool, host),
       ap_escape_html(r->pool, uri));
   ap_rputs(buf.c_str(), r);
 }

 // TODO(jmarantz): The ApacheWriter defined below is much more
 // efficient than the mechanism we are currently using, which is to
 // buffer the entire response in a string and then send it later.
 // For some reason, this did not work when I tried it, but it's
 // worth another look.

 class ApacheWriter : public Writer {
  public:
   ApacheWriter(request_rec* r, SimpleMetaData* response_headers,
                int64 flush_limit)
       : request_(r),
         response_headers_(response_headers),
         size_(0),
         flush_limit_(flush_limit),
         headers_out_(false) {
   }

   virtual bool Write(const StringPiece& str, MessageHandler* handler) {
     if (!headers_out_) {
       OutputHeaders();
     }
     ap_rwrite(str.data(), str.size(), request_);
     size_ += str.size();
     if ((flush_limit_ != 0) && (size_ > flush_limit_)) {
       Flush(handler);
     }
     return true;
   }

   virtual bool Flush(MessageHandler* handler) {
     ap_rflush(request_);
     size_ = 0;
     return true;
   }

   int64 size() const { return size_; }

   void OutputHeaders() {
     if (headers_out_) {
       return;
     }
     headers_out_ = true;

     // Apache2 defaults to set the status line as HTTP/1.1.  If the
     // original content was HTTP/1.0, we need to force the server to use
     // HTTP/1.0.  I'm not sure why/whether we need to do this; it was in
     // mod_static from the sdpy project, which is where I copied this
     // code from.
     if ((response_headers_->major_version() == 1) &&
         (response_headers_->minor_version() == 0)) {
       apr_table_set(request_->subprocess_env, "force-response-1.0", "1");
     }

     char* content_type = NULL;
     CharStarVector v;
     CHECK(response_headers_->headers_complete());
     if (response_headers_->Lookup(HttpAttributes::kContentType, &v)) {
       CHECK(!v.empty());
       // ap_set_content_type does not make a copy of the string, we need
       // to duplicate it.  Note that we will update the content type below,
       // after transforming the headers.
       content_type = apr_pstrdup(request_->pool, v[v.size() - 1]);
       response_headers_->RemoveAll(HttpAttributes::kContentType);
     }
     response_headers_->RemoveAll(HttpAttributes::kTransferEncoding);
     response_headers_->RemoveAll(HttpAttributes::kContentLength);
     MetaDataToApacheHeader(*response_headers_, request_->headers_out,
                            &request_->status, &request_->proto_num);
     if (content_type != NULL) {
       ap_set_content_type(request_, content_type);
     }

     // Recompute the content-length, because the content is decoded.
     // TODO(lsong): We don't know the content size, do we?
     // ap_set_content_length(request_, contents.size());
   }

  private:
   request_rec* request_;
   SimpleMetaData* response_headers_;
   int size_;
   int flush_limit_;
   bool headers_out_;

   DISALLOW_COPY_AND_ASSIGN(ApacheWriter);
 };

 }  // namespace

 // Remove any mod-pagespeed-specific modifiers before we go to our slurped
 // fetcher.
 //
 // uri is the request URI and query_param_string is its query component,
 // which may be NULL.  Returns uri without the query parameters whose names
 // start with "ModPagespeed".  If no such parameter is present, or there is
 // no query to examine, uri is returned unchanged.
 //
 // TODO(jmarantz): share the string constants from mod_instaweb.cc and
 // formalize the prefix-matching assumed here.
 std::string RemoveModPageSpeedQueryParams(
     const std::string& uri, const char* query_param_string) {
   // With no query string, or no '?' in the URI, there is nothing to strip.
   // Returning early avoids parsing a NULL pointer and avoids aborting the
   // server process when the two arguments disagree.
   const std::string::size_type question_mark = uri.find('?');
   if ((query_param_string == NULL) || (question_mark == std::string::npos)) {
     return uri;
   }

   QueryParams query_params, stripped_query_params;
   query_params.Parse(query_param_string);
   bool rewrite_query_params = false;

   static const char kModPagespeed[] = "ModPagespeed";
   for (int i = 0; i < query_params.size(); ++i) {
     const char* name = query_params.name(i);
     // Prefix match: sizeof includes the trailing NUL, hence the -1.
     if (strncmp(name, kModPagespeed, sizeof(kModPagespeed) - 1) == 0) {
       rewrite_query_params = true;
     } else {
       stripped_query_params.Add(name, query_params.value(i));
     }
   }

   if (!rewrite_query_params) {
     return uri;
   }

   // TODO(jmarantz): It would be nice to use GoogleUrl to do this but
   // it's not clear how it would help.  Instead just hack the string.
   std::string stripped_url(uri, 0, question_mark);  // no "?" yet
   if (stripped_query_params.size() != 0) {
     stripped_url += StrCat("?", stripped_query_params.ToString());
   }
   return stripped_url;
 }

 // Fetches uri, minus any ModPagespeed query parameters, with the factory's
 // URL fetcher and streams the result to the Apache request r.  The incoming
 // request headers are forwarded to the fetch.  If the fetch fails, the
 // failure is logged and the default "Slurp failed" page is served instead.
 void SlurpUrl(const std::string& uri, ApacheRewriteDriverFactory* factory,
               request_rec* r) {
   SimpleMetaData request_headers, response_headers;
   ApacheHeaderToMetaData(r->headers_in, 0, 0, &request_headers);
   ApacheWriter writer(r, &response_headers, factory->slurp_flush_limit());

   std::string stripped_url = RemoveModPageSpeedQueryParams(
       uri, r->parsed_uri.query);

   UrlFetcher* fetcher = factory->ComputeUrlFetcher();
   MessageHandler* handler = factory->message_handler();
   if (fetcher->StreamingFetchUrl(stripped_url, request_headers,
                                  &response_headers, &writer, handler)) {
     // In the event of empty content, the writer's Write method may not be
     // called, but we should still emit headers.
     writer.OutputHeaders();
   } else {
     handler->Message(kError, "mod_slurp: fetch of url %s"
                      " failed.\nRequest Headers: %s\n\nResponse Headers: %s",
                      stripped_url.c_str(),
                      request_headers.ToString().c_str(),
                      response_headers.ToString().c_str());
     SlurpDefaultHandler(r);
   }
 }

 }  // namespace net_instaweb
	// Copyright 2010 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	// Author: jmarantz@google.com (Joshua Marantz)

	// Must precede any Apache includes, for now, due to a conflict in
	// the use of 'OK' as an Instaweb enum and as an Apache #define.
	#include "base/string_util.h"
	#include "net/instaweb/apache/header_util.h"

	// TODO(jmarantz): serf_url_async_fetcher evidently sets
	// 'gzip' unconditionally, and the response includes the 'gzip'
	// encoding header, but serf unzips the response itself.
	//
	// I think the correct behavior is that our async fetcher should
	// transmit the 'gzip' request header if it was specified in the call
	// to StreamingFetch. This would be easy to fix.
	//
	// Unfortunately, serf 0.31 appears to unzip the content for us, which
	// is not what we are asking for. And it leaves the 'gzip' response
	// header in despite having unzipped it itself. I have tried later
	// versions of serf, but the API is not compatible (easy to deal with)
	// but the resulting binary has unresolved symbols. I am wondering
	// whether we will have to go to libcurl.
	//
	// For now use wget when slurping additional files.

	#include "net/instaweb/apache/apache_rewrite_driver_factory.h"
	#include "net/instaweb/util/public/message_handler.h"
	#include "net/instaweb/util/public/query_params.h"
	#include "net/instaweb/util/public/simple_meta_data.h"
	#include "net/instaweb/util/public/string_writer.h"
	#include "net/instaweb/util/public/url_fetcher.h"

	// The Apache headers must be after instaweb headers. Otherwise, the
	// compiler will complain
	// "strtoul_is_not_a_portable_function_use_strtol_instead".
	#include "apr_strings.h" // for apr_pstrdup
	#include "http_protocol.h"

	namespace net_instaweb {

	namespace {

	// Default handler when the file is not found.
	//
	// Marks the response as 404 and emits a small HTML error page naming the
	// requested host and URI.  Both values come from the client request, so
	// they are HTML-escaped before being echoed back.
	void SlurpDefaultHandler(request_rec* r) {
	  // Set the status before producing any body, so the response line does
	  // not depend on the body still being buffered when this function returns.
	  r->status = HttpStatus::kNotFound;
	  // status_line must carry the numeric code as well as the reason phrase
	  // (e.g. "404 Not Found"); let Apache supply the canonical text.
	  r->status_line = ap_get_status_line(r->status);
	  ap_set_content_type(r, "text/html; charset=utf-8");

	  // Either field may be NULL; passing NULL to a %s conversion is undefined
	  // behavior, so substitute an empty string.
	  const char* host = (r->hostname != NULL) ? r->hostname : "";
	  const char* uri = (r->unparsed_uri != NULL) ? r->unparsed_uri : "";

	  std::string buf = StringPrintf(
	      "<html><head><title>Slurp Error</title></head>"
	      "<body><h1>Slurp failed</h1>\n"
	      "<p>host=%s\n"
	      "<p>uri=%s\n"
	      "</body></html>",
	      ap_escape_html(r->pool, host),
	      ap_escape_html(r->pool, uri));
	  ap_rputs(buf.c_str(), r);
	}

	// TODO(jmarantz): The ApacheWriter defined below is much more
	// efficient than the mechanism we are currently using, which is to
	// buffer the entire response in a string and then send it later.
	// For some reason, this did not work when I tried it, but it's
	// worth another look.

	class ApacheWriter : public Writer {
	public:
	ApacheWriter(request_rec* r, SimpleMetaData* response_headers,
	int64 flush_limit)
	: request_(r),
	response_headers_(response_headers),
	size_(0),
	flush_limit_(flush_limit),
	headers_out_(false) {
	}

	virtual bool Write(const StringPiece& str, MessageHandler* handler) {
	if (!headers_out_) {
	OutputHeaders();
	}
	ap_rwrite(str.data(), str.size(), request_);
	size_ += str.size();
	if ((flush_limit_ != 0) && (size_ > flush_limit_)) {
	Flush(handler);
	}
	return true;
	}

	virtual bool Flush(MessageHandler* handler) {
	ap_rflush(request_);
	size_ = 0;
	return true;
	}

	int64 size() const { return size_; }

	void OutputHeaders() {
	if (headers_out_) {
	return;
	}
	headers_out_ = true;

	// Apache2 defaults to set the status line as HTTP/1.1. If the
	// original content was HTTP/1.0, we need to force the server to use
	// HTTP/1.0. I'm not sure why/whether we need to do this; it was in
	// mod_static from the sdpy project, which is where I copied this
	// code from.
	if ((response_headers_->major_version() == 1) &&
	(response_headers_->minor_version() == 0)) {
	apr_table_set(request_->subprocess_env, "force-response-1.0", "1");
	}

	char* content_type = NULL;
	CharStarVector v;
	CHECK(response_headers_->headers_complete());
	if (response_headers_->Lookup(HttpAttributes::kContentType, &v)) {
	CHECK(!v.empty());
	// ap_set_content_type does not make a copy of the string, we need
	// to duplicate it. Note that we will update the content type below,
	// after transforming the headers.
	content_type = apr_pstrdup(request_->pool, v[v.size() - 1]);
	response_headers_->RemoveAll(HttpAttributes::kContentType);
	}
	response_headers_->RemoveAll(HttpAttributes::kTransferEncoding);
	response_headers_->RemoveAll(HttpAttributes::kContentLength);
	MetaDataToApacheHeader(*response_headers_, request_->headers_out,
	&request_->status, &request_->proto_num);
	if (content_type != NULL) {
	ap_set_content_type(request_, content_type);
	}

	// Recompute the content-length, because the content is decoded.
	// TODO(lsong): We don't know the content size, do we?
	// ap_set_content_length(request_, contents.size());
	}

	private:
	request_rec* request_;
	SimpleMetaData* response_headers_;
	int size_;
	int flush_limit_;
	bool headers_out_;

	DISALLOW_COPY_AND_ASSIGN(ApacheWriter);
	};

	} // namespace

	// Remove any mod-pagespeed-specific modifiers before we go to our slurped
	// fetcher.
	//
	// uri is the request URI and query_param_string is its query component,
	// which may be NULL.  Returns uri without the query parameters whose names
	// start with "ModPagespeed".  If no such parameter is present, or there is
	// no query to examine, uri is returned unchanged.
	//
	// TODO(jmarantz): share the string constants from mod_instaweb.cc and
	// formalize the prefix-matching assumed here.
	std::string RemoveModPageSpeedQueryParams(
	    const std::string& uri, const char* query_param_string) {
	  // With no query string, or no '?' in the URI, there is nothing to strip.
	  // Returning early avoids parsing a NULL pointer and avoids aborting the
	  // server process when the two arguments disagree.
	  const std::string::size_type question_mark = uri.find('?');
	  if ((query_param_string == NULL) || (question_mark == std::string::npos)) {
	    return uri;
	  }

	  QueryParams query_params, stripped_query_params;
	  query_params.Parse(query_param_string);
	  bool rewrite_query_params = false;

	  static const char kModPagespeed[] = "ModPagespeed";
	  for (int i = 0; i < query_params.size(); ++i) {
	    const char* name = query_params.name(i);
	    // Prefix match: sizeof includes the trailing NUL, hence the -1.
	    if (strncmp(name, kModPagespeed, sizeof(kModPagespeed) - 1) == 0) {
	      rewrite_query_params = true;
	    } else {
	      stripped_query_params.Add(name, query_params.value(i));
	    }
	  }

	  if (!rewrite_query_params) {
	    return uri;
	  }

	  // TODO(jmarantz): It would be nice to use GoogleUrl to do this but
	  // it's not clear how it would help. Instead just hack the string.
	  std::string stripped_url(uri, 0, question_mark);  // no "?" yet
	  if (stripped_query_params.size() != 0) {
	    stripped_url += StrCat("?", stripped_query_params.ToString());
	  }
	  return stripped_url;
	}

	// Fetches uri, minus any ModPagespeed query parameters, with the factory's
	// URL fetcher and streams the result to the Apache request r.  The incoming
	// request headers are forwarded to the fetch.  If the fetch fails, the
	// failure is logged and the default "Slurp failed" page is served instead.
	void SlurpUrl(const std::string& uri, ApacheRewriteDriverFactory* factory,
	              request_rec* r) {
	  SimpleMetaData request_headers, response_headers;
	  ApacheHeaderToMetaData(r->headers_in, 0, 0, &request_headers);
	  ApacheWriter writer(r, &response_headers, factory->slurp_flush_limit());

	  std::string stripped_url = RemoveModPageSpeedQueryParams(
	      uri, r->parsed_uri.query);

	  UrlFetcher* fetcher = factory->ComputeUrlFetcher();
	  MessageHandler* handler = factory->message_handler();
	  if (fetcher->StreamingFetchUrl(stripped_url, request_headers,
	                                 &response_headers, &writer, handler)) {
	    // In the event of empty content, the writer's Write method may not be
	    // called, but we should still emit headers.
	    writer.OutputHeaders();
	  } else {
	    handler->Message(kError, "mod_slurp: fetch of url %s"
	                     " failed.\nRequest Headers: %s\n\nResponse Headers: %s",
	                     stripped_url.c_str(),
	                     request_headers.ToString().c_str(),
	                     response_headers.ToString().c_str());
	    SlurpDefaultHandler(r);
	  }
	}

	} // namespace net_instaweb