// blob: c2c3e35f490a315ce8874a6c7d378bd3d46ea371 [file] [log] [blame]
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jmarantz@google.com (Joshua Marantz)
// Must precede any Apache includes, for now, due to a conflict in
// the use of 'OK' as an Instaweb enum and as an Apache #define.
#include "base/string_util.h"
#include "net/instaweb/apache/header_util.h"
// TODO(jmarantz): serf_url_async_fetcher evidently sets
// 'gzip' unconditionally, and the response includes the 'gzip'
// encoding header, but serf unzips the response itself.
//
// I think the correct behavior is that our async fetcher should
// transmit the 'gzip' request header if it was specified in the call
// to StreamingFetch. This would be easy to fix.
//
// Unfortunately, serf 0.31 appears to unzip the content for us, which
// is not what we are asking for. And it leaves the 'gzip' response
// header in despite having unzipped it itself. I have tried later
// versions of serf, but the API is not compatible (easy to deal with)
// but the resulting binary has unresolved symbols. I am wondering
// whether we will have to go to libcurl.
//
// For now use wget when slurping additional files.
#include "net/instaweb/apache/apache_rewrite_driver_factory.h"
#include "net/instaweb/util/public/message_handler.h"
#include "net/instaweb/util/public/query_params.h"
#include "net/instaweb/util/public/simple_meta_data.h"
#include "net/instaweb/util/public/string_writer.h"
#include "net/instaweb/util/public/url_fetcher.h"
// The Apache headers must be after instaweb headers. Otherwise, the
// compiler will complain
// "strtoul_is_not_a_portable_function_use_strtol_instead".
#include "apr_strings.h" // for apr_pstrdup
#include "http_protocol.h"
namespace net_instaweb {
namespace {
// Default handler used when the slurped resource could not be fetched.
//
// Writes a small HTML error page that names the requested host and URI and
// marks the response as HttpStatus::kNotFound.
//
// r: the Apache request being answered.  Its content type, status,
//    status line and response body are all set here.
void SlurpDefaultHandler(request_rec* r) {
  // Establish the status before any body bytes are queued.  The status line
  // is derived from the numeric status so that it carries the status code,
  // rather than a bare reason phrase.
  r->status = HttpStatus::kNotFound;
  r->status_line = ap_get_status_line(r->status);
  ap_set_content_type(r, "text/html; charset=utf-8");

  // Both values originate from the client's request, so they are HTML-escaped
  // before being echoed back into the page.  A NULL pointer must never reach
  // "%s", so missing values are rendered as an empty string.
  const char* host = "";
  if (r->hostname != NULL) {
    host = ap_escape_html(r->pool, r->hostname);
  }
  const char* uri = "";
  if (r->unparsed_uri != NULL) {
    uri = ap_escape_html(r->pool, r->unparsed_uri);
  }

  std::string buf = StringPrintf(
      "<html><head><title>Slurp Error</title></head>"
      "<body><h1>Slurp failed</h1>\n"
      "<p>host=%s\n"
      "<p>uri=%s\n"
      "</body></html>",
      host, uri);
  ap_rputs(buf.c_str(), r);
}
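// TODO(jmarantz): The ApacheWriter defined below is much more
// efficient than the mechanism we are currently using, which is to
// buffer the entire response in a string and then send it later.
// For some reason, this did not work when I tried it, but it's
// worth another look.
// NOTE(review): SlurpUrl in this file already streams its response
// through ApacheWriter, so this TODO may be stale -- confirm and remove.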
class ApacheWriter : public Writer {
public:
ApacheWriter(request_rec* r, SimpleMetaData* response_headers,
int64 flush_limit)
: request_(r),
response_headers_(response_headers),
size_(0),
flush_limit_(flush_limit),
headers_out_(false) {
}
virtual bool Write(const StringPiece& str, MessageHandler* handler) {
if (!headers_out_) {
OutputHeaders();
}
ap_rwrite(str.data(), str.size(), request_);
size_ += str.size();
if ((flush_limit_ != 0) && (size_ > flush_limit_)) {
Flush(handler);
}
return true;
}
virtual bool Flush(MessageHandler* handler) {
ap_rflush(request_);
size_ = 0;
return true;
}
int64 size() const { return size_; }
void OutputHeaders() {
if (headers_out_) {
return;
}
headers_out_ = true;
// Apache2 defaults to set the status line as HTTP/1.1. If the
// original content was HTTP/1.0, we need to force the server to use
// HTTP/1.0. I'm not sure why/whether we need to do this; it was in
// mod_static from the sdpy project, which is where I copied this
// code from.
if ((response_headers_->major_version() == 1) &&
(response_headers_->minor_version() == 0)) {
apr_table_set(request_->subprocess_env, "force-response-1.0", "1");
}
char* content_type = NULL;
CharStarVector v;
CHECK(response_headers_->headers_complete());
if (response_headers_->Lookup(HttpAttributes::kContentType, &v)) {
CHECK(!v.empty());
// ap_set_content_type does not make a copy of the string, we need
// to duplicate it. Note that we will update the content type below,
// after transforming the headers.
content_type = apr_pstrdup(request_->pool, v[v.size() - 1]);
response_headers_->RemoveAll(HttpAttributes::kContentType);
}
response_headers_->RemoveAll(HttpAttributes::kTransferEncoding);
response_headers_->RemoveAll(HttpAttributes::kContentLength);
MetaDataToApacheHeader(*response_headers_, request_->headers_out,
&request_->status, &request_->proto_num);
if (content_type != NULL) {
ap_set_content_type(request_, content_type);
}
// Recompute the content-length, because the content is decoded.
// TODO(lsong): We don't know the content size, do we?
// ap_set_content_length(request_, contents.size());
}
private:
request_rec* request_;
SimpleMetaData* response_headers_;
int size_;
int flush_limit_;
bool headers_out_;
DISALLOW_COPY_AND_ASSIGN(ApacheWriter);
};
} // namespace
// Remove any mod-pagespeed-specific modifiers before we go to our slurped
// fetcher.
//
// uri:                the URI to clean up.
// query_param_string: the query portion to inspect for parameters whose name
//                     starts with "ModPagespeed".  May be NULL.
//
// Returns 'uri' unchanged when no such parameter is present.  Otherwise
// returns the part of 'uri' before its first '?', followed by "?" and the
// remaining parameters if any are left.
//
// TODO(jmarantz): share the string constants from mod_instaweb.cc and
// formalize the prefix-matching assumed here.
std::string RemoveModPageSpeedQueryParams(
    const std::string& uri, const char* query_param_string) {
  // SlurpUrl forwards r->parsed_uri.query without checking it, so guard
  // against NULL here: with no query string there is nothing to strip.
  if (query_param_string == NULL) {
    return uri;
  }

  static const char kModPagespeed[] = "ModPagespeed";
  QueryParams query_params, stripped_query_params;
  query_params.Parse(query_param_string);

  // Keep every parameter whose name does not start with the prefix, and
  // remember whether anything was dropped.
  bool rewrite_query_params = false;
  for (int i = 0; i < query_params.size(); ++i) {
    const char* name = query_params.name(i);
    if (strncmp(name, kModPagespeed, sizeof(kModPagespeed) - 1) == 0) {
      rewrite_query_params = true;
    } else {
      stripped_query_params.Add(name, query_params.value(i));
    }
  }
  if (!rewrite_query_params) {
    return uri;
  }

  // TODO(jmarantz): It would be nice to use GoogleUrl to do this but
  // it's not clear how it would help.  Instead just hack the string.
  const std::string::size_type question_mark = uri.find('?');
  if (question_mark == std::string::npos) {
    // 'uri' carries no query of its own, so there is nothing to remove from
    // it.  Hand it back as-is rather than aborting the server process.
    return uri;
  }

  std::string stripped_url(uri, 0, question_mark);  // does not include "?" yet
  if (stripped_query_params.size() != 0) {
    stripped_url += StrCat("?", stripped_query_params.ToString());
  }
  return stripped_url;
}
// Fetches 'uri' (minus any ModPagespeed query parameters) with the factory's
// URL fetcher and streams the result into the Apache response for 'r'.
//
// uri:     the URL to fetch.
// factory: supplies the fetcher, the message handler and the flush limit.
// r:       the Apache request being served; its incoming headers are passed
//          along with the fetch.
//
// If the fetch fails, the failure is logged together with the request and
// response headers, and SlurpDefaultHandler produces the response.
void SlurpUrl(const std::string& uri, ApacheRewriteDriverFactory* factory,
              request_rec* r) {
  SimpleMetaData request_headers, response_headers;
  ApacheHeaderToMetaData(r->headers_in, 0, 0, &request_headers);
  ApacheWriter writer(r, &response_headers, factory->slurp_flush_limit());
  std::string stripped_url = RemoveModPageSpeedQueryParams(
      uri, r->parsed_uri.query);
  UrlFetcher* fetcher = factory->ComputeUrlFetcher();
  MessageHandler* handler = factory->message_handler();

  if (fetcher->StreamingFetchUrl(stripped_url, request_headers,
                                 &response_headers, &writer, handler)) {
    // In the event of empty content, the writer's Write method may not be
    // called, but we should still emit headers.
    writer.OutputHeaders();
  } else {
    handler->Message(kError, "mod_slurp: fetch of url %s"
                     " failed.\nRequest Headers: %s\n\nResponse Headers: %s",
                     stripped_url.c_str(),
                     request_headers.ToString().c_str(),
                     response_headers.ToString().c_str());
    SlurpDefaultHandler(r);
  }
}
} // namespace net_instaweb