// blob: cd11b512edf9aa57608c567318732e87a3857612 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Authors: jmarantz@google.com (Joshua Marantz)
// vchudnov@google.com (Victor Chudnovsky)
#include "net/instaweb/http/public/external_url_fetcher.h"
#include <sys/wait.h>  // for WIFEXITED, WEXITSTATUS
#include <cerrno>
#include <cstdio> // for pclose, popen, FILE
#include <cstring>  // for strerror
#include "base/logging.h"
#include "net/instaweb/http/public/async_fetch.h"
#include "net/instaweb/http/public/http_response_parser.h"
#include "pagespeed/kernel/base/message_handler.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/http_names.h"
#include "pagespeed/kernel/http/request_headers.h"
#include "pagespeed/kernel/http/response_headers.h"
namespace {
// It turns out to be harder to quote in bash with single-quote
// than double-quote. From man sh:
//
// Single Quotes
// Enclosing characters in single quotes preserves the literal meaning of
// all the characters (except single quotes, making it impossible to put
// single-quotes in a single-quoted string).
//
// Double Quotes
// Enclosing characters within double quotes preserves the literal meaning
// of all characters except dollarsign ($), backquote (`), and backslash
// (\). The backslash inside double quotes is historically weird, and
// serves to quote only the following characters:
// $ ` " \ <newline>.
// Otherwise it remains literal.
//
// So we put double-quotes around most strings, after first escaping
// any of these characters.  This is the set handed to BackslashEscape for
// the URL and for every request-header name and value in this file:
// double-quote, dollarsign, backquote and backslash.
const char kEscapeChars[] = "\"$`\\";
} // namespace
namespace net_instaweb {
// Default the user agent to a Chrome user-agent string, so that we get the
// real website.  Fetch() passes this to ConstructFetchCommand only when the
// request headers carry no "user-agent" attribute of their own.
const char ExternalUrlFetcher::kDefaultUserAgent[] =
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) "
"AppleWebKit/534.0 (KHTML, like Gecko) Chrome/6.0.408.1 Safari/534.0";
void ExternalUrlFetcher::AppendHeaders(const RequestHeaders& request_headers,
StringVector* escaped_headers) {
for (int i = 0, n = request_headers.NumAttributes(); i < n; ++i) {
GoogleString escaped_name, escaped_value;
BackslashEscape(request_headers.Name(i), kEscapeChars, &escaped_name);
BackslashEscape(request_headers.Value(i), kEscapeChars, &escaped_value);
StrAppend(&escaped_name, ": ", escaped_value);
escaped_headers->push_back(escaped_name);
}
}
// Fetches |url| by running an external command and parsing what that
// command prints on stdout as an HTTP response.
//
// Steps:
//   1. Shell-escape the URL and all request headers (see kEscapeChars).
//   2. Ask ConstructFetchCommand for the command line.  kDefaultUserAgent is
//      passed when the request has no "user-agent" header; otherwise NULL is
//      passed (presumably so the request's own header is used -- the handling
//      of NULL lives in ConstructFetchCommand, which is not defined here).
//   3. popen() the command and feed its stdout to HttpResponseParser, which
//      fills in fetch->response_headers() and writes to |fetch|.
//   4. If the command reports a non-zero termination status and no status
//      code was parsed, synthesize an HttpStatus::kBadRequest
//      "Command Failed" response with a short explanatory body.
//
// fetch->Done() is called exactly once on every path: with the parser's
// result when the command could be started, and with false when popen()
// failed.  Failures are reported to |handler|.
void ExternalUrlFetcher::Fetch(
    const GoogleString& url, MessageHandler* handler, AsyncFetch* fetch) {
  const RequestHeaders& request_headers = *fetch->request_headers();
  ResponseHeaders* response_headers = fetch->response_headers();

  // Use default user-agent if none is set in headers.
  ConstStringStarVector values;
  request_headers.Lookup("user-agent", &values);
  const char* user_agent = values.empty() ? kDefaultUserAgent : NULL;

  StringVector escaped_headers;
  AppendHeaders(request_headers, &escaped_headers);

  GoogleString escaped_url;
  BackslashEscape(url, kEscapeChars, &escaped_url);
  GoogleString cmd = ConstructFetchCommand(escaped_url,
                                           user_agent,
                                           escaped_headers);

  handler->Message(kInfo, "%s --... %s\n", GetFetchLabel(), url.c_str());
  VLOG(2) << "Running: " << cmd;
  FILE* cmd_stdout = popen(cmd.c_str(), "r");

  bool ret = false;
  if (cmd_stdout == NULL) {
    // Snapshot errno before making any other call that might overwrite it.
    const int popen_errno = errno;
    handler->Message(kError, "Fetch command popen failed on url %s: %s",
                     url.c_str(), strerror(popen_errno));
  } else {
    HttpResponseParser parser(response_headers, fetch, handler);
    ret = parser.Parse(cmd_stdout);

    // pclose() returns a wait status in the format of waitpid(), or -1 on
    // error -- not the command's exit code.  Kept as a non-const lvalue
    // because some libc implementations of the W* macros take its address.
    int wait_status = pclose(cmd_stdout);
    if (wait_status != 0) {
      // The command failed.  Some (all?) commands do not always
      // (ever?) write appropriate headers when it fails, so invent
      // some.
      if (response_headers->status_code() == 0) {
        response_headers->set_first_line(1, 1, HttpStatus::kBadRequest,
                                         "Command Failed");
        response_headers->ComputeCaching();
        // TODO(jmarantz): set_headers_complete
        // response_headers->set_headers_complete(true);

        // Report the real exit code when the command exited normally.  If
        // it was killed by a signal, or pclose() itself failed, fall back
        // to the raw status value.
        int exit_status = wait_status;
        if (wait_status != -1 && WIFEXITED(wait_status)) {
          exit_status = WEXITSTATUS(wait_status);
        }

        fetch->Write(GetFetchLabel(), handler);
        fetch->Write(" failed: ", handler);
        fetch->Write(url, handler);
        fetch->Write("<br>\nExit Status: ", handler);
        fetch->Write(IntegerToString(exit_status), handler);
      }
    }
  }
  fetch->Done(ret);
}
void ExternalUrlFetcher::set_binary(const GoogleString& binary) {
binary_ = binary;
}
} // namespace net_instaweb