blob: 18003ecec9eff93dde8d268487fdd361eaf1a2c1 [file] [log] [blame]
#!/usr/bin/python
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fetches a set of URLs via a proxy, keeping statistics.
This script attempts to fetch all URLs listed (one per line) in the file
given by --urls_file via the proxy given by --proxy_host and --proxy_port.
It differs from the widely available tools in that:
  - The proxy connection is kept alive to try to maximize throughput.
  - Statuses and completion times (in milliseconds) for each URL are output
    to stdout to help analyze the results.

With the --js option the output is a JavaScript object literal whose fields
are named for the URLs, with http:// replaced by whatever is passed as
test_cat followed by a dash; each field's value is the fetch time in
milliseconds.
"""
__author__ = "morlovich@google.com (Maksim Orlovich)"
import getopt
import httplib
import socket
import sys
import time
import urlparse
def OpenProxy(config):
    """Opens and returns a connected HTTP(S) connection to the proxy.

    Args:
      config: settings object providing ssl_mode, proxy_host and proxy_port.

    Returns:
      A connected httplib.HTTPSConnection when config.ssl_mode is set,
      otherwise a connected httplib.HTTPConnection.
    """
    connection_class = (httplib.HTTPSConnection if config.ssl_mode
                        else httplib.HTTPConnection)
    connection = connection_class(config.proxy_host, config.proxy_port)
    # Connect right away so connection failures surface here rather than on
    # the first request.
    connection.connect()
    return connection
def ReopenProxy(config, old_proxy):
    """Closes old_proxy and returns a newly opened proxy connection.

    Args:
      config: settings object passed through to OpenProxy.
      old_proxy: the connection to close before reconnecting.

    Returns:
      The connection returned by OpenProxy(config).
    """
    old_proxy.close()
    replacement = OpenProxy(config)
    return replacement
def TestName(config, test_url):
    """Returns the JavaScript field name used for test_url.

    Every occurrence of "http://" in test_url is replaced by config.test_cat
    followed by a dash.

    Args:
      config: settings object providing test_cat.
      test_url: the URL to derive the name from.
    """
    category_prefix = config.test_cat + "-"
    return test_url.replace("http://", category_prefix)
def FormatResult(config, time_str, status, test_url):
    """Formats one fetch result as a line of output.

    Args:
      config: settings object providing js_mode (and test_cat in JS mode).
      time_str: the elapsed time, already converted to a string.
      status: the status to report, as a string.
      test_url: the URL that was fetched.

    Returns:
      In JS mode, a '"name": time,' object-literal entry (the status is not
      included); otherwise a "time status url" line.
    """
    if not config.js_mode:
        return "%s %s %s" % (time_str, status, test_url)
    field_name = TestName(config, test_url)
    return '"%s": %s,' % (field_name, time_str)
class Configuration(object):
    """Packages up execution settings parsed from the command line.

    Attributes:
      ssl_mode: True when --ssl was given.
      js_mode: True when --js was given.
      test_cat: the value passed to --js; "" when --js was not given.
      proxy_host: the value passed to --proxy_host (required).
      proxy_port: the value passed to --proxy_port as an int, or None when
        the option was not given (httplib then picks the port itself).
      urls_file: the value passed to --urls_file (required).
      has_user_agent: True when --user_agent was given.
      user_agent: the value passed to --user_agent; "" when not given.
    """

    _USAGE = ("Usage: devel/fetch_all.py [--ssl] [--js test_cat] "
              "--proxy_host host [--proxy_port port] --urls_file file "
              "[--user_agent user_agent]")

    def __init__(self):
        """Initializes settings from command-line arguments.

        Reads sys.argv. On an unknown option, a non-integer --proxy_port or
        a missing required option, prints a diagnostic and the usage text
        and exits with status 2.
        """
        try:
            opts, _ = getopt.getopt(sys.argv[1:], "",
                                    ["ssl", "js=", "proxy_host=",
                                     "proxy_port=", "urls_file=",
                                     "user_agent="])
        except getopt.GetoptError as err:
            self._ExitWithUsage(str(err))

        # Give every attribute a value up front so that an omitted option
        # never shows up later as an AttributeError.
        self.ssl_mode = False
        self.js_mode = False
        self.test_cat = ""
        self.proxy_host = None
        self.proxy_port = None
        self.urls_file = None
        self.has_user_agent = False
        self.user_agent = ""

        for name, value in opts:
            if name == "--ssl":
                self.ssl_mode = True
            elif name == "--js":
                self.js_mode = True
                self.test_cat = value
            elif name == "--proxy_host":
                self.proxy_host = value
            elif name == "--proxy_port":
                try:
                    self.proxy_port = int(value)
                except ValueError:
                    self._ExitWithUsage(
                        "option --proxy_port requires an integer, got %r"
                        % value)
            elif name == "--urls_file":
                self.urls_file = value
            elif name == "--user_agent":
                self.has_user_agent = True
                self.user_agent = value

        # The script cannot do anything useful without these two.
        for required in ("proxy_host", "urls_file"):
            if not getattr(self, required):
                self._ExitWithUsage("option --%s is required" % required)

    def _ExitWithUsage(self, message):
        """Prints message followed by the usage text, then exits with 2."""
        print(message)
        print(self._USAGE)
        sys.exit(2)
def _BuildRequestHeaders(conf):
    """Returns the headers sent with every request, derived from conf."""
    headers = {"Accept-Encoding": "gzip"}
    if conf.has_user_agent:
        headers["User-Agent"] = conf.user_agent
        if "Chrome/" in conf.user_agent:
            headers["Accept"] = "image/webp"
    return headers


def main():
    """Fetches every URL in conf.urls_file via the proxy and prints results.

    Each non-blank line of the URLs file is fetched over a persistent proxy
    connection, following up to 5 redirects (301, 302, 303, 307). One result
    line, as produced by FormatResult, is printed per request. In JS mode
    the results are wrapped in "{" and "}". A BadStatusLine, IncompleteRead
    or socket error is reported with a time of "0" and the proxy connection
    is reopened before moving on to the next URL.
    """
    conf = Configuration()
    # The headers depend only on the configuration, so build them once.
    headers = _BuildRequestHeaders(conf)
    # Open a persistent connection to the proxy
    proxy = OpenProxy(conf)
    if conf.js_mode:
        print("{")
    # "with" guarantees the file is closed even if an unexpected exception
    # escapes the loop.
    with open(conf.urls_file, "rt") as urls:
        for line in urls:
            url = line.strip()
            if not url:
                # Blank lines (e.g. a trailing newline) are not URLs.
                continue
            try:
                # Fetch url
                followed = 0
                while followed < 5:
                    if conf.ssl_mode:
                        url = url.replace("http://", "https://", 1)
                    start = time.time()
                    proxy.request("GET", url, None, headers)
                    response = proxy.getresponse()
                    # Drain the body so the timing covers the whole transfer
                    # and the connection can be reused.
                    response.read()
                    stop = time.time()
                    status = response.status
                    # Honor server's close request
                    connect_ctl = response.getheader("connection", default="")
                    if "close" in connect_ctl.lower():
                        proxy = ReopenProxy(conf, proxy)
                    # Report.
                    print(FormatResult(conf, str((stop - start) * 1000),
                                       str(status), url))
                    # Handle redirections
                    if not (301 <= status <= 303 or status == 307):
                        break
                    location = response.getheader("Location", default="")
                    if not location:
                        # Without a target, urljoin would hand back the same
                        # URL and we would just re-fetch it until the
                        # redirect limit is reached.
                        break
                    url = urlparse.urljoin(url, location).strip()
                    followed += 1
            except httplib.BadStatusLine:
                print(FormatResult(conf, "0", "BadStatusLine", url))
                proxy = ReopenProxy(conf, proxy)
            except httplib.IncompleteRead:
                print(FormatResult(conf, "0", "IncompleteRead", url))
                proxy = ReopenProxy(conf, proxy)
            except socket.error:
                print(FormatResult(conf, "0", "SocketError", url))
                proxy = ReopenProxy(conf, proxy)
    if conf.js_mode:
        print("}")
# Run the fetch loop only when executed as a script, not when imported.
if __name__ == "__main__":
    main()