import re
import sys
import time
import DNSFakeoutHelpers
import ConnectorHelpers
import RSSConnectorHelpers
import sqatools
from sqatools import LicenseMakerClient
import TestDocs
import VirtualBrowser
import MetaCartaVersion
# Look up one document in the document status report and extract its scheduled expire/refetch time
def run_document_status_report( username, password, connection_name, job_id, url_match ):
    """ Return a single (expiration time, refetch time) tuple for the document matching url_match.

    The document status report is run through the UI helper for the given
    connection and job, filtered by the identifier regexp url_match.
    Exactly one row must come back; anything else raises Exception.

    The row's "Scheduled Action" decides which slot of the tuple is filled:
      "Expire"  -> ( parsed scheduled time, -1 )
      "Process" -> ( -1, parsed scheduled time )
      otherwise -> ( -1, -1 )
    The time is whatever ConnectorHelpers.parse_date_time returns for the
    row's "Scheduled" value; -1 marks a slot that does not apply.
    """
    rows = ConnectorHelpers.run_document_status_ui( username, password, connection_name,
                                                    [ job_id ], identifier_regexp=url_match )
    row_count = len(rows)
    if row_count != 1:
        raise Exception("Expecting to see a single row for identifier %s in document status report, saw %d" % (url_match,row_count))

    row = rows[0]
    when = row[ "Scheduled" ]
    action = row[ "Scheduled Action" ]

    # Only the two known actions carry a time worth parsing.
    if action not in ( "Expire", "Process" ):
        return ( -1, -1 )

    parsed = ConnectorHelpers.parse_date_time(when)
    if action == "Expire":
        return ( parsed, -1 )
    return ( -1, parsed )
# Some of these feeds are ill-behaved in that they reference documents in other domains. This is a problem for the test, because there is no guarantee that all the
# feeds that contribute documents to a domain will be processed simultaneously. feedburner, for example, references lots of domains but also has lots of feeds, so
# there is a considerable delay before all the documents referenced by feedburner actually make it into the queue. In addition, carrydown constraints cause recrawling of
# some documents up to a dozen times or so (because of the quantity of feed references found).
# A tight scheduling-only test would therefore not include feedburner or any other large aggregator. Unfortunately, I have no simple means of separating one kind of feed from another;
# doing so requires data analysis on the sqlite database, for which I have not written the tools. So, I have adopted the approach of allowing looser boundaries.
#
# Seed URLs handed to the RSS job definition in the main section below.
# NOTE(review): this list literal is never closed before the next assignment, and the entries
# that are visible do not look like complete URLs -- the remaining seeds appear to be missing
# from this copy of the file; restore them from the original.
seed_list = ["",
"Sports from",
# In case I ever come up with a list of servers that are treated especially badly, I can remove them from consideration here.
# Rows of the session report whose server name is in this list are skipped when computing the maximum delay.
servers_to_ignore = [ "" ]
# Crawl user credentials (created during setup in the main section, deleted again during cleanup)
username = "testingest"
password = "testingest"
def preclean( target_server, print_errors=True ):
''' Clean up everything we might have done during the execution of this test.
This will include all jobs and ingested documents.

Every cleanup step is best-effort: a failure is caught and, when print_errors
is true, a short message and the exception are printed; nothing is re-raised.

target_server -- not referenced by any line visible in this function.
print_errors  -- when false, cleanup failures are swallowed silently.

NOTE(review): the "try:" lines, and most of the cleanup calls that the
"except" clauses below belong to, are not present in this copy of the file.
Restore them from the original before running. '''
# End all jobs and clean up before undoing redirection
except Exception, e:
if print_errors:
print "Error resetting all jobs"
print e
# End redirection first, then clean up session. This permits us to clean up the
# session on the correct websimulator machine.
except Exception, e:
if print_errors:
print "Error ending dns redirection"
print e
# End the current session - but specifically tell it to end the session on
# the websimulator machine we intend to use.
# End the current session
except Exception, e:
if print_errors:
print "Error ending dns capture session"
print e
# Remove the crawl user created by a previous run of the test
ConnectorHelpers.delete_crawler_user( username )
except Exception, e:
if print_errors:
print "Error deleting crawl user"
print e
# Presumably removes a previously installed license, judging by the error message -- the call itself is not visible here
except Exception, e:
if print_errors:
print "Error cleaning up old license"
print e
# Tear down whatever connector environment an earlier run left behind
ConnectorHelpers.teardown_connector_environment( )
except Exception, e:
if print_errors:
print "Error cleaning up debs"
print e
# Main
# Overall flow of the visible script: preclean, install the license, create the crawl user and
# output connection, define the RSS connection and job, start and abort the job (timing the abort),
# restart it, cycle the agents, wait for completion, check the per-server delay in the session
# report, then tear everything down.
# NOTE(review): this copy of the script has lost its indentation and a number of lines
# ("else:" branches, the tails of several multi-line calls, most of the timedelta regex, and the
# calls that some comments/prints announce). The notes below mark the gaps that are visible.
if __name__ == '__main__':
# Optional command line arguments: target machine, then internal port.
# NOTE(review): the unconditional assignments that follow the two "if" tests look like the
# bodies of missing "else:" branches (defaults for when the arguments are absent) -- confirm.
if len(sys.argv) > 1:
target_machine = sys.argv[1]
if len(sys.argv) > 2:
internal_port = int(sys.argv[2])
internal_port = 53
target_machine = ""
internal_port = 53
print "Precleaning!"
# Errors are suppressed here because there may be nothing to clean up on a fresh run
preclean( target_machine, print_errors=False )
print "Setup Connector Environment."
print "Setting up license."
sqatools.appliance.install_license(extra_services=["rssConnector"], detect_gdms=True)
# Set up the ingestion user.
ConnectorHelpers.create_crawler_user( username, password )
ConnectorHelpers.define_gts_outputconnection( )
# Now, perform the dns redirection. This will mean that all dns requests to anything other than localhost will go somewhere
# else.
# Create a session
# NOTE(review): session_id is not used by any line visible below, and no redirection call is visible.
session_id = "session_%f" % time.time()
print "Running postgresql maintenance (to make timings consistent)"
print "Set up rss connection."
# Define repository connection. The right way to do it is to set the average rate for what we want, and the maximum rate should set a hard limit
# to what's acceptable, usually higher than the average rate. This is reflected in the settings below.
# NOTE(review): neither of the next two calls is closed in this copy -- arguments are missing.
RSSConnectorHelpers.define_rss_repository_connection_ui( username,
"RSS Connection",
throttles=[("","All individual domains",str(24))],
job_id = RSSConnectorHelpers.define_rss_job_ui( username,
seed_list )
# Run the job
ConnectorHelpers.start_job( job_id )
# RSS Connector abort test! Abort the job and see how long it takes for it to actually stop doing stuff
# First, wait 2 minutes for the job to get really rolling
# NOTE(review): no wait is visible between starting and aborting the job.
# Now, abort it
ConnectorHelpers.abort_job( job_id )
# Wait to see how long it actually takes to abort the job
the_time = time.time()
ConnectorHelpers.wait_job_complete( job_id )
elapsed_time = time.time() - the_time;
print "It took %f seconds to abort the job" % elapsed_time
# The abort is considered a failure if it takes more than two minutes
if elapsed_time > 120.0:
raise Exception( "Took too long for job to abort: %f seconds" % elapsed_time )
# Now, start it again and this time run the job to completion
ConnectorHelpers.start_job( job_id )
# Once again, wait until it's really working, then cycle the service, to be sure it is shutting down cleanly
# NOTE(review): no wait is visible before the restart either.
ConnectorHelpers.restart_agents( )
ConnectorHelpers.wait_job_complete( job_id )
# Ok, run the report and grab the analysis we want
# The best fetch rate is 24 docs per minute, but we have to adjust also for the fetch delay. The average document size is about 75K, so that fetch delay
# will account for 1.2 seconds per document. If my calculations are right, that yields 16.2 docs per minute when all is said and done.
analysis = DNSFakeoutHelpers.run_session_report_remote( "", [str(16)] )
# Assess the maximum difference from ideal for all the rows returned in this report. If we get something back that looks like it isn't in the format we expect,
# it probably means there was an error string returned instead, so just print that.
max_delay = 0
max_url = "None"
lines = analysis.splitlines()
for line in lines:
# Line is in format:
# doc_count,url,first_fetch,last_fetch,actual_fetch_duration,overall_fetch_duration,actual_fetch_rate,overall_fetch_rate,actual_difference_from_ideal
# First fetch and Last fetch are times; everything else is a number, except time deltas. For example:
# 5 2008-10-16 10:12:46.718005 2008-10-16 10:17:09.371442 0:04:22.653437 0:06:09.371442 1.15 0.81 0:04:10.153437
# The code splits each line on whitespace and requires exactly 11 tokens: each of the two
# timestamps occupies two tokens (date and time). Token 1 is used as the server name and
# token 10 as the difference from ideal. The example above shows only 10 tokens -- the url
# column appears to have been dropped from it.
if line != "":
fields = line.split()
if len(fields) != 11:
raise Exception("Report response in unexpected form: %s" % line)
server_name = fields[1].strip()
if server_name not in servers_to_ignore:
diff = fields[10].strip()
if diff != "None":
# Convert the timedelta to some number of seconds
# NOTE(review): the re.match call below is truncated -- the rest of the pattern (the
# "minutes"/"seconds" groups the next line reads), the string being matched and the
# conversion of the match to a dictionary are not visible in this copy.
d = re.match(
r'((?P<days>\d+) days, )?(?P<hours>\d+):'
delta = int(d["seconds"]) + int(d["minutes"]) * 60 + int(d["hours"]) * 3600 + int(d["days"]) * 86400
# Track the worst delay seen and the server it belongs to
if delta > max_delay:
max_delay = delta
max_url = server_name
# This crawl runs a little bit behind in the beginning, which adversely affects the timing of progressively larger domains. Also, the effects of multiple references, esp.
# on carrydown and discovery, confuse the report statistics. So this is a loose bound only (9 hrs "behind").
if max_delay > 9 * 60 * 60:
raise Exception("Report response indicates that %s has too large a latency: %d seconds" % (max_url,max_delay))
# Tear down: job first, then the repository connection, output connection, crawl user and environment
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "RSSConnection" )
# End the current session
# Stop the redirection
# NOTE(review): no session-ending or redirection-stopping call is visible here.
ConnectorHelpers.delete_gts_outputconnection( )
ConnectorHelpers.delete_crawler_user( username )
ConnectorHelpers.teardown_connector_environment( )
print "Performance RSSConnector test PASSED"