import os
import sys
import errno
import time
import datetime
import subprocess
import socket
import signal
import re
import sqatools
from sqatools import docs, sqautils
import urllib
import urllib2
import TestDocs
import SQAhelpers
import traceback
import VirtualBrowser
import pycurl
from sqatools import LicenseMakerClient
from sqatools import appliance
# These methods are helper methods that abstract from the various testing scripts
# Set ingestion proxy timeout
# NOTE(review): in this view the docstring below is never closed and no
# statements follow it, so the step that applies `timeout` is not visible here.
def set_proxy_timeout(timeout):
""" Set the ingestion proxy timeout.
# Restart leafblower
# Restore ingestion proxy timeout to default
# NOTE(review): only a docstring and a comment are visible below; the
# statements that restore the default and restart leafblower are not shown in
# this view.
def restore_default_proxy_timeout():
""" Restore the proxy timeout to default value """
# Restart leafblower
# Global flag pertaining to which kind of adtools we will use.
# initialize_adtools() consults it to choose between wintools.adtools (flag
# set) and wintools.adtools_md; the legacy-mode helpers below update it.
use_legacy_adtools = False
# Global adtools object, which will be overridden by various imports when initialized properly
#adtools = None
# Synchronize ad mode. This detects the current setting of legacy mode, based on the appliance environment
def synchronize_legacy_mode( ):
    """ Detect which mode is currently in place.  This is necessary in order for preclean to work properly.

    Reads the /etc/alternatives/active_directory_control symlink and sets the
    module-level use_legacy_adtools flag according to its target.

    Raises Exception when the link points at neither known tool.  os.readlink
    itself raises OSError when the link cannot be read.
    """
    global use_legacy_adtools
    # Read the link once so both comparisons see the same target
    link_target = os.readlink("/etc/alternatives/active_directory_control")
    if link_target == "/usr/lib/metacarta/active_directory_tool":
        # Legacy mode
        use_legacy_adtools = True
    elif link_target == "/usr/lib/metacarta/active_directory_control":
        # MD mode
        use_legacy_adtools = False
    else:
        # Only an unrecognized target is an error; a recognized one must not raise
        raise Exception("Could not synchronize legacy mode")
# Select 'legacy' ad mode
def select_legacy_mode(use_legacy_tools=True):
    """ Change the mode based on where we currently are.

    use_legacy_tools -- True to switch to legacy mode ("downgrade"), False to
    switch away from it ("upgrade-ad").

    Nothing is executed when the requested mode already equals the recorded
    use_legacy_adtools flag; otherwise the flag is updated after the command.
    """
    global use_legacy_adtools
    if use_legacy_tools == use_legacy_adtools:
        return
    # Call out to active directory tool to set or clear legacy mode.
    # Exactly one of the two actions applies to a given transition.
    if use_legacy_tools:
        action = "downgrade"
    else:
        action = "upgrade-ad"
    invoke_root_script( [ "/usr/bin/active_directory_control", action ] )
    use_legacy_adtools = use_legacy_tools
# Cancel legacy mode (restore to legacy mode = off)
def cancel_legacy_mode():
    """ Cancel legacy mode, but only if it is set """
    if use_legacy_adtools:
        # Switching with use_legacy_tools=False restores "legacy mode = off"
        select_legacy_mode( use_legacy_tools=False )
def initialize_adtools():
    """ Import the right adtools version, and return it.

    Returns wintools.adtools when use_legacy_adtools is set, and
    wintools.adtools_md otherwise.
    """
    if use_legacy_adtools:
        from wintools import adtools as adtools
    else:
        # Without this branch the legacy import would always be overridden
        from wintools import adtools_md as adtools
    return adtools
# Records of when certain operations started: start_timing() stores a
# time.time() value under each key and end_timing() deletes the entry.
time_records = {}
def start_timing( time_key ):
    """ Remember the current time under time_key.

    Raises Exception if time_key is already being timed.
    """
    global time_records
    already_started = time_key in time_records
    if already_started:
        raise Exception("Two starts with the same key! %s" % time_key)
    time_records[time_key] = time.time()
def end_timing( time_key, limit=None ):
    """ Close out the activity started under time_key and print its duration.

    limit -- optional maximum duration in seconds.

    Raises Exception if time_key was never started, or if a limit is given
    and the elapsed time exceeds it.  The record is removed before the limit
    is checked.
    """
    global time_records
    if time_key not in time_records:
        raise Exception("End without start! %s" % time_key)
    began = time_records[time_key]
    elapsed = time.time() - began
    print("Test activity %s took %f seconds." % (time_key, elapsed))
    del time_records[time_key]
    if limit == None:
        return
    if elapsed > limit:
        raise Exception("Time for activity %s took too long; actual %f, limit %f" % (time_key, elapsed, limit))
# Do an http fetch using curl
class CallbackClass:
    """ Collects the chunks passed to body_callback into the `contents` string. """

    def __init__(self):
        # Nothing received yet
        self.contents = ''

    def body_callback(self, buf):
        """ Append one received chunk to the accumulated contents. """
        self.contents += buf
def invoke_curl( url_string, user_name=None, password=None ):
    """ Use curl to request data from a url via http.

    url_string -- the URL to fetch.
    user_name, password -- sent as USERPWD credentials only when both are given.

    Returns the response body accumulated by the write callback.
    Raises Exception("HTTP error <code>") when the response code is not 200.
    """
    t = CallbackClass()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url_string)
        c.setopt(c.WRITEFUNCTION, t.body_callback)
        if user_name != None and password != None:
            c.setopt(c.USERPWD, "%s:%s" % (user_name,password))
        # The transfer has to run before a response code exists to be queried
        c.perform()
        errcode = c.getinfo(pycurl.HTTP_CODE)
    finally:
        # Release the curl handle whether or not the transfer succeeded
        c.close()
    if errcode != 200:
        raise Exception("HTTP error %s" % str(errcode))
    return t.contents
# Set the clock forward enough to screw up thread sleeps
def set_clock_forward():
    """ Set clock forward 1 day (for testing thread sleep).

    Runs `date` as root with a month/day/hour/minute/year argument that is
    exactly one day after the current local time.
    """
    # timedelta carries over month and year ends correctly for every month
    # length, so no hand-written rollover is needed.
    t = datetime.datetime.now() + datetime.timedelta(days=1)
    invoke_root_script(["date","%02d%02d%02d%02d%02d" % (t.month, t.day, t.hour, t.minute, t.year)])
# Set clock back to normal
# NOTE(review): only the docstring and two step comments are visible below;
# the statements that reset the clock and restart ntp are not shown in this
# view.
def restore_clock():
""" Restore clock to the correct time, as served by ntp """
# reset clock
# restart ntp server
# Count temporary files in temporary area that have a specified prefix
def count_temporary_files( prefix_string ):
    """ Return how many entries listed by `ls -1 /var/tmp` start with prefix_string. """
    listing = invoke_root_script( [ "ls", "-1", "/var/tmp" ] )
    return sum( 1 for entry in listing.splitlines() if entry.startswith( prefix_string ) )
# Kick off a maintenance operation
def run_maintenance( ):
    """ Run postgresql maintenance operation, and check for success.

    Success means the output contains a line with "VACUUM" no later than the
    line containing "Postgresql maintenance completed".

    Raises Exception when the completion line appears without a preceding
    VACUUM, or when no completion line appears at all.
    """
    output = invoke_root_script( [ "/usr/lib/metacarta/postgres-maintenance" ] )
    saw_vacuum = False
    for line in output.splitlines():
        if "VACUUM" in line:
            saw_vacuum = True
        if "Postgresql maintenance completed" in line:
            if not saw_vacuum:
                raise Exception( "Maintenance did not perform a VACUUM; got %s" % output )
            # Completion marker seen after a VACUUM: done, do not fall
            # through to the failure below.
            return
    raise Exception( "Maintenance operation did not complete successfully; output %s" % output)
# Find the metacarta-agents process id
def find_daemon_pid( ):
    """ Return the pid from the ps line naming com.metacarta.agents.AgentRun, or None.

    When several lines match, the pid of the last one is returned.
    """
    ps_output = invoke_root_script( [ "ps", "-eo", "pid,command", "-w", "-w" ] )
    found = None
    for entry in ps_output.splitlines():
        if entry.find("com.metacarta.agents.AgentRun") == -1:
            continue
        # The pid is the first whitespace-separated column
        found = int( entry.split()[ 0 ] )
    return found
# Find the daemon invocation command
def confirm_daemon_switch( switch_string ):
    """ Find a given switch in the daemon invocation line.

    Looks at the ps command line that names com.metacarta.agents.AgentRun
    (the last one, if several match).

    Raises Exception when no such process is listed, or when its command
    line does not contain switch_string.
    """
    process_list = invoke_root_script( [ "ps", "-eo", "command", "-w", "-w" ] )
    command = None
    for line in process_list.splitlines():
        if "com.metacarta.agents.AgentRun" in line:
            command = line
    if command is None:
        # Report a missing daemon explicitly instead of failing on None.find
        raise Exception("Daemon process is not running")
    if switch_string not in command:
        raise Exception("Expected command clause '%s', did not see it" % switch_string)
# Find the number of matching lsof lines for a regexp against the daemon process
def count_lsof_daemon_lines( match_regexp ):
    """ Count the number of lines from lsof for the AgentRun process which match the
    provided regexp.

    match_regexp -- regular expression source, compiled with no flags.

    Returns the number of matching `lsof -p <pid>` output lines.
    Raises Exception when the daemon pid cannot be found.
    """
    regexp_pattern = re.compile( match_regexp, 0 )
    # Find the pid, if any
    pid = find_daemon_pid( )
    if pid == None:
        raise Exception("Daemon process is not running")
    # Now, invoke lsof -p
    lsof_list = invoke_root_script( [ "lsof", "-p", str(pid) ] )
    matching_count = 0
    for line in lsof_list.splitlines():
        # NOTE(review): search() (a match anywhere in the line) is assumed
        # here; confirm that an anchored match() was not intended.
        if regexp_pattern.search( line ) != None:
            matching_count += 1
    return matching_count
# Find the virtual (vsize) and resident (rss) memory sizes of the AgentRun process, as reported by ps
def calculate_daemon_memory( ):
    """ Return (vsize, rss) as ints for the first ps line naming AgentRun, or (0,0) if none. """
    ps_output = invoke_root_script( [ "ps", "-eo", "vsize,rss,command", "-w", "-w" ] )
    for entry in ps_output.splitlines():
        if entry.find("com.metacarta.agents.AgentRun") != -1:
            # vsize and rss are the first two whitespace-separated columns
            columns = entry.split()
            return ( int(columns[ 0 ]), int(columns[ 1 ]) )
    return (0,0)
# Change CF logging setup; does NOT restart tomcat and metacarta-agents
def configure_cf( desired_options ):
    """ Modifies the agents.conf file to set up desired logging behavior.

    The desired_options argument is a dictionary of keys and values corresponding
    to what should get set in the log.  NOTE WELL: Only entries that already
    exist in the agents.conf file will be modified!!  Does not restart tomcat or agents.

    Comment lines, lines without "=", and lines whose key is not in
    desired_options are copied unchanged.  For a matching key the text
    between "=" and any trailing "#" comment (or the line end) is replaced.
    """
    conf_path = "/etc/metacarta/agents.conf"
    # Write to a sibling file first so a failure leaves the original intact
    new_path = conf_path + ".new"
    fh = open( conf_path, "r" )
    try:
        outputfh = open( new_path, "w" )
        try:
            for line in fh.readlines():
                equals_index = line.find("=")
                if ( line.lstrip().startswith("#") or equals_index == -1
                     or line[0:equals_index] not in desired_options ):
                    outputfh.write( line )
                    continue
                new_value = desired_options[ line[0:equals_index] ]
                comment_index = line.find("#",equals_index+1)
                if comment_index == -1:
                    # No trailing comment: keep only the line terminator
                    comment_index = len(line.rstrip())
                outputfh.write(line[0:equals_index+1] + new_value + line[comment_index:len(line)])
        finally:
            outputfh.close()
    finally:
        fh.close()
    # rename() replaces the old file in one step on POSIX, so there is never
    # a moment without an agents.conf
    os.rename( new_path, conf_path )
# Get the current position in the metacarta error log
def get_metacarta_log_pos(log_name="/var/log/metacarta/error.log"):
    """ Return (inode, size) of log_name, so a later read can tell where to start from. """
    log_stat = os.stat( log_name )
    inode_number = log_stat.st_ino
    byte_length = log_stat.st_size
    return (inode_number, byte_length)
# Read the metacarta logs, looking for lines that match a regexp
def read_metacarta_log( reg_exp, position, log_name="/var/log/metacarta/error.log" ):
    """ Return the log lines matching reg_exp written since `position`.

    position -- an (inode, offset) pair as returned by get_metacarta_log_pos.

    If log_name still has the recorded inode it is read from the offset.
    Otherwise "<log_name>.1" must carry that inode; it is read from the
    offset and followed by all matching lines of the current log_name.
    Raises Exception when neither file has the recorded inode.
    """
    pattern = re.compile( reg_exp, 0 )
    saved_inode, saved_offset = position
    if os.stat( log_name ).st_ino == saved_inode:
        return read_matching_lines_position( log_name, saved_offset, pattern )
    rolled_name = "%s.1" % log_name
    if os.stat( rolled_name ).st_ino != saved_inode:
        raise Exception("Log rolled but I can't figure out when")
    older = read_matching_lines_position( rolled_name, saved_offset, pattern )
    newer = read_matching_lines_position( log_name, 0, pattern )
    return older + newer
# Read lines matching a regexp from a file starting at a position
def read_matching_lines_position( file_name, position, regexp_pattern=None ):
    """ Read lines from file_name starting at byte offset `position`.

    regexp_pattern -- compiled pattern; only lines in which it is found are
    returned.  With the default of None every line is returned.

    Returns the selected lines (line terminators included), or an empty list
    when the file cannot be opened.
    """
    try:
        fh = open( file_name, "r" )
    except IOError:
        # No log
        return [ ]
    try:
        fh.seek( position, 0 )
        rlines = [ ]
        for line in fh.readlines():
            if regexp_pattern is None or regexp_pattern.search( line ) != None:
                rlines.append( line )
        return rlines
    finally:
        # Do not leak the handle, whatever happens while reading
        fh.close()
# Clear logs
def clear_logs():
    """ Remove the java-agents log as root. """
    # Clear out the existing logs; we don't want to confuse matters with old stuff
    stale_log = "/var/log/metacarta/java-agents/agents.log"
    invoke_root_script( [ "rm", "-f", stale_log ] )
# Read the log, searching for a particular regular expression, and return all lines
# that have that expression, in order.
def read_log( reg_exp ):
    """ Return, in file order, the agents.log lines selected by reg_exp. """
    agents_log = "/var/log/metacarta/java-agents/agents.log"
    compiled = re.compile( reg_exp, 0 )
    return read_matching_lines_position( agents_log, 0, compiled )
# Shutdown tomcat
def shutdown_tomcat():
    """ Run the tomcat5.5 init script with "stop" as root. """
    stop_command = [ "/etc/init.d/tomcat5.5", "stop" ]
    invoke_root_script( stop_command )
# Shutdown agents
def shutdown_agents(timeout=40):
    """ Stop metacarta-agents through its init script.

    timeout -- maximum number of seconds the stop may take.

    Raises Exception when the script output mentions "kill", or when the
    stop took longer than timeout.
    """
    began = time.time()
    stop_output = invoke_root_script( [ "/etc/init.d/metacarta-agents", "stop" ] )
    elapsed = time.time() - began
    if "kill" in stop_output:
        raise Exception("metacarta-agents did not shut down cleanly!")
    if elapsed > timeout:
        raise Exception("Shutdown of metacarta-agents took more than %f seconds" % timeout)
# Call this method to start tomcat
def start_tomcat():
    """ Run the tomcat5.5 init script with "start" as root. """
    start_command = [ "/etc/init.d/tomcat5.5", "start" ]
    invoke_root_script( start_command )
# Call this method to start agents
def start_agents():
    """ Run the metacarta-agents init script with "start" as root. """
    start_command = [ "/etc/init.d/metacarta-agents", "start" ]
    invoke_root_script( start_command )
# Call this method to restart tomcat
def restart_tomcat():
    """ Run the tomcat5.5 init script with "restart" as root. """
    restart_command = [ "/etc/init.d/tomcat5.5", "restart" ]
    invoke_root_script( restart_command )
# Call this method to restart agents
def restart_agents(timeout=120):
    """ Restart metacarta-agents through its init script.

    timeout -- maximum number of seconds the restart may take (default 120).

    Raises Exception when the script output mentions "kill", or when the
    restart took longer than timeout.
    """
    start_time = time.time()
    output = invoke_root_script( [ "/etc/init.d/metacarta-agents", "restart" ] )
    elapsed = time.time() - start_time
    if output.find("kill") != -1:
        raise Exception("metacarta-agents did not shut down cleanly!")
    # 2850's get so maxed out that 60 wasn't enough
    if elapsed > timeout:
        # Report the limit actually applied rather than a fixed "one minute"
        raise Exception("Restart of metacarta-agents took more than %f seconds" % timeout)
# Call this method to shut down ingestion
# NOTE(review): only the LeafblowerHacks import is visible; the call that
# shuts ingestion down is not shown in this view.
def stop_leafblower():
from sqatools import LeafblowerHacks
# Call this method to start ingestion
# NOTE(review): only the LeafblowerHacks import is visible; the call that
# starts ingestion is not shown in this view.
def start_leafblower():
from sqatools import LeafblowerHacks
# Call this method to set NTLMv1 mode for Share Connector
def set_shareconnector_ntlmv1_mode():
    """ Run `shareconnector_control set ntlmv1` as root. """
    command = [ "/usr/bin/shareconnector_control", "set", "ntlmv1" ]
    invoke_root_script( command )
# Call this method to restore default (NTLMv2) mode for Share Connector
def set_shareconnector_default_mode():
    """ Run `shareconnector_control set ntlmv2` as root. """
    command = [ "/usr/bin/shareconnector_control", "set", "ntlmv2" ]
    invoke_root_script( command )
# Get the status of the ntlm version switch for Share Connector
def get_shareconnector_mode():
    """ Return the output of `shareconnector_control status`, run as root. """
    status_output = invoke_root_script( [ "/usr/bin/shareconnector_control", "status" ] )
    return status_output
# This method deregisters a connector
def deregister_connector( class_name ):
    """Unregister the connector class class_name; for tests covering an
    uninstalled connector whose connections and jobs may still exist."""
    command = [ "/usr/lib/metacarta/crawler-unregisterconnector", class_name ]
    invoke_script( command )
# This method reregisters a connector
def register_connector( class_name, description ):
    """Register (or re-register) the connector class class_name with the given
    description; for tests covering reinstallation of a connector."""
    command = [ "/usr/lib/metacarta/crawler-registerconnector", class_name, description ]
    invoke_script( command )
# This method deregisters an authority
def deregister_authorityconnector( class_name ):
    """Unregister the authority connector class class_name; for tests covering
    an uninstalled authority whose connections may still exist."""
    command = [ "/usr/lib/metacarta/crawler-unregisterauthority", class_name ]
    invoke_script( command )
# This method reregisters an authority connector
def register_authorityconnector( class_name, description ):
    """Register (or re-register) the authority connector class class_name with
    the given description; for tests covering reinstallation of an authority."""
    command = [ "/usr/lib/metacarta/crawler-registerauthority", class_name, description ]
    invoke_script( command )
# This method checks what the authority webapp returns
def ask_authority_webapp( user_name ):
    """Ask the authority webapp to see what it says.

    Returns (None, response_body) on success, or (error_text, None) when the
    request fails with an HTTPError.
    """
    url = "http://localhost:8180/authorityservice/UserACLs?username=%s" % urllib.quote(user_name)
    try:
        f = urllib2.urlopen(url)
    except urllib2.HTTPError:
        # exc_info() yields the active exception without relying on either
        # spelling of the except-binding syntax
        return (str(sys.exc_info()[1]),None)
    try:
        return (None,f.read())
    finally:
        # Close the response even if reading it fails
        f.close()
# Dump configuration to a file
def export_configuration( filename ):
    """ Run backup-crawler-configuration as root, passing filename as its argument. """
    backup_command = [ "/usr/lib/metacarta/backup-crawler-configuration", filename ]
    invoke_root_script( backup_command )
# Restore configuration from a file
def import_configuration( filename ):
    """ Run restore-crawler-configuration as root, passing filename as its argument. """
    restore_command = [ "/usr/lib/metacarta/restore-crawler-configuration", filename ]
    invoke_root_script( restore_command )
# Call this method to prepare for the test. Turns off maintenance script, etc.
# NOTE(review): no statements follow the docstring in this view; presumably
# disable_maintenance() is invoked here -- confirm against the full file.
def setup_connector_environment( ):
"""Set up the connector environment for tests - disable maintenance, other hooks here."""
# NOTE(review): this view of disable_maintenance looks incomplete -- the
# docstring is never closed, the "mv" argument list shows only its last
# argument, and the loop has no visible "try:", exit, or delay.  Confirm
# against the full file before relying on it.
def disable_maintenance( ):
""" Disable maintenance script, then wait if it is running until it stops.
invoke_root_script( [ "mv",
"local-metacarta-postgres-maintenance-crontab-copy" ] )
while True:
os.stat( "/var/run/metacarta/postgres-maintenance-in-progress" )
except Exception, e:
# Call this method to clean up an installed machine. Restores maintenance
# script, etc.
# NOTE(review): no statements follow the docstring in this view; presumably
# enable_maintenance() is invoked here -- confirm against the full file.
def teardown_connector_environment( ):
"""restore normal connector configuration - turn maintenance back on, etc."""
# Appears to move the saved crontab copy back into /etc/cron.d when the
# crontab is absent there, then remove the copy.
# NOTE(review): the "try:" matching the except below and the leading
# argument(s) of both command lists are not visible in this view.
def enable_maintenance( ):
os.stat( "/etc/cron.d/metacarta-postgres-maintenance-crontab" )
except Exception, e:
invoke_root_script( [ "mv",
"/etc/cron.d/metacarta-postgres-maintenance-crontab" ] )
invoke_root_script( [ "rm", "-f",
"local-metacarta-postgres-maintenance-crontab-copy" ] )
# This method MUST be called in order to use the connector framework.
# This also has to be added to the documentation because without it search will not work.
# NOTE(review): no body is visible for this function in this view.
def enable_connector_framework( ):
# Set maximum document-size limit
def set_max_document_size( size_value ):
    """ Write str(size_value) followed by a newline to /etc/metacarta/ingest_reject_size,
    replacing any previous contents. """
    fd = open( "/etc/metacarta/ingest_reject_size", "w" )
    try:
        fd.write( str(size_value) )
        fd.write( "\n" )
    finally:
        # Release the handle even if a write fails
        fd.close( )
# Clear maximum document-size limit
def clear_max_document_size( ):
    """ Delete /etc/metacarta/ingest_reject_size. """
    limit_file = "/etc/metacarta/ingest_reject_size"
    os.remove( limit_file )
# Create a serial file, for client certificate creation
def create_serial_file( serial_file_name ):
    """ Create (or overwrite) serial_file_name with the single line "01". """
    fd = open( serial_file_name, "w" )
    try:
        fd.write( "01\n" )
    finally:
        # Release the handle even if the write fails
        fd.close( )
# Build a signed, duck-specific client certificate using an existing certificate authority.
# The certificate authority is specified by a public key and a private key, and a password.
# NOTE(review): both command lists below show only their first and last
# elements in this view, so the ssl_control/openssl options and the use of
# ca_public_key_file, ca_private_key_file and serial_file are not visible.
def create_client_certificate( ca_public_key_file, ca_private_key_file, serial_file, client_cert_file, password ):
""" This code uses ssl_control and openssl to build a certificate request, and sign the certificate,
respectively. """
invoke_root_script( [ "ssl_control",
"client.req" ], "US\nMassachusetts\nCambridge\nMetaCarta\nEngineering/QA\n\n\n\n" )
# Now invoke openssl
invoke_root_script( [ "openssl",
client_cert_file ], password + "\n" )
# Clean out everything
# Runs the crawler and authorities ResetAll commands.
# NOTE(review): LeafblowerHacks is imported but no call on it is visible in
# this view, so the "clear out what's been ingested" step is not shown.
def reset_all( ):
# Clear out what's been ingested before
from sqatools import LeafblowerHacks
# Get rid of all jobs, etc.
invoke_crawler_command( "com.metacarta.crawler.ResetAll" )
invoke_crawler_command( "com.metacarta.authorities.ResetAll")
# Define the standard output connection
def define_gts_outputconnection( ):
    """ Create the output connection named "GTS", using the GTS connector class
    and a single ingestion-URI configuration parameter. """
    ingestion_parameter = "Ingestion URI=http://localhost:7031/HTTPIngest"
    define_outputconnection( "GTS", "GTS", "com.metacarta.agents.output.gts.GTSConnector", configparams=[ ingestion_parameter ] )
# Delete the standard output connection
def delete_gts_outputconnection( ):
    """ Remove the output connection named "GTS". """
    connection_name = "GTS"
    delete_outputconnection( connection_name )
# Define an output connection
# NOTE(review): in this view listparams shows only its first and last
# elements and the loop over configparams has no visible body, so
# connectiondescription, connectionclass and the config parameters are not
# seen being passed to the command.
def define_outputconnection( connectionname, connectiondescription, connectionclass,
poolmax="10", configparams=[] ):
"""Define a connection"""
listparams = [ process_argument(connectionname),
str(poolmax) ]
for item in configparams:
invoke_crawler_command( "com.metacarta.crawler.DefineOutputConnection", argument_list=listparams )
# Define a repository connection
# NOTE(review): in this view listparams shows only its first and last
# elements and the loop over configparams has no visible body, so
# connectiondescription, connectionclass, authorityname and the config
# parameters are not seen being passed to the command.
def define_repositoryconnection( connectionname, connectiondescription, connectionclass, authorityname="",
poolmax="10", configparams=[] ):
"""Define a connection"""
listparams = [ process_argument(connectionname),
str(poolmax) ]
for item in configparams:
invoke_crawler_command( "com.metacarta.crawler.DefineRepositoryConnection", argument_list=listparams )
# Define an authority connection
# NOTE(review): in this view listparams shows only its first and last
# elements and the loop over configparams has no visible body.  Unlike the
# sibling define_*connection helpers, poolmax is passed here without str().
def define_authorityconnection( connectionname, connectiondescription, connectionclass,
poolmax="10", configparams=[] ):
"""Define a connection"""
listparams = [ process_argument(connectionname),
poolmax ]
for item in configparams:
invoke_crawler_command( "com.metacarta.authorities.DefineAuthorityConnection", argument_list=listparams )
# Define a job
# Returns the output of the DefineJob crawler command.  recrawlinterval is
# multiplied by 1000*60 before being passed on.
# NOTE(review): only three elements of the argument list are visible in this
# view; connectionname, xml, output_connection, type and startmethod are not
# seen being passed.
def define_job( jobdescription, connectionname, xml, output_connection="GTS", output_xml="", type="specified", startmethod="windowbegin",
recrawlinterval=0 ):
"""Define a job"""
return invoke_crawler_command( "com.metacarta.crawler.DefineJob", argument_list=[ process_argument(jobdescription),
"%d" % (recrawlinterval * 1000*60),
process_argument(output_xml) ] )
# Start a job
def start_job( job_id ):
    """Run the StartJob crawler command for job_id and return its output"""
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.StartJob", argument_list=arguments )
# Wait for job to be complete
def wait_job_complete( job_id ):
    """Run the WaitForJobInactive crawler command for job_id and return its output"""
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitForJobInactive", argument_list=arguments )
# Wait for job to be deleted
def wait_job_deleted( job_id ):
    """Run the WaitForJobDeleted crawler command for job_id and return its output"""
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitForJobDeleted", argument_list=arguments )
# Delete a job
def delete_job( job_id ):
    """Run the DeleteJob crawler command for job_id and return its output"""
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteJob", argument_list=arguments )
# Abort a job
def abort_job( job_id ):
    """Run the AbortJob crawler command for job_id and return its output"""
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.AbortJob", argument_list=arguments )
# Pause a job
def pause_job( job_id ):
    """ Run the PauseJob crawler command for job_id and return its output """
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.PauseJob", argument_list=arguments )
# Resume a job
def resume_job( job_id ):
    """ Resume job_id by running the RestartJob crawler command; returns its output """
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.RestartJob", argument_list=arguments )
# Wait for job to pause
def wait_job_paused( job_id ):
    """ Run the WaitJobPaused crawler command for job_id and return its output """
    arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitJobPaused", argument_list=arguments )
# Delete an output connection
def delete_outputconnection( connection_name ):
    """Run the DeleteOutputConnection crawler command for connection_name and return its output"""
    arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteOutputConnection", argument_list=arguments )
# Delete a repository connection
def delete_repositoryconnection( connection_name ):
    """Run the DeleteRepositoryConnection crawler command for connection_name and return its output"""
    arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteRepositoryConnection", argument_list=arguments )
# Delete an authority connection
def delete_authorityconnection( connection_name ):
    """Run the authorities DeleteAuthorityConnection command for connection_name and return its output"""
    arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.authorities.DeleteAuthorityConnection", argument_list=arguments )
# Computes a local time minutesFromNow minutes ahead and passes its day,
# month name, year and minute to the AddScheduledTime crawler command.
# NOTE(review): `interval` and the `hours` table are built but never
# referenced in the argument list visible here; the hour and interval
# arguments may be missing from this view -- confirm against the full file.
def set_scheduled_time( jobid, minutesFromNow, intervalMinutes=None ):
"""Add a schedule time for a job"""
# What we do is get the current time, add the seconds, and then call the schedule set script
currentTime = time.time()
triggerTime = currentTime + minutesFromNow * 60
triggerStruct = time.localtime(triggerTime)
interval = ""
if (intervalMinutes != None):
interval = "%d" % intervalMinutes
months = [ "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december" ]
hours = [ "12am", "1am", "2am", "3am", "4am", "5am", "6am", "7am", "8am", "9am", "10am", "11am", "12pm",
"1pm", "2pm", "3pm", "4pm", "5pm", "6pm", "7pm", "8pm", "9pm", "10pm", "11pm" ]
return invoke_crawler_command( "com.metacarta.crawler.AddScheduledTime", argument_list=[ jobid,
"%d" % triggerStruct.tm_mday,
months[triggerStruct.tm_mon - 1],
"%d" % triggerStruct.tm_year,
"%d" % triggerStruct.tm_min ] )
def change_job_doc_spec( jobid, docspec ):
    """Run the ChangeJobDocSpec crawler command with jobid and docspec; returns its output"""
    arguments = [ jobid, docspec ]
    return invoke_crawler_command( "com.metacarta.crawler.ChangeJobDocSpec", argument_list=arguments )
def change_auth_spec( authname, spec ):
    """Run the authorities ChangeAuthSpec command with authname and spec; returns its output"""
    arguments = [ authname, spec ]
    return invoke_crawler_command( "com.metacarta.authorities.ChangeAuthSpec", argument_list=arguments )
def get_everyone_sid( ):
    """ Return the EveryoneSID attribute of the active adtools module """
    ad_module = initialize_adtools()
    return ad_module.EveryoneSID
def get_ad_user_sid( user ):
    """ Return what the active adtools module's get_sid reports for user """
    ad_module = initialize_adtools()
    return ad_module.get_sid( user )
def conditionally_add_ad_user( ad_domain_info, user, password ):
    """ Create a user if one doesn't already exist.

    The appliance must be joined to the domain before this will work.
    The user's password is set to never expire in either case.
    """
    local_ad_handle = initialize_adtools()
    try:
        local_ad_handle.query_entity_in_ldap( user )
    except local_ad_handle.ADToolsException:
        # The lookup failed, so treat the user as absent and create it
        add_ad_user( ad_domain_info, user, password )
    # if the users are going to hang around we need to ensure that their passwords don't expire
    # this is unconditional due to a point in time where they were expiring and we want tests
    # to force this setting
    local_ad_handle.set_no_password_expire( ad_domain_info.domain_controller_ambassador, ad_domain_info.ad_domain, user)
def add_ad_user( ad_domain_info, user, password ):
    """Create user with password through the active adtools module, using the
    domain controller ambassador and AD domain from ad_domain_info"""
    ad_module = initialize_adtools()
    ad_module.create_user( user, password, ad_domain_info.domain_controller_ambassador, ad_domain_info.ad_domain, confirm_user=False )
def delete_ad_user( ad_domain_info, user ):
    """Delete user through the active adtools module, using the domain
    controller ambassador from ad_domain_info"""
    ad_module = initialize_adtools()
    ad_module.delete_user(user, ad_domain_info.domain_controller_ambassador)
# NOTE(review): this view looks incomplete -- no "else:" separates the
# multi-domain join from the default join, nothing follows the "Added so the
# 76 domain works" comment, and the auth_control list shows only its first
# and last elements.  Confirm against the full file.
def configure_ad( ad_domain_info, join_multidomain = False ):
"""Configure appliance to use AD authentication"""
adtools = initialize_adtools()
# if multi-domain join requested then join all the child domains too
# Disable precheck of admin user; this is because this feature is broken in ntlmv2 domains
if join_multidomain:
adtools.ad_md_setup_join( ad_domain_info, False, precheck_admin_user=False )
adtools.join_ad_with_defaults( ad_domain_info.realm_admin_password, admin_user=ad_domain_info.realm_admin.split("@")[0], precheck_admin_user=False )
# Added so the 76 domain works
# Set auth_control again to specify connector framework
invoke_root_script( [ "auth_control",
"connector_framework" ] )
def turn_off_ad( ad_domain_info, leave_multidomain = False):
    """Disable ad authentication.

    leave_multidomain -- when true, leave with the multi-domain options
    (delete the machine account, no explicit admin user); otherwise leave as
    the realm admin user named in ad_domain_info.
    """
    adtools = initialize_adtools()
    # Disable precheck of admin user; this is because this feature is broken in ntlmv2 domains
    if leave_multidomain:
        adtools.leave_ad( realm_admin_password=ad_domain_info.realm_admin_password, disable_machine_acct=False, delete_machine_acct=True, already_left=0, precheck_admin_user=False )
    else:
        # Exactly one leave is performed per call
        adtools.leave_ad( realm_admin_password=ad_domain_info.realm_admin_password, admin_user=ad_domain_info.realm_admin.split("@")[0], precheck_admin_user=False )
# NOTE(review): the "if user == None:" test has no visible body in this view,
# so what happens for a missing user cannot be determined here.
def create_crawler_user( user, password ):
"""Create a user specifically for crawler UI, that the UI will be able to use"""
if user == None:
invoke_root_script( [ "basic_auth_control", "add", "ingest_users", user+":"+password ] )
# NOTE(review): the "if user == None:" test has no visible body in this view,
# so what happens for a missing user cannot be determined here.
def delete_crawler_user( user ):
"""Delete the crawler user"""
if user == None:
invoke_root_script( [ "basic_auth_control", "remove", "ingest_users", user ] )
def add_basic_auth_user( user, password ):
    """Add user:password to the my_ui_users database, answering "yes" on stdin"""
    credentials = user + ":" + password
    command = [ "basic_auth_control", "add", "my_ui_users", credentials ]
    invoke_root_script( command, "yes\n" )
def delete_basic_auth_user( user ):
    """Remove user from the my_ui_users database, answering "yes" on stdin"""
    command = [ "basic_auth_control", "remove", "my_ui_users", user ]
    invoke_root_script( command, "yes\n" )
# Creates the my_ui_users database, points the search UI and SOAP services at
# it with basic_auth, then waits for apache.
# NOTE(review): the second auth_control list shows only its first and last
# elements in this view.
def configure_basic_auth( ):
"""Reset the system so that basic auth is the authenticator, and wait for apache to come back up."""
invoke_root_script( [ "basic_auth_control", "createdb", "my_ui_users" ] )
invoke_root_script( [ "auth_control", "auth", "search-web-ui,soap-search,secure-soap-search",
"basic_auth", "my_ui_users" ] )
invoke_root_script( [ "auth_control",
"connector_framework" ] )
wait_for_apache( )
def turn_off_basic_auth( ):
    """Undo configure_basic_auth: point secure-soap-search at ui_users, remove
    auth from the search UI and soap-search, wait for apache, then destroy
    the my_ui_users database."""
    restore_secure_soap = [ "auth_control", "auth", "secure-soap-search",
                            "basic_auth", "ui_users" ]
    open_up_search = ["auth_control", "auth", "search-web-ui,soap-search", "none"]
    drop_database = [ "basic_auth_control", "destroydb", "my_ui_users", "--force"]
    invoke_root_script( restore_secure_soap )
    invoke_root_script( open_up_search )
    wait_for_apache( )
    invoke_root_script( drop_database, "yes\n" )
def invoke_crawler_command( classname, argument_list=[], input=None, additional_switches=[], additional_classpath=None ):
    """ Invoke a crawler command, including the metacarta-pullagent-test.jar in addition to the full java-environment classpath
    and definitions.  Always run as tomcat user.

    classname -- Java class to run.
    argument_list -- arguments placed after the class name.
    input -- optional text passed through to invoke_script as input.
    additional_switches -- switches placed before the class name.
    additional_classpath -- optional classpath appended after the test jar.

    Returns whatever invoke_script returns for the assembled command.
    """
    test_jar = "crawler-testing-package/metacarta-pullagent-test.jar"
    if additional_classpath:
        classpath = "%s:%s" % (test_jar, additional_classpath)
    else:
        # Only fall back to the bare test jar when no extra classpath is given
        classpath = test_jar
    # The default lists are only read, never modified; list() copies them and
    # also lets callers pass tuples.
    command_arguments = [ "crawler-testing-package/executejava", "-classpath", classpath ] + list(additional_switches) + [ classname ] + list(argument_list)
    return invoke_script(command_arguments,input=input)
# Runs argumentlist under "sudo -u tomcat55", feeding it `input` and returning
# the decoded standard output; a non-zero exit status raises Exception
# carrying the error text.  None entries in argumentlist are replaced by ""
# in place.
# NOTE(review): this view looks incomplete -- the remaining Popen arguments
# are missing, and each encode/decode pair has lost the "else:" that would
# choose between the explicit and the default codec.  Confirm against the
# full file.
def invoke_script( argumentlist, input=None, stdin_encoding=None, stdout_encoding="utf-8", stderr_encoding=None ):
# for some reason the argument list can have null entries, map them to ""
for i in range(0,len(argumentlist)):
if argumentlist[i] == None:
print "Warning: None argument in invoke_script, printing stack trace"
argumentlist[i] = ""
programname = argumentlist[0]
fullargumentlist = [ "sudo", "-u", "tomcat55" ] + argumentlist
print " ".join([quote_escape(i) for i in fullargumentlist])
program = subprocess.Popen(fullargumentlist,
if input != None:
if stdin_encoding != None:
input = input.encode(stdin_encoding)
input = input.encode()
(outputtext, errortext) = program.communicate(input)
retcode = program.wait()
if stderr_encoding != None:
errortext = errortext.decode(stderr_encoding)
errortext = errortext.decode()
if stdout_encoding != None:
outputtext = outputtext.decode(stdout_encoding)
outputtext = outputtext.decode()
if retcode != 0:
raise Exception((u"Error response from %s: %d, message = [" % (programname,retcode)) + errortext + u"]")
# Right at the moment, the testing infrastructure only accepts 7-bit binary values, so we have no choice but to do what we can to bash
# the output to 7 bits
print (u"Program %s output = '" % (programname) + outputtext + u"'").encode("utf-8")
return outputtext
# Runs argumentlist, feeding it `input` and returning the decoded standard
# output; a non-zero exit status raises Exception carrying the error text
# unless allow_errors is set.
# NOTE(review): this view looks incomplete -- the statements that build the
# escaped command (including whatever is added when the configured user is
# not "root") and the remaining Popen arguments are missing.  Confirm against
# the full file.
def invoke_root_script( argumentlist, input=None, allow_errors=False ):
programname = argumentlist[0]
# Set up the escaped argument list, which is a single string with each
# argument included in an escaped, quoted manner
accumulator = [ ]
user = sqautils.check_username_configured()
if user != "root":
accumulator.append(" ")
for item in argumentlist:
accumulator.append(" ")
actualcommand = "".join(accumulator)
print actualcommand
program = subprocess.Popen(argumentlist,
if input != None:
input = input.encode()
(outputtext, errortext) = program.communicate(input)
retcode = program.wait()
outputtext = outputtext.decode()
errortext = errortext.decode()
if allow_errors == False and retcode != 0:
raise Exception((u"Error response from %s: %d, message = [" % (programname,retcode)) + errortext + u"]")
# Right at the moment, the testing infrastructure only accepts 7-bit binary values, so we have no choice but to do what we can to bash
# the output to 7 bits
print (u"Program %s output = '" % (programname) + outputtext + u"'").encode("utf-8")
return outputtext
def process_argument( mystring ):
    """ Map None to the empty string; hand anything else back untouched. """
    if mystring == None:
        result = ''
    else:
        result = mystring
    return result
def quote_escape( mystring ):
    """ Wrap mystring in double quotes, putting a backslash in front of every
    double quote, backslash and dollar sign it contains.

    Returns '""' when mystring is None.
    """
    if mystring == None:
        return '""'
    outputresult = [ '"' ]
    for character in mystring:
        # In order to add memex documents into the repository, had '*' in here too, but that led to fields having '\*' in them instead of '*'.
        if character == '"' or character == '\\' or character == '$':
            outputresult.append( '\\' )
        # Every character is kept; special ones just gain the escape above
        outputresult.append( character )
    outputresult.append( '"' )
    return ''.join(outputresult)
# Waits until apache is up -- stolen from active_directory_tool
# NOTE(review): this view looks incomplete -- the "try:" for the two except
# clauses, the success exit, any delay between attempts, the test in front of
# "Connection refused", and the end of the final raise are all missing.
# Confirm against the full file.
def wait_for_apache( ):
"""Wait for apache to respond to port 80"""
http_port = 80
for x in range(0,100):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(("localhost", http_port))
except IOError, e:
if e.errno == errno.EINTR:
raise Exception("timeout talking to port %s" % http_port)
except socket.error, e:
if"Connection refused", str(e)):
raise Exception("Unable to connect to port %d. Check apache error log." % \
def regexp_encode( thestring ):
    """Encode a string so that regexp will match it exactly.

    Puts a backslash in front of each '.', '?' and backslash in thestring.
    All other characters, including other regexp metacharacters, are passed
    through unchanged.
    """
    pieces = []
    for thechar in thestring:
        if thechar == '.' or thechar == '?' or thechar == '\\':
            pieces.append( '\\' + thechar )
        else:
            # Ordinary characters are copied once, without an escape
            pieces.append( thechar )
    return "".join( pieces )
# `example` is compiled as a regular expression and tried with match() (i.e.
# anchored at the start) against each returned document.
# NOTE(review): this view looks incomplete -- the remaining arguments of
# search_documents_user, the success exit after a match, and any delay
# between the four attempts are missing.  Confirm against the full file.
def search_exists_check( keywords, collectionname, example, win_host=None, username=None, password=None ):
"""Check for existence in search results of the single example supplied"""
# The SHN and HN may get restarted out from under our query. So
# this should loop a few times to find what we are looking for.
# This means the test may take 4 seconds longer to fail, and other
# timing related issues may not be caught.
for counter in range(4):
matches = TestDocs.search_documents_user(keywords,
# matches are url-encoded, full file names (including /root)
# Filenames in docs are therefore url encoded here
quotedoc = example #urllib.quote(example,safe='')
cn = re.compile(example)
for doc in matches:
# Use re matching
mo = cn.match( doc )
if mo != None:
raise Exception("Expected document %s not returned when searching" % quotedoc)
# Runs one search and raises if any returned document matches `example`, which is
# compiled as a regular expression and applied with match() (anchored at the start).
# NOTE(review): the TestDocs.search_documents_user( call is cut off after its first
# argument in this copy; the remaining arguments must be restored from the original.
def search_nonexists_check( keywords, collectionname, example, win_host=None, username=None, password=None ):
"""Check for nonexistence in search results of the single example supplied"""
# The SHN and HN may get restarted out from under our query. This
# means the test may get success when the underlying query fails,
# so there's not much point in doing it. We'll do it anyway
# though, for the day may come when it will be meaningful to ask
# this question.
# We could also query a few times here to gain more confidence
# that the number of results returned is 0.
matches = TestDocs.search_documents_user(keywords,
# matches are url-encoded, full file names (including /root)
# Filenames in docs are therefore url encoded here
quotedoc = example #urllib.quote(example,safe='')
cn = re.compile(example)
for doc in matches:
# Use re matching
mo = cn.match( doc )
if mo != None:
raise Exception("Unexpected document %s was returned when searching: Actual results =[%s]" % (quotedoc,"".join(matches)))
# Searches up to four times and compares the result set with `docs` in both
# directions: a returned document that is not in `docs` is an error (after one
# retry, governed by retry_unexpected), and so is an entry of `docs` that was not
# returned.
# NOTE(review): this copy is incomplete.  The TestDocs.search_documents_user( call is
# cut off after its first argument, and the else/break/sleep/return statements that
# separate the retry path from the two raise statements are not visible.  Restore
# from the original before use.
def search_check( keywords, collectionname, docs, win_host=None, username=None, password=None ):
"""Make sure all the specified docs are in the set returned for a
search against the specified keywords"""
# Reingested documents can cause duplicate documents because of race on new slice start
# we need to retry once if we've discovered a document that should not be there.
retry_unexpected = True
# Keep looping until we find what we are looking for.
# This will tend to cause false successes in the case of an empty set, and slower failures
# if the test actually fails.
for counter in range(4):
matches = TestDocs.search_documents_user(keywords,
for founddoc in matches:
if founddoc not in docs:
if retry_unexpected:
print "Retrying search because unexpected document %s was present." % (founddoc)
raise Exception("Unexpected document %s returned when searching; results =[%s]" % (founddoc,"".join(matches)))
# If we got all the way through the loop, there were no unexpected docs to retry
retry_unexpected = False
if retry_unexpected:
# We only want to retry once, so clear the flag that triggers retries, sleep a couple
# seconds to give the old slice time to shut down and then continue on to the next
# search attempt.
retry_unexpected = False
# matches are url-encoded, full file names (including /root)
# Filenames in docs are therefore url encoded here
failed = False
for doc in docs:
quotedoc = doc #urllib.quote(doc,safe='')
if quotedoc not in matches:
failed = True
if failed == False:
raise Exception("Expected document %s not returned when searching; actual results =[%s]" % (quotedoc,"".join(matches)))
# NOTE(review): only the signature of wait_for_ingest is present in this copy; the
# body is missing and must be restored from the original.  `timeout` defaults to 3600
# (presumably seconds -- confirm).
def wait_for_ingest( timeout=3600 ):
# Create a file system repository connection via the UI
# Drives the crawler UI at http://localhost/crawler/index.jsp through a VirtualBrowser:
# follows "List repository connections" and "Add a connection", fills in the Name, Type
# and Throttling tabs of the "editconnection" form, saves, and raises if the
# <!--connection=...--> marker on the resulting page does not equal connection_name.
# NOTE(review): this copy is incomplete.  The body reads `throttles`, which is not in the
# visible parameter list (a signature line appears to be missing); the docstring is never
# closed; and each stray " )" marks a place where a click call was lost, as is the click on
# the tab links and on add_button.  Restore from the original before use.
def define_filesystem_repository_connection_ui( username, password, connection_name, connection_description,
max_connections=None ):
""" The throttles argument is an array of tuples. Each tuple represents a throttle and is of the form (regexp,description,avg-fetch-rate).
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for repository connection management and click it
window = vb.find_window("")
link = window.find_link("List repository connections") )
# Click "add a connection"
window = vb.find_window("")
link = window.find_link("Add a connection") )
# Find the right form elements and set them
window = vb.find_window("")
form = window.find_form("editconnection")
# "Name" tab
namefield = form.find_textarea("connname")
descriptionfield = form.find_textarea("description")
namefield.set_value( connection_name )
descriptionfield.set_value( connection_description )
# "Type" tab
link = window.find_link("Type tab")
window = vb.find_window("")
form = window.find_form("editconnection")
connectortypefield = form.find_selectbox("classname")
connectortypefield.select_value( "com.metacarta.crawler.connectors.filesystem.FileConnector" )
# Click the "Continue" button
continue_button = window.find_button("Continue to next page") )
window = vb.find_window("")
# "Throttling" tab
link = window.find_link("Throttling tab")
window = vb.find_window("")
form = window.find_form("editconnection")
if throttles != None:
for throttle in throttles:
regexp,description,rate = throttle
# Add a throttle with the specified parameters
regexpfield = form.find_textarea("throttle")
descfield = form.find_textarea("throttledesc")
valuefield = form.find_textarea("throttlevalue")
regexpfield.set_value( regexp )
if description != None:
descfield.set_value( description )
valuefield.set_value( str(rate) )
add_button = window.find_button("Add throttle")
window = vb.find_window("")
form = window.find_form("editconnection")
if max_connections != None:
form.find_textarea("maxconnections").set_value( str(max_connections) )
# Now, save this page
save_button = window.find_button("Save this connection") )
# See if the connector saved
window = vb.find_window("")
found_connection_name = window.find_match("<!--connection=(.*)-->",1)
if found_connection_name != connection_name:
raise Exception("Created connection doesn't match")
# Delete a file system repository connection via the UI
def delete_filesystem_repository_connection_ui( username, password, connection_name ):
    """Remove the named file system repository connection through the crawler UI.

    File system connections need no special handling, so this simply hands
    the same three arguments to delete_repository_connection_ui.
    """
    delete_repository_connection_ui( username=username,
                                     password=password,
                                     connection_name=connection_name )
# Define a standard job using the UI
# Drives the crawler UI at http://localhost/crawler/index.jsp through a VirtualBrowser:
# follows "List jobs" and "Add a job", fills in the Name, Connection, Scheduling,
# Hop Filters, Collections, Template and Paths tabs of the "editjob" form, saves the
# job, and returns the job id scraped from the <!--jobid=...--> marker on the result page.
# NOTE(review): this copy is incomplete.  Most of the parameter list is missing (the body
# reads job_name, connection_name, startpoints_and_matches, collection_name,
# document_template, hop_filters, hop_mode, type, startmethod and password, none of which
# are declared in the visible signature); the docstring is never closed; the `else:` lines
# in front of the "Illegal ..." raises are gone; the Hop Filters section sets no form
# fields; and each stray " )" marks a lost click call, as is the click on the tab links.
# Restore from the original before use.
def define_filesystem_job_ui( username,
recrawlinterval=0 ):
"""connection_name is the name of the filesystem connection. startpoints_and_matches
is an array, each element of which is a tuple. The tuple consists of the start point
path, and an array of match specifications. Each match specification is a tuple
consisting of a string (either "include" or "exclude"), a type (either "file" or "directory"),
and a match value (such as "*.txt").
Legal values for type are: "specified" or "continuous"
Legal values for start method are: "windowbegin", "windowinside", or "disable".
Hop filters are an array of tuples, each one ( filter_name, filter_value ).
Hop mode has the legal values "accurate", "nodelete", or "neverdelete".
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("List jobs") )
# Grab the new window
window = vb.find_window("")
# Add a job
link = window.find_link("Add a job") )
# Grab the edit window
window = vb.find_window("")
form = window.find_form("editjob")
# "Name" tab
# textarea for setting description
form.find_textarea("description").set_value( job_name )
# "Connection" tab
link = window.find_link("Connection tab")
window = vb.find_window("")
form = window.find_form("editjob")
# start method
if startmethod == "windowbegin":
startmethod_value = 0
elif startmethod == "windowinside":
startmethod_value = 1
elif startmethod == "disable":
startmethod_value = 2
raise Exception("Illegal start method value: '%s'" % startmethod )
form.find_selectbox("startmethod").select_value( str(startmethod_value) )
# connection name
form.find_selectbox("connectionname").select_value( connection_name )
# output connection name
form.find_selectbox("outputname").select_value( "GTS" )
# Click the "Continue" button
window.find_button("Continue to next screen").click( )
window = vb.find_window("")
form = window.find_form("editjob")
# "Scheduling" tab
link = window.find_link("Scheduling tab")
window = vb.find_window("")
form = window.find_form("editjob")
# type
if type == "specified":
type_value = 1
elif type == "continuous":
type_value = 0
raise Exception("Illegal type value: '%s'" % type )
form.find_selectbox("scheduletype").select_value( str(type_value) )
# Recrawl interval
if type == "continuous":
form.find_textarea("recrawlinterval").set_value( str(recrawlinterval) )
# "Hop Filters" tab
link = window.find_link("Hop Filters tab")
window = vb.find_window("")
form = window.find_form("editjob")
if hop_filters != None:
for filterelement in hop_filters:
filter_name, filter_value = filterelement
if filter_value == None:
filter_value = ""
if hop_mode != None:
if hop_mode == "accurate":
mode = 0
elif hop_mode == "nodelete":
mode = 1
elif hop_mode == "neverdelete":
mode = 2
raise Exception("Illegal mode %s" % hop_mode)
# "Collections" tab
link = window.find_link("Collections tab")
window = vb.find_window("")
form = window.find_form("editjob")
# textarea for setting collection
if collection_name != None:
form.find_textarea("gts_collectionname").set_value( collection_name )
# "Template" tab
link = window.find_link("Template tab")
window = vb.find_window("")
form = window.find_form("editjob")
# textarea for setting document template
if document_template != None:
form.find_textarea("gts_documenttemplate").set_value( document_template )
# "Paths" tab
link = window.find_link("Paths tab")
window = vb.find_window("")
form = window.find_form("editjob")
# Now, set up paths and matches
path_index = 0
for pathelement in startpoints_and_matches:
path, matches = pathelement
# Add the path
form.find_textarea("specpath").set_value( path )
# Click the "Add" button
window.find_button("Add new path").click( )
window = vb.find_window("")
form = window.find_form("editjob")
# Now, go through the matches
for match in matches:
include_exclude, match_type, match_value = match
assert include_exclude == "include" or include_exclude == "exclude"
assert match_type == "file" or match_type == "directory"
form.find_selectbox("specflavor_%d" % path_index).select_value(include_exclude)
form.find_selectbox("spectype_%d" % path_index).select_value(match_type)
form.find_textarea("specmatch_%d" % path_index).set_value(match_value)
window.find_button("Add new match for path #%d" % path_index).click( )
window = vb.find_window("")
form = window.find_form("editjob")
path_index += 1
# Finally, submit the form
window.find_button("Save this job").click( )
window = vb.find_window("")
jobid = window.find_match("<!--jobid=(.*)-->",1)
return jobid
# Delete a job using the UI
# Follows "List jobs" and then "Delete job <jobid>" in the crawler UI, raising if either
# link is reported as None, and finally checks that "View job <jobid>" no longer appears.
# NOTE(review): this copy is incomplete.  The body reads `password`, which is not in the
# visible parameter list (a signature line appears to be missing); the docstring is never
# closed; and each stray " )" marks a place where a click call was lost.  Restore from
# the original before use.
def delete_job_ui( username,
jobid ):
"""Delete the specified job using the UI. No attempt
is made to insure that the job actually can be deleted. If it is running
then this operation will fail.
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("List jobs")
if link == None:
raise Exception("Can't find list link for jobs"); )
# Grab the new window
window = vb.find_window("")
# Find the delete link
deletelink = window.find_link("Delete job "+jobid)
if deletelink == None:
raise Exception("Can't find delete link for job %s" % jobid) )
# Grab the new window, and confirm that the job is gone
window = vb.find_window("")
window.check_no_match("View job "+jobid)
# Start a job using the UI
# Follows "Manage jobs" in the crawler UI and looks up the "Start job <jobid>" link.
# NOTE(review): this copy is incomplete.  The body reads `password`, which is not in the
# visible parameter list (a signature line appears to be missing); the docstring is never
# closed; and each stray " )" marks a place where a click call was lost.  Restore from
# the original before use.
def start_job_ui( username,
jobid ):
""" Start a job, using the UI to do it. This does not confirm
that the job has started, due to the difficulty of getting
the timing right, but it does kick the job off at least.
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("Manage jobs") )
# Grab the new window
window = vb.find_window("")
# Find the link that would start the job
link = window.find_link("Start job "+jobid)
# Click it! )
# View repository connection via the UI (and check for 'working' status)
def view_repository_connection_ui( username, password, connection_name, match_string="Connection working" ):
    """Open a repository connection's view page in the crawler UI and check its status.

    username, password -- credentials handed to the virtual browser
    connection_name    -- name of the connection whose "View" link is followed
    match_string       -- pattern the view page is expected to contain
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for repository connection management and click it
    window = vb.find_window("")
    window.find_link("List repository connections").click( )

    # Now, find the view link for this connection and click it
    window = vb.find_window("")
    window.find_link("View "+connection_name).click( )

    # Verify that the connection was OK
    window = vb.find_window("")
    # NOTE(review): find_match is used as a page assertion elsewhere in this file;
    # presumably it raises when the pattern is absent -- confirm against VirtualBrowser.
    window.find_match( match_string )
# Delete a repository connection via the UI
def delete_repository_connection_ui( username, password, connection_name ):
    """Delete the named repository connection by clicking through the crawler UI.

    username, password -- credentials handed to the virtual browser
    connection_name    -- name of the connection whose "Delete" link is followed
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for repository connection management and click it
    window = vb.find_window("")
    window.find_link("List repository connections").click( )

    # Now, find the delete link for this connection and click it
    window = vb.find_window("")
    window.find_link("Delete "+connection_name).click( )

    # Verify that the connection was deleted
    window = vb.find_window("")
    # simply make sure it's not an error screen
    window.find_match("List of Repository Connections")
# Delete an authority connection via the UI
def delete_authority_connection_ui( username, password, connection_name ):
    """Delete the named authority connection by clicking through the crawler UI.

    username, password -- credentials handed to the virtual browser
    connection_name    -- name of the connection whose "Delete" link is followed
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for authority connection management and click it
    window = vb.find_window("")
    window.find_link("List authorities").click( )

    # Now, find the delete link for this connection and click it
    window = vb.find_window("")
    window.find_link("Delete "+connection_name).click( )

    # Verify that the connection was deleted
    window = vb.find_window("")
    # simply make sure it's not an error screen
    window.find_match("List of Authority Connections")
# Pause a job using the UI
# Follows "Manage jobs" in the crawler UI and looks up the "Pause job <jobid>" link.
# NOTE(review): this copy is incomplete.  The body reads `password`, which is not in the
# visible parameter list (a signature line appears to be missing); the docstring is never
# closed; and each stray " )" marks a place where a click call was lost.  Restore from
# the original before use.
def pause_job_ui( username,
jobid ):
""" Pause a job, using the UI to do it. This does not confirm
that the job has paused, due to the difficulty of getting
the timing right, but it does pause the job off at least.
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("Manage jobs") )
# Grab the new window
window = vb.find_window("")
# Find the link that would pause the job
link = window.find_link("Pause job "+jobid)
# Click it! )
# Resume a job using the UI
# Follows "Manage jobs" in the crawler UI and looks up the "Resume job <jobid>" link.
# NOTE(review): this copy is incomplete.  The body reads `password`, which is not in the
# visible parameter list (a signature line appears to be missing); the docstring is never
# closed; and each stray " )" marks a place where a click call was lost.  Restore from
# the original before use.
def resume_job_ui( username,
jobid ):
""" Resume a job, using the UI to do it. This does not confirm
that the job has resumed, due to the difficulty of getting
the timing right.
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("Manage jobs") )
# Grab the new window
window = vb.find_window("")
# Find the link that would resume the job
link = window.find_link("Resume job "+jobid)
# Click it! )
# Get a job's status string from the UI
def get_job_status_ui( username, password, jobid ):
    """Find a job's status in the UI given its ID.

    Opens the "Manage jobs" page and returns the text of the table cell that
    follows the cell carrying the <!--jobid=...--> marker for jobid.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for job management and click it
    window = vb.find_window("")
    window.find_link("Manage jobs").click( )

    # Grab the new window
    window = vb.find_window("")

    # Use the built-in function to look for a match.  Group 1 is the rest of the
    # cell holding the jobid marker; group 2 is the next cell, which is returned.
    status_pattern = "<td class=\"columncell\"><!--jobid=%s-->([^<]*)</td><td class=\"columncell\">([^<]*)</td>" % jobid
    return window.find_match( status_pattern, group=2)
def find_job_by_name_ui( username, password, jobname, connectionname ):
    """Look for a job matching the provided description.

    First checks the "List jobs" page for a table row containing both jobname
    and connectionname, then opens "Manage jobs" and returns the numeric job id
    taken from the <!--jobid=...--> marker in front of jobname.
    NOTE(review): jobname and connectionname are inserted into the patterns
    unescaped, so regexp metacharacters in them are interpreted as such.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for job management and click it
    window = vb.find_window("")
    window.find_link("List jobs").click( )

    # Check that a row with this job name and connection name is listed
    window = vb.find_window("")
    window.find_match_no_newlines("<tr[^>]*>.*<td[^>]*>%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*</tr[^>]*>" % (jobname,connectionname))

    # Move on to the status page, which is where the job id can be scraped from
    window.find_link("Manage jobs").click( )

    # Get the text and look for something that matches what we are looking for.
    # This is complicated by the fact that python's regexp processing doesn't like newlines, so eliminate those first.
    window = vb.find_window("")
    return window.find_match_no_newlines("<!--jobid=([0123456789]*)-->%s<" % (jobname), group=1)
def find_connection_by_name_ui( username, password, connectionname ):
    """Look for a connection matching the provided description.

    Opens the "List repository connections" page and looks up the
    "View <connectionname>" link.  Nothing is returned.
    NOTE(review): presumably find_link raises when the link is absent, which is
    what makes this a check -- confirm against VirtualBrowser.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for repository connection management and click it
    window = vb.find_window("")
    window.find_link("List repository connections").click( )

    # Look for a view link for the correct connection name
    window = vb.find_window("")
    window.find_link("View %s" % connectionname)
def find_status_by_job_name_ui( username, password, jobname, jobstatus, totalqueue, activequeue, remainingqueue ):
    """Look for a matching jobstatus.

    Opens the "Manage jobs" page and searches for a table row whose cells
    contain, in order, jobname (preceded by its <!--jobid=...--> marker),
    jobstatus, totalqueue, activequeue and remainingqueue.  Returns the job id
    captured from the marker.
    NOTE(review): the five values are inserted into the pattern unescaped, so
    regexp metacharacters in them are interpreted as such.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for job management and click it
    window = vb.find_window("")
    window.find_link("Manage jobs").click( )

    # Get the text and look for something that matches what we are looking for.
    # This is complicated by the fact that python's regexp processing doesn't like newlines, so eliminate those first.
    window = vb.find_window("")
    row_pattern = "<tr[^>]*>.*<td[^>]*><!--jobid=([0123456789]*)-->%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*</tr[^>]*>" % (jobname,jobstatus,totalqueue,activequeue,remainingqueue)
    return window.find_match_no_newlines(row_pattern, group=1)
# Parse a date of the form 03-25-2008
def parse_date( date_value ):
    """Turn a month-day-year string into a (year, month, day) tuple of ints."""
    pieces = date_value.split("-")
    # The year is the third field in the text but comes first in the result
    year = int(pieces[2])
    month = int(pieces[0])
    day = int(pieces[1])
    return (year, month, day)
# Parse a time of the form 10:11:20.523
def parse_time( time_value ):
    """Turn an hour:minute:second string into an (int, int, float) tuple."""
    pieces = time_value.split(":")
    hour = int(pieces[0])
    minute = int(pieces[1])
    # Seconds may carry a fractional part, so they stay a float
    second = float(pieces[2])
    return (hour, minute, second)
# Parse a time value from a report into seconds since epoch.
# Time values look something like this: 03-25-2008 10:11:20.523
# Note there is no timezone, so the value must be parsed in terms of the appliance's
# default timezone.
def parse_date_time( time_value ):
    """Convert a report timestamp into seconds since the epoch, in local time.

    The text is split on a single space into a date part and a time part.
    The fractional part of the seconds is dropped before the conversion.
    """
    date_text_and_time_text = time_value.split(" ")
    year, month, day = parse_date(date_text_and_time_text[0])
    hour, minute, second = parse_time(date_text_and_time_text[1])
    # The three trailing -1 values leave weekday, yearday and DST for mktime to work out
    broken_down = (year, month, day, hour, minute, int(second), -1, -1, -1)
    return time.mktime(broken_down)
# Helper method: Decode html found in result data. This removes nobr's, a tags, and converts br's to newlines
def decode_html( raw_data ):
    """Reduce a fragment of report HTML to plain text.

    Every opening tag that starts with "<a" is cut out up to its closing ">",
    the nobr and closing a tags are removed, "<br/>" becomes a newline, and
    the result is stripped of surrounding whitespace.

    Raises Exception if an "<a" has no ">" after it.
    NOTE(review): the scan looks for the bare text "<a", so any other tag whose
    name starts with "a" is removed the same way.
    """
    kept_pieces = []
    start_pos = 0
    while True:
        new_pos = raw_data.find("<a",start_pos)
        if new_pos == -1:
            # No more opening tags: keep the remainder and stop scanning
            kept_pieces.append( raw_data[start_pos:] )
            break
        kept_pieces.append( raw_data[start_pos:new_pos] )
        end_pos = raw_data.find(">",new_pos)
        if end_pos == -1:
            raise Exception("Can't find end of '<a' tag")
        start_pos = end_pos+1
    replace_data = "".join(kept_pieces)
    for markup, replacement in ( ("<nobr>",""),
                                 ("</nobr>",""),
                                 ("<nobr/>",""),
                                 ("<br/>","\n"),
                                 ("</a>","") ):
        replace_data = replace_data.replace(markup, replacement)
    return replace_data.strip()
# Helper method to parse a single table row. Returns data as an array.
def process_table_row( data, start_position ):
    """Return the decoded contents of each <td> cell in one table row.

    data           -- the page text
    start_position -- offset at or before the row's first cell; the row is taken
                      to end at the first "</tr>" found from here

    Each cell's text is passed through decode_html.  Raises Exception when the
    row has no "</tr>", or when a cell is not properly closed within the row.
    """
    cell_close = "</td>"
    row_end = data.find("</tr>",start_position)
    if row_end == -1:
        raise Exception( "Couldn't find end of row in '%s' at position %d" % (data,start_position) )
    current_position = start_position
    rval = []
    while True:
        new_position = data.find("<td",current_position)
        if new_position == -1 or new_position > row_end:
            # No further cell starts inside this row
            return rval
        # Skip over the rest of the opening tag (attributes and all)
        new_position = data.find(">",new_position)
        if new_position == -1:
            raise Exception("Missing td endbracket")
        new_position = new_position + 1
        end_position = data.find(cell_close,new_position)
        if end_position == -1:
            raise Exception("Missing </td> tag")
        if end_position > row_end:
            raise Exception("Found </tr> before </td>")
        raw_data = data[new_position:end_position]
        # Resume scanning just past the whole closing tag
        current_position = end_position + len(cell_close)
        rval.append( decode_html(raw_data) )
# Helper method to parse a single result row. Returns data as a dictionary, with one element for each <td> cell
def process_data_row( data, headers, start_position ):
    """Parse one data row and label its cells with the given header names.

    Returns a dictionary mapping each entry of headers to the cell at the same
    position.  Raises Exception when the row's cell count differs from the
    number of headers.
    """
    cells = process_table_row( data, start_position )
    if len(cells) != len(headers):
        raise Exception( "Row data does not agree with header data in '%s' at position %d" % (data,start_position))
    return dict(zip(headers, cells))
# Helper method to parse an entire resultset. Returns result as an array of dictionaries.
def parse_tableresult( data ):
    """Scrape a report table out of page text.

    The header row (class "headerrow") supplies the dictionary keys; data rows
    are then read alternately from class "evendatarow" and "odddatarow", starting
    with an even row, until the next expected row is not found.

    Raises Exception when the page has no header row.
    """
    # We look for tr's of class "headerrow" followed by "evendatarow" and "odddatarow"
    current_pos = data.find('<tr class="headerrow">')
    if current_pos == -1:
        raise Exception( "Missing report header row in data '%s'" % data )
    # Parse the header row into header data
    headers = process_table_row( data, current_pos )
    row_markers = ( "<tr class=\"evendatarow\">", "<tr class=\"odddatarow\">" )
    rval = []
    while True:
        # The marker to look for flips between even and odd with every row read
        new_pos = data.find(row_markers[len(rval) % 2],current_pos)
        if new_pos == -1:
            break
        rval.append( process_data_row( data, headers, new_pos ) )
        current_pos = new_pos
    return rval
# Sanity check a report page
def report_sanity_check( window, more):
    """Look at the previous/next buttons to make sure they grey out when they are supposed to, etc.

    window -- the report result window to inspect
    more   -- True when the caller expects further result pages to exist

    Raises Exception when a button or the displayed row range disagrees with
    those expectations.
    NOTE(review): assumes window.find_button raises when the button is absent --
    confirm against VirtualBrowser.
    """
    # First, we should not be able to find the "Previous page" button.
    try:
        window.find_button("Previous page")
    except Exception:
        pass
    else:
        raise Exception("Previous page button showed up when it shouldn't have")

    # The "next" button should show up exactly when more results are expected.
    try:
        window.find_button("Next page")
        found_it = True
    except Exception:
        found_it = False
    if found_it and not more:
        raise Exception("Next page button showed up when it shouldn't have")
    if not found_it and more:
        raise Exception("Next page button didn't show up but it should have")

    # The row range is displayed as "<first>-<last>" or "<first>-END"
    row_range = window.find_match("<td class=\"description\"><nobr>Rows:</nobr></td><td class=\"value\">([0-9]*-([0-9]*|END))</td>", 1)
    values = row_range.split("-")
    if values[0] != "0":
        raise Exception("Row range should have begun at zero - instead saw %s" % values[0])
    if values[1] != "END" and not more:
        raise Exception("Row range should have ended with END - instead saw %s" % values[1])
# Format activity list in a manner acceptable to the API methods
def format_activity_list(activities_list):
    """Render a list of activity names as one comma-separated string for the API scripts."""
    delimiter = ","
    return delimiter.join(activities_list)
# Format time value in a manner acceptable to the API methods
def format_time(ms_since_epoch):
    """Format a time in ms since epoch in a manner acceptable to the API scripts.

    Returns the string form of the value, or an empty string when the value is
    None or its string form is empty.
    """
    if ms_since_epoch is None:
        return ""
    # str() of an empty string is itself empty, which is the fallback value anyway
    return str(ms_since_epoch)
# Format window size in minutes for the API methods
def format_window_size(minutes):
    """Format window size in minutes.

    Returns the string form of the value, or "5" when the value is None or its
    string form is empty.
    """
    if minutes is not None:
        rendered = str(minutes)
        if rendered:
            return rendered
    return "5"
# Split a comma-delimited line into multiple columns, unescaping in the process
def split_api_result_line(result_line):
    """Split a comma-delimited line into multiple columns, unescaping in the process.

    A backslash makes the character after it literal, so an escaped comma does
    not end a column and an escaped backslash yields a single backslash.  A
    backslash at the very end of the line is kept as it is.  Always returns at
    least one column; an empty line gives [""].
    """
    return_value = []
    current_chars = []
    index = 0
    line_length = len(result_line)
    while index < line_length:
        result_char = result_line[index]
        index += 1
        if result_char == '\\' and index < line_length:
            # Take the escaped character as-is and step past it, so that it is
            # not looked at again as a separator or as another escape
            current_chars.append( result_line[index] )
            index += 1
        elif result_char == ',':
            return_value.append( "".join(current_chars) )
            current_chars = []
        else:
            current_chars.append( result_char )
    return_value.append( "".join(current_chars) )
    return return_value
# Digest report API call result and an array of dictionaries corresponding to the results returned
def process_api_result(result, column_list):
    """Turn the text output of an API script into a list of dictionaries.

    Each line of result is split into columns with split_api_result_line; the
    columns are then labelled, by position, with the names in column_list and
    stripped of surrounding whitespace.  Columns beyond the end of column_list
    are ignored.
    """
    rows = []
    for result_line in result.splitlines():
        value_array = split_api_result_line(result_line)
        record = {}
        for position, column_name in enumerate(column_list):
            record[column_name] = value_array[position].strip()
        rows.append( record )
    return rows
# List jobs using the API and return the results.
# Runs /usr/lib/metacarta/crawler-listjobs and feeds its output to process_api_result.
# NOTE(review): the column-name list passed to process_api_result is cut off after
# "identifier" in this copy; the remaining names must be restored from the original.
def list_jobs_api( ):
""" List jobs using the API """
result = invoke_script( ["/usr/lib/metacarta/crawler-listjobs"] )
return process_api_result(result,["identifier",
# List job statuses using the API and return the results
# Runs /usr/lib/metacarta/crawler-listjobstatuses and feeds its output to process_api_result.
# NOTE(review): the column-name list passed to process_api_result is cut off after
# "identifier" in this copy; the remaining names must be restored from the original.
def list_job_statuses_api( ):
""" List job statuses using the API and return the results """
result = invoke_script( ["/usr/lib/metacarta/crawler-listjobstatuses"] )
return process_api_result(result,["identifier",
# Get job collections using the API and return the results
def get_job_collections_api( job_id ):
    """Get job collections using the API and return the results.

    Runs /usr/lib/metacarta/crawler-getjobcollections for job_id and returns a
    list of dictionaries, one per output line, each with the single key
    "collection".
    """
    result = invoke_script( ["/usr/lib/metacarta/crawler-getjobcollections", job_id] )
    return process_api_result(result,["collection"])
# Get job schedule using the API and return the results
# Runs /usr/lib/metacarta/crawler-getjobschedule for job_id and feeds its output to
# process_api_result.
# NOTE(review): the column-name list passed to process_api_result is cut off after
# "daysofweek" in this copy; the remaining names must be restored from the original.
def get_job_schedule_api( job_id ):
""" Get job schedule using the API and return the results """
result = invoke_script( ["/usr/lib/metacarta/crawler-getjobschedule", job_id] )
return process_api_result(result,["daysofweek",
# Run a simple history report using the API and return the results.
# Runs /usr/lib/metacarta/crawler-runsimplehistory and decodes its output into dictionaries
# with the keys identifier, activity, start_time, elapsed_time, result_code, result_desc
# and byte_count.
# NOTE(review): this copy is incomplete.  At least one signature line is missing between
# connection_name and start_time, the docstring is never closed, and the argument list
# handed to invoke_script is cut off between the script path and str(max_result_count).
# Restore from the original before use.
def run_simple_history_report_api( connection_name,
start_time=None, end_time=None,
entity_regexp=None, result_regexp=None,
start_result_row=0, max_result_count=10000 ):
""" Run a simple history report. Return an array of dictionaries, each dictionary having fields that correspond to the data
from the report. For arguments, None indicates the default (start time
is one hour ago, end time is now).
result = invoke_script( ["/usr/lib/metacarta/crawler-runsimplehistory",
str(max_result_count) ] )
# Decode the result
return process_api_result(result,["identifier","activity","start_time","elapsed_time","result_code","result_desc","byte_count"])
# Run a simple history report from the UI and return the results.
# Drives the crawler UI at http://localhost/crawler/index.jsp through a VirtualBrowser:
# follows "Simple history", picks the connection in the "report" form, continues, selects
# the activities and optional match expressions and start/end times, executes the query,
# and returns the result page parsed by parse_tableresult.
# start_time and end_time, when given, are unpacked as (hour, minute, day, month, year).
# NOTE(review): this copy is incomplete.  The body reads `activities_list`, which is not in
# the visible parameter list (a signature line appears to be missing); the docstring is
# never closed; the `else:` lines between each pair of select_value calls are gone, so as
# shown every time selector is reset to "" right after being set; the stray " )" marks a
# lost click call; and the consistency check announced just before the return is absent.
# Restore from the original before use.
def run_simple_history_report_ui( username, password, connection_name,
start_time=None, end_time=None,
entity_regexp=None, result_regexp=None,
max_result_count=10000, more=False ):
""" Run a simple history report. Return an array of dictionaries, each dictionary having fields that correspond to the header data
from the report. For arguments, None indicates the default (start time
is one hour ago, end time is now). Inside the tuples, any None in any one place indicates the current time for the
whole tuple, so (None,None,None,None,None) would be one way of indicating that.
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for job management and click it
window = vb.find_window("")
link = window.find_link("Simple history") )
# Select the connection
# Grab the new window
window = vb.find_window("")
form = window.find_form("report")
form.find_selectbox("reportconnection").select_value( connection_name )
# Submit
window.find_button("Continue").click( )
# Set up all the other report parameters
# Select the activities
window = vb.find_window("")
form = window.find_form("report")
activities_select = form.find_selectbox("reportactivities")
for activity in activities_list:
activities_select.multi_select_value( activity )
# Entity match
if entity_regexp != None:
form.find_textarea("reportentitymatch").set_value( entity_regexp )
if result_regexp != None:
form.find_textarea("reportresultcodematch").set_value( result_regexp )
# Select the start time and end time
if start_time != None:
start_hour, start_minute, start_day, start_month, start_year = start_time
if start_hour != None:
form.find_selectbox("reportstarthour").select_value( str(start_hour) )
form.find_selectbox("reportstarthour").select_value( "" )
if start_minute != None:
form.find_selectbox("reportstartminute").select_value( str(start_minute) )
form.find_selectbox("reportstartminute").select_value( "" )
if start_day != None:
form.find_selectbox("reportstartday").select_value( str(start_day) )
form.find_selectbox("reportstartday").select_value( "" )
if start_month != None:
form.find_selectbox("reportstartmonth").select_value( str(start_month) )
form.find_selectbox("reportstartmonth").select_value( "" )
if start_year != None:
form.find_selectbox("reportstartyear").select_value( str(start_year) )
form.find_selectbox("reportstartyear").select_value( "" )
if end_time != None:
end_hour, end_minute, end_day, end_month, end_year = end_time
if end_hour != None:
form.find_selectbox("reportendhour").select_value( str(end_hour) )
form.find_selectbox("reportendhour").select_value( "" )
if end_minute != None:
form.find_selectbox("reportendminute").select_value( str(end_minute) )
form.find_selectbox("reportendminute").select_value( "" )
if end_day != None:
form.find_selectbox("reportendday").select_value( str(end_day) )
form.find_selectbox("reportendday").select_value( "" )
if end_month != None:
form.find_selectbox("reportendmonth").select_value( str(end_month) )
form.find_selectbox("reportendmonth").select_value( "" )
if end_year != None:
form.find_selectbox("reportendyear").select_value( str(end_year) )
form.find_selectbox("reportendyear").select_value( "" )
# Fire off the query
window.find_button("Execute this query").click( )
# Get the window contents, and scrape out the report data
window = vb.find_window("")
# Make sure everything about the response is consistent
return parse_tableresult( window.get_data() )
# Run a max activity history report using the API and return the results.
# Runs /usr/lib/metacarta/crawler-runmaxactivityhistory and decodes its output into
# dictionaries with the keys identifier_bucket, starttime_ms, endtime_ms and activity_count.
# NOTE(review): this copy is incomplete.  At least one signature line is missing between
# connection_name and start_time, the docstring is never closed, and the argument list
# handed to invoke_script is cut off between the script path and str(max_result_count).
# Restore from the original before use.
def run_max_activity_history_report_api( connection_name,
start_time=None, end_time=None,
entity_regexp=None, result_regexp=None,
start_result_row=0, max_result_count=10000,
entity_bin_regexp=None, window_size_minutes=None ):
""" Run a max activity history report. Return an array of dictionaries, each dictionary having fields that correspond to the data
from the report. For arguments, None indicates the default (start time
is one hour ago, end time is now).
result = invoke_script( ["/usr/lib/metacarta/crawler-runmaxactivityhistory",
str(max_result_count) ] )
# Decode the result
return process_api_result(result,["identifier_bucket","starttime_ms","endtime_ms","activity_count"])
# Run a max activity history report from the UI and return the results.
def run_max_activity_history_report_ui( username, password, connection_name,
                                    start_time=None, end_time=None,
                                    entity_regexp=None, result_regexp=None,
                                    max_result_count=10000, more=False,
                                    entity_bin_regexp=None, window_size_minutes=None,
                                    activities_list=None ):
    """ Run a max activity history report through the crawler UI.

    Drives a VirtualBrowser session against http://localhost/crawler/index.jsp:
    follows the "Maximum activity" link, picks the connection, fills in the
    report form and executes the query.

    Arguments:
    username, password -- credentials handed to VirtualBrowser
    connection_name -- value selected in the "reportconnection" select box
    start_time, end_time -- None leaves the form's time selectors untouched
        (the default is start time one hour ago, end time now); otherwise a
        (hour, minute, day, month, year) tuple.  A None inside the tuple selects
        the empty value for that selector; per the original description, a None
        in any one place indicates the current time for the whole tuple.
    entity_regexp -- text for "reportentitymatch", skipped when None
    result_regexp -- text for "reportresultcodematch", skipped when None
    max_result_count, more -- accepted for interface compatibility; not used by this body
    entity_bin_regexp -- text for "reportbucketdesc", skipped when None
    window_size_minutes -- value for "reportinterval", skipped when None
    activities_list -- optional sequence of activity names to multi-select in
        "reportactivities"; None or empty selects nothing

    Returns the result of parse_tableresult() on the result window's data, described
    as an array of dictionaries whose fields correspond to the report's header data.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for the report and follow it
    window = vb.find_window("")
    link = window.find_link("Maximum activity")
    # NOTE(review): the link was looked up but never followed; click() is assumed to exist
    # on link objects as it does on buttons -- confirm against VirtualBrowser.
    link.click( )

    # Grab the new window and select the connection
    window = vb.find_window("")
    form = window.find_form("report")
    form.find_selectbox("reportconnection").select_value( connection_name )
    # Submit
    window.find_button("Continue").click( )

    # Set up all the other report parameters
    window = vb.find_window("")
    form = window.find_form("report")

    # Select the activities
    activities_select = form.find_selectbox("reportactivities")
    for activity in activities_list or ():
        activities_select.multi_select_value( activity )

    # Entity and result code matches
    if entity_regexp is not None:
        form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Select the start time and end time.  Selector names are built as
    # "reportstart"/"reportend" + "hour"/"minute"/"day"/"month"/"year".
    for prefix, report_time in ( ("reportstart", start_time), ("reportend", end_time) ):
        if report_time is None:
            continue
        hour, minute, day, month, year = report_time
        for suffix, value in ( ("hour", hour), ("minute", minute), ("day", day),
                               ("month", month), ("year", year) ):
            selectbox = form.find_selectbox( prefix + suffix )
            if value is None:
                selectbox.select_value( "" )
            else:
                selectbox.select_value( str(value) )

    # Bucketing and window size
    if entity_bin_regexp is not None:
        form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if window_size_minutes is not None:
        form.find_textarea("reportinterval").set_value( str(window_size_minutes) )

    # Fire off the query
    window.find_button("Execute this query").click( )

    # Get the window contents, and scrape out the report data
    window = vb.find_window("")
    return parse_tableresult( window.get_data() )
# Run a max bandwidth history report using the API and return the results.
def run_max_bandwidth_history_report_api( connection_name,
                                    start_time=None, end_time=None,
                                    entity_regexp=None, result_regexp=None,
                                    start_result_row=0, max_result_count=10000,
                                    entity_bin_regexp=None, window_size_minutes=None ):
    """ Run a max bandwidth history report through the command-line API.

    Invokes /usr/lib/metacarta/crawler-runmaxbandwidthhistory and decodes what
    it returns with process_api_result().

    Returns an array of dictionaries, each dictionary having fields that
    correspond to the data from the report: "identifier_bucket",
    "starttime_ms", "endtime_ms" and "byte_count".

    For arguments, None indicates the default (start time is one hour ago,
    end time is now).
    """
    # NOTE(review): only max_result_count is forwarded to the script in this body;
    # connection_name and the remaining filter arguments are accepted but unused here.
    # Confirm the script's expected argument list before relying on them.
    result = invoke_script( ["/usr/lib/metacarta/crawler-runmaxbandwidthhistory",
                             str(max_result_count) ] )
    # Decode the result
    return process_api_result(result,["identifier_bucket","starttime_ms","endtime_ms","byte_count"])
# Run a max bandwidth history report from the UI and return the results.
def run_max_bandwidth_history_report_ui( username, password, connection_name,
                                    start_time=None, end_time=None,
                                    entity_regexp=None, result_regexp=None,
                                    max_result_count=10000, more=False,
                                    entity_bin_regexp=None, window_size_minutes=None,
                                    activities_list=None ):
    """ Run a max bandwidth history report through the crawler UI.

    Drives a VirtualBrowser session against http://localhost/crawler/index.jsp:
    follows the "Maximum bandwidth" link, picks the connection, fills in the
    report form and executes the query.

    Arguments:
    username, password -- credentials handed to VirtualBrowser
    connection_name -- value selected in the "reportconnection" select box
    start_time, end_time -- None leaves the form's time selectors untouched
        (the default is start time one hour ago, end time now); otherwise a
        (hour, minute, day, month, year) tuple.  A None inside the tuple selects
        the empty value for that selector; per the original description, a None
        in any one place indicates the current time for the whole tuple.
    entity_regexp -- text for "reportentitymatch", skipped when None
    result_regexp -- text for "reportresultcodematch", skipped when None
    max_result_count, more -- accepted for interface compatibility; not used by this body
    entity_bin_regexp -- text for "reportbucketdesc", skipped when None
    window_size_minutes -- value for "reportinterval", skipped when None
    activities_list -- optional sequence of activity names to multi-select in
        "reportactivities"; None or empty selects nothing

    Returns the result of parse_tableresult() on the result window's data, described
    as an array of dictionaries whose fields correspond to the report's header data.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for the report and follow it
    window = vb.find_window("")
    link = window.find_link("Maximum bandwidth")
    # NOTE(review): the link was looked up but never followed; click() is assumed to exist
    # on link objects as it does on buttons -- confirm against VirtualBrowser.
    link.click( )

    # Grab the new window and select the connection
    window = vb.find_window("")
    form = window.find_form("report")
    form.find_selectbox("reportconnection").select_value( connection_name )
    # Submit
    window.find_button("Continue").click( )

    # Set up all the other report parameters
    window = vb.find_window("")
    form = window.find_form("report")

    # Select the activities
    activities_select = form.find_selectbox("reportactivities")
    for activity in activities_list or ():
        activities_select.multi_select_value( activity )

    # Entity and result code matches
    if entity_regexp is not None:
        form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Select the start time and end time.  Selector names are built as
    # "reportstart"/"reportend" + "hour"/"minute"/"day"/"month"/"year".
    for prefix, report_time in ( ("reportstart", start_time), ("reportend", end_time) ):
        if report_time is None:
            continue
        hour, minute, day, month, year = report_time
        for suffix, value in ( ("hour", hour), ("minute", minute), ("day", day),
                               ("month", month), ("year", year) ):
            selectbox = form.find_selectbox( prefix + suffix )
            if value is None:
                selectbox.select_value( "" )
            else:
                selectbox.select_value( str(value) )

    # Bucketing and window size
    if entity_bin_regexp is not None:
        form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if window_size_minutes is not None:
        form.find_textarea("reportinterval").set_value( str(window_size_minutes) )

    # Fire off the query
    window.find_button("Execute this query").click( )

    # Get the window contents, and scrape out the report data
    window = vb.find_window("")
    return parse_tableresult( window.get_data() )
# Run a result histogram history report using the API and return the results.
def run_result_histogram_history_report_api( connection_name,
                                    start_time=None, end_time=None,
                                    entity_regexp=None, result_regexp=None,
                                    start_result_row=0, max_result_count=10000,
                                    entity_bin_regexp=None, result_bin_regexp=None ):
    """ Run a result histogram history report through the command-line API.

    Invokes /usr/lib/metacarta/crawler-runresulthistory and decodes what it
    returns with process_api_result().

    Returns an array of dictionaries, each dictionary having fields that
    correspond to the data from the report: "identifier_bucket",
    "resultcode_bucket" and "event_count".

    For arguments, None indicates the default (start time is one hour ago,
    end time is now).
    """
    # NOTE(review): only max_result_count is forwarded to the script in this body;
    # connection_name and the remaining filter arguments are accepted but unused here.
    # Confirm the script's expected argument list before relying on them.
    result = invoke_script( ["/usr/lib/metacarta/crawler-runresulthistory",
                             str(max_result_count) ] )
    # Decode the result
    return process_api_result(result,["identifier_bucket","resultcode_bucket","event_count"])
# Run a result histogram history report from the UI and return the results.
def run_result_histogram_history_report_ui( username, password, connection_name,
                                    start_time=None, end_time=None,
                                    entity_regexp=None, result_regexp=None,
                                    max_result_count=10000, more=False,
                                    entity_bin_regexp=None, result_bin_regexp=None,
                                    activities_list=None ):
    """ Run a result histogram history report through the crawler UI.

    Drives a VirtualBrowser session against http://localhost/crawler/index.jsp:
    follows the "Result histogram" link, picks the connection, fills in the
    report form and executes the query.

    Arguments:
    username, password -- credentials handed to VirtualBrowser
    connection_name -- value selected in the "reportconnection" select box
    start_time, end_time -- None leaves the form's time selectors untouched
        (the default is start time one hour ago, end time now); otherwise a
        (hour, minute, day, month, year) tuple.  A None inside the tuple selects
        the empty value for that selector; per the original description, a None
        in any one place indicates the current time for the whole tuple.
    entity_regexp -- text for "reportentitymatch", skipped when None
    result_regexp -- text for "reportresultcodematch", skipped when None
    max_result_count, more -- accepted for interface compatibility; not used by this body
    entity_bin_regexp -- text for "reportbucketdesc", skipped when None
    result_bin_regexp -- text for "reportresultdesc", skipped when None
    activities_list -- optional sequence of activity names to multi-select in
        "reportactivities"; None or empty selects nothing

    Returns the result of parse_tableresult() on the result window's data, described
    as an array of dictionaries whose fields correspond to the report's header data.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for the report and follow it
    window = vb.find_window("")
    link = window.find_link("Result histogram")
    # NOTE(review): the link was looked up but never followed; click() is assumed to exist
    # on link objects as it does on buttons -- confirm against VirtualBrowser.
    link.click( )

    # Grab the new window and select the connection
    window = vb.find_window("")
    form = window.find_form("report")
    form.find_selectbox("reportconnection").select_value( connection_name )
    # Submit
    window.find_button("Continue").click( )

    # Set up all the other report parameters
    window = vb.find_window("")
    form = window.find_form("report")

    # Select the activities
    activities_select = form.find_selectbox("reportactivities")
    for activity in activities_list or ():
        activities_select.multi_select_value( activity )

    # Entity and result code matches
    if entity_regexp is not None:
        form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Select the start time and end time.  Selector names are built as
    # "reportstart"/"reportend" + "hour"/"minute"/"day"/"month"/"year".
    for prefix, report_time in ( ("reportstart", start_time), ("reportend", end_time) ):
        if report_time is None:
            continue
        hour, minute, day, month, year = report_time
        for suffix, value in ( ("hour", hour), ("minute", minute), ("day", day),
                               ("month", month), ("year", year) ):
            selectbox = form.find_selectbox( prefix + suffix )
            if value is None:
                selectbox.select_value( "" )
            else:
                selectbox.select_value( str(value) )

    # Entity and result bucketing
    if entity_bin_regexp is not None:
        form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if result_bin_regexp is not None:
        form.find_textarea("reportresultdesc").set_value( result_bin_regexp )

    # Fire off the query
    window.find_button("Execute this query").click( )

    # Get the window contents, and scrape out the report data
    window = vb.find_window("")
    return parse_tableresult( window.get_data() )
# Document state name -> numeric value selected in the "statusdocumentstates" select box
document_state_dictionary = {
    "never_been_processed" : 0,
    "processed_at_least_once" : 1,
}

# Document status name -> numeric value selected in the "statusdocumentstatuses" select box
document_status_dictionary = {
    "no_longer_active" : 0,
    "in_progress" : 1,
    "being_expired" : 2,
    "being_deleted" : 3,
    "available_for_processing" : 4,
    "available_for_expiration" : 5,
    "not_yet_processable" : 6,
    "not_yet_expirable" : 7,
    "waiting_forever" : 8,
}
# Format the job id list
def format_job_list(job_list):
    """ Format a list of jobs in a manner acceptable to API methods.

    job_list -- sequence of job identifier strings
    Returns the identifiers joined by commas ("" for an empty sequence).
    """
    return ",".join(job_list)
# Format a time offset in minutes
def format_offset_minutes(time_offset_minutes):
    """ Format a time offset given in minutes as milliseconds.

    time_offset_minutes -- a number (or numeric string) of minutes, or None
    Returns the offset as a decimal string of whole milliseconds; None yields "0".
    """
    if time_offset_minutes is None:
        return "0"
    # Go through float so fractional minutes are honoured, then truncate to whole ms.
    # A plain int literal is used: Python promotes to long automatically where needed.
    return str(int(float(time_offset_minutes) * 60000))
# Standard document state name -> state name understood by the API
api_state_map = { "never_been_processed" : "neverprocessed", "processed_at_least_once" : "previouslyprocessed" }

# Format a state list
def format_state_list(document_states):
    """ Convert standard state strings into API state strings.

    document_states -- sequence of keys of api_state_map
    Returns the mapped names joined by commas, in input order.
    Raises KeyError for a state that is not in api_state_map.
    """
    return ",".join( [ api_state_map[state] for state in document_states ] )
# Standard document status name -> status name understood by the API
api_status_map = { "no_longer_active" : "inactive", "in_progress" : "processing",
                   "being_expired" : "expiring", "being_deleted" : "deleting",
                   "available_for_processing" : "readyforprocessing", "available_for_expiration" : "readyforexpiration",
                   "not_yet_processable" : "waitingforprocessing", "not_yet_expirable" : "waitingforexpiration",
                   "waiting_forever" : "waitingforever" }

# Format a status list
def format_status_list(document_statuses):
    """ Convert standard status strings into API status strings.

    document_statuses -- sequence of keys of api_status_map
    Returns the mapped names joined by commas, in input order.
    Raises KeyError for a status that is not in api_status_map.
    """
    return ",".join( [ api_status_map[status] for status in document_statuses ] )
# Run a document status report using the API and return the results.
def run_document_status_api( connection_name,
                        document_states=[ "never_been_processed", "processed_at_least_once" ],
                        document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                                            "available_for_processing", "available_for_expiration",
                                            "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                        start_result_row=0, max_result_count=10000 ):
    """ Run a document queue status report through the command-line API.

    Invokes /usr/lib/metacarta/crawler-rundocumentstatus and decodes what it
    returns with process_api_result().

    Returns an array of dictionaries, each dictionary having fields that
    correspond to data from the report: "doc_identifier", "job_description",
    "document_state", "document_status", "when_scheduled", "action_to_take",
    "remaining_retrycount" and "retrylimit_time".

    document_statuses is an array of strings, whose legal values are:
    "no_longer_active", "in_progress", "being_expired", "being_deleted",
    "available_for_processing", "available_for_expiration",
    "not_yet_processable", "not_yet_expirable", "waiting_forever".
    document_states is an array of strings, whose legal values are:
    "never_been_processed", and "processed_at_least_once".
    """
    # NOTE(review): only max_result_count is forwarded to the script in this body;
    # connection_name, the state/status filters and start_result_row are accepted but
    # unused here.  Confirm the script's expected argument list before relying on them.
    result = invoke_script( ["/usr/lib/metacarta/crawler-rundocumentstatus",
                             str(max_result_count) ] )
    # Decode the result
    return process_api_result(result,["doc_identifier","job_description","document_state","document_status","when_scheduled","action_to_take","remaining_retrycount","retrylimit_time"] )
# Run a document status report from the UI and return the results.
def run_document_status_ui( username, password, connection_name,
                        document_states=[ "never_been_processed", "processed_at_least_once" ],
                        document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                                            "available_for_processing", "available_for_expiration",
                                            "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                        max_result_count=10000, more=False,
                        job_list=None, time_offset_minutes=None, identifier_regexp=None ):
    """ Run a document queue status report through the crawler UI.

    Drives a VirtualBrowser session against http://localhost/crawler/index.jsp:
    follows the "Document status" link, picks the connection, picks the jobs,
    fills in the remaining report parameters and executes the query.

    Arguments:
    username, password -- credentials handed to VirtualBrowser
    connection_name -- value selected in the "statusconnection" select box
    document_states -- array of strings, whose legal values are:
        "never_been_processed", and "processed_at_least_once"; None skips the selection
    document_statuses -- array of strings, whose legal values are:
        "no_longer_active", "in_progress", "being_expired", "being_deleted",
        "available_for_processing", "available_for_expiration",
        "not_yet_processable", "not_yet_expirable", "waiting_forever";
        None skips the selection
    max_result_count, more -- accepted for interface compatibility; not used by this body
    job_list -- sequence of job values to multi-select in "statusjobs"; required
    time_offset_minutes -- value for "status_schedule_offset", skipped when None
    identifier_regexp -- text for "statusidentifiermatch", skipped when None

    Returns the result of parse_tableresult() on the result window's data, described
    as an array of dictionaries whose fields correspond to the report's header data.

    Raises Exception if job_list is None or empty, and KeyError for a state or status
    name missing from document_state_dictionary / document_status_dictionary.
    """
    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for the report and follow it
    window = vb.find_window("")
    link = window.find_link("Document status")
    # NOTE(review): the link was looked up but never followed; click() is assumed to exist
    # on link objects as it does on buttons -- confirm against VirtualBrowser.
    link.click( )

    # Grab the new window and select the connection
    window = vb.find_window("")
    form = window.find_form("report")
    form.find_selectbox("statusconnection").select_value( connection_name )
    # Submit
    window.find_button("Continue").click( )

    # Select the jobs
    window = vb.find_window("")
    form = window.find_form("report")
    if job_list is None or len(job_list) == 0:
        raise Exception("There must be some jobs!")
    job_select = form.find_selectbox("statusjobs")
    for job in job_list:
        job_select.multi_select_value( job )
    # Submit
    window.find_button("Continue").click( )

    # Set up all the other report parameters
    window = vb.find_window("")
    form = window.find_form("report")

    # Set the time offset, if any
    if time_offset_minutes is not None:
        form.find_textarea( "status_schedule_offset" ).set_value( str(time_offset_minutes) )

    # Select the states; the select box takes the numeric value, not the name
    if document_states is not None:
        state_select = form.find_selectbox("statusdocumentstates")
        for document_state in document_states:
            state_select.multi_select_value( str(document_state_dictionary[ document_state ]) )

    # Select the statuses, likewise by numeric value
    if document_statuses is not None:
        status_select = form.find_selectbox("statusdocumentstatuses")
        for document_status in document_statuses:
            status_select.multi_select_value( str(document_status_dictionary[ document_status ]) )

    # Identifier match
    if identifier_regexp is not None:
        form.find_textarea("statusidentifiermatch").set_value( identifier_regexp )

    # Fire off the query
    window.find_button("Execute this query").click( )

    # Get the window contents, and scrape out the report data
    window = vb.find_window("")
    return parse_tableresult( window.get_data() )
# Run a queue status report using the API and return the results.
def run_queue_status_api( connection_name,
                        document_states=[ "never_been_processed", "processed_at_least_once" ],
                        document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                                            "available_for_processing", "available_for_expiration",
                                            "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                        start_result_row=0, max_result_count=10000,
                        bucket_regexp=None ):
    """ Run a queue status report through the command-line API.

    Invokes /usr/lib/metacarta/crawler-runqueuestatus and decodes what it
    returns with process_api_result().

    Returns an array of dictionaries, each dictionary having fields that
    correspond to data from the report: "id_bucket", "inactive_count",
    "processing_count", "expiring_count", "deleting_count",
    "process_ready_count", "expire_ready_count", "process_waiting_count",
    "expire_waiting_count" and "waiting_forever_count".

    document_statuses is an array of strings, whose legal values are:
    "no_longer_active", "in_progress", "being_expired", "being_deleted",
    "available_for_processing", "available_for_expiration",
    "not_yet_processable", "not_yet_expirable", "waiting_forever".
    document_states is an array of strings, whose legal values are:
    "never_been_processed", and "processed_at_least_once".
    """
    # NOTE(review): only max_result_count is forwarded to the script in this body;
    # connection_name, the state/status filters, start_result_row and bucket_regexp are
    # accepted but unused here.  Confirm the script's expected argument list.
    result = invoke_script( ["/usr/lib/metacarta/crawler-runqueuestatus",
                             str(max_result_count) ] )
    # Decode the result
    return process_api_result(result,["id_bucket","inactive_count","processing_count","expiring_count","deleting_count",
                                      "process_ready_count","expire_ready_count","process_waiting_count","expire_waiting_count","waiting_forever_count" ] )
# Build a time value for a report given a time in seconds since epoch, using the current timezone
def build_report_time(seconds_since_epoch):
    """ Build a report time structure (hours, minutes, days, month, year) from a value of seconds since epoch.

    The value is broken down with time.localtime(), i.e. in the current timezone.
    The day of month and the month are returned zero-based (tm_mday - 1 and
    tm_mon - 1); hours, minutes and year are returned as localtime reports them.
    """
    local = time.localtime(seconds_since_epoch)
    return (local.tm_hour, local.tm_min, local.tm_mday - 1, local.tm_mon - 1, local.tm_year)
# Build a time value for an API report given a time in seconds since epoch
def build_api_time(seconds_since_epoch):
    """ Build a report time string (ms since epoch) from a value of seconds since epoch.

    Fractions of a millisecond are truncated by the int() conversion.
    """
    milliseconds = int(seconds_since_epoch * 1000)
    return str(milliseconds)
# Miscellaneous file system test helpers. Dave put these here because he wanted his new tests to be able to use them; they really are pretty test-specific though.
# Copy a folder to a (new) area
def copy_folder( source, target ):
    """ Copy a folder to a (new) area.

    Runs "mkdir -p target" followed by "cp -r source target", both through
    invoke_root_script().
    """
    # NOTE(review): target always exists by the time cp runs, so with standard cp
    # behaviour the copy lands beneath target rather than replacing it -- confirm
    # that callers expect that layout.
    invoke_root_script( [ "mkdir", "-p", target ] )
    invoke_root_script( [ "cp", "-r", source, target ] )
# Remove a folder
def delete_folder( target ):
    """ Remove a folder.

    Runs "rm -rf target" through invoke_root_script().
    """
    invoke_root_script( [ "rm", "-rf", target ] )
# Best-effort cleanup: each handler below prints its message and the exception only when
# print_errors is set, and none of them re-raises.
# NOTE(review): the statements these handlers guard are only partly visible in this view
# (no "try:" lines appear), and the function may continue beyond the last line shown.
def preclean( username, print_errors=True ):
''' Clean up everything we might have done during the execution of this test.
This will include all jobs and ingested documents. '''
# NOTE(review): the statement guarded by this first handler is not visible here; judging by
# the message it resets all jobs -- confirm.
except Exception, e:
if print_errors:
print "Error resetting all jobs"
print e
# Remove test documents first
for folder in [ "/root/crawlarea", "/root/crawlarea2" ]:
delete_folder( folder )
except Exception, e:
if print_errors:
print "Error removing %s" % folder
print e
# Remove the crawler user named by the username argument
delete_crawler_user( username )
except Exception, e:
if print_errors:
print "Error removing crawler user"
print e
# Tear down the connector environment
teardown_connector_environment( )
except Exception, e:
if print_errors:
print "Error cleaning up debs"
print e
# Since one of the tests deregisters the filesystem connector, reregister it here to be sure it exists.
register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
except Exception, e:
if print_errors:
print "Error reregistering file system connector"
print e