| #!/usr/bin/python |
| |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| import os |
| import sys |
| import errno |
| import time |
| import datetime |
| import subprocess |
| import socket |
| import signal |
| import re |
| import sqatools |
| from sqatools import docs, sqautils |
| import urllib |
| import urllib2 |
| import TestDocs |
| import SQAhelpers |
| import traceback |
| import VirtualBrowser |
| import pycurl |
| from sqatools import LicenseMakerClient |
| from sqatools import appliance |
| |
| sys.path.append("/usr/lib/metacarta") |
| |
| # These methods are helper methods that abstract from the various testing scripts |
| |
| # Set ingestion proxy timeout |
def set_proxy_timeout(timeout):
    """ Apply a new ingestion proxy timeout and bounce leafblower.

    timeout is handed unchanged to appliance.ingestion_proxy_adjust_timeout().
    Leafblower is then stopped and started again (presumably so the new
    setting is picked up -- confirm in sqatools).
    """
    appliance.ingestion_proxy_adjust_timeout(timeout)
    # Cycle leafblower: stop first, then start.
    for leafblower_step in (stop_leafblower, start_leafblower):
        leafblower_step()
| |
| # Restore ingestion proxy timeout to default |
def restore_default_proxy_timeout():
    """ Reset the ingestion proxy timeout and bounce leafblower.

    appliance.ingestion_proxy_adjust_timeout() is called with no argument
    (presumably selecting its built-in default -- confirm in sqatools), after
    which leafblower is stopped and started again.
    """
    appliance.ingestion_proxy_adjust_timeout()
    # Cycle leafblower: stop first, then start.
    for leafblower_step in (stop_leafblower, start_leafblower):
        leafblower_step()
| |
# Module-wide switch: True selects the legacy adtools module, False the
# adtools_md module (see initialize_adtools).  Starts out False.
use_legacy_adtools = False
| # Global adtools object, which will be overridden by various imports when initialized properly |
| #adtools = None |
| |
| # Synchronize ad mode. This detects the current setting of legacy mode, based on the appliance environment |
def synchronize_legacy_mode( ):
    """ Set use_legacy_adtools from the appliance's current configuration.

    Reads the /etc/alternatives/active_directory_control symlink: a target of
    /usr/lib/metacarta/active_directory_tool means legacy mode, a target of
    /usr/lib/metacarta/active_directory_control means multi-domain (MD) mode.
    Raises Exception for any other target.
    """
    global use_legacy_adtools
    legacy_flag_by_target = {
        "/usr/lib/metacarta/active_directory_tool": True,      # Legacy mode
        "/usr/lib/metacarta/active_directory_control": False,  # MD mode
    }
    link_target = os.readlink("/etc/alternatives/active_directory_control")
    if link_target not in legacy_flag_by_target:
        raise Exception("Could not synchronize legacy mode")
    use_legacy_adtools = legacy_flag_by_target[link_target]
| |
| # Select 'legacy' ad mode |
def select_legacy_mode(use_legacy_tools=True):
    """ Switch the appliance into or out of legacy AD mode if needed.

    Nothing is done when the requested mode equals the recorded
    use_legacy_adtools value.  Otherwise /usr/bin/active_directory_control is
    run with "downgrade" (entering legacy mode) or "upgrade-ad" (leaving it)
    and the module flag is updated to match.
    """
    global use_legacy_adtools
    if use_legacy_tools == use_legacy_adtools:
        # Already in the requested mode.
        return
    if use_legacy_tools:
        control_action = "downgrade"
    else:
        control_action = "upgrade-ad"
    invoke_root_script( [ "/usr/bin/active_directory_control", control_action ] )
    use_legacy_adtools = use_legacy_tools
| |
| # Cancel legacy mode (restore to legacy mode = off) |
def cancel_legacy_mode():
    """ Leave legacy mode; a no-op when the legacy flag is not set. """
    if not use_legacy_adtools:
        return
    select_legacy_mode(use_legacy_tools=False)
| |
def initialize_adtools():
    """ Import and return the adtools module matching the current mode.

    Returns wintools.adtools when use_legacy_adtools is true and
    wintools.adtools_md otherwise.  The import happens on every call.
    """
    if use_legacy_adtools:
        from wintools import adtools as selected_adtools
    else:
        from wintools import adtools_md as selected_adtools
    return selected_adtools
| |
# Start times of activities currently being timed, keyed by the caller's
# time_key (filled by start_timing, emptied by end_timing).
time_records = dict()
| |
def start_timing( time_key ):
    """ Remember the current time as the start of activity time_key.

    Raises Exception if time_key is already being timed.
    """
    global time_records
    if time_key in time_records:
        raise Exception("Two starts with the same key! %s" % time_key)
    time_records[time_key] = time.time()
| |
def end_timing( time_key, limit=None ):
    """ Finish timing activity time_key, print its duration, enforce limit.

    Raises Exception if start_timing was not called for time_key, or if limit
    (seconds) is given and the elapsed time is greater than it.  The record
    for time_key is removed before the limit is checked.
    """
    global time_records
    if time_key not in time_records:
        raise Exception("End without start! %s" % time_key)
    started_at = time_records[time_key]
    elapsed = time.time() - started_at
    print("Test activity %s took %f seconds." % (time_key,elapsed))
    del time_records[time_key]
    if limit is not None and elapsed > limit:
        raise Exception("Time for activity %s took too long; actual %f, limit %f" % (time_key,elapsed,limit))
| |
| # Do an http fetch using curl |
class CallbackClass:
    """ Collects data chunks handed to body_callback into self.contents. """

    def __init__(self):
        # Everything received so far, concatenated in arrival order.
        self.contents = ''

    def body_callback(self, buf):
        """ Append one received chunk to the accumulated contents. """
        self.contents += buf
| |
def invoke_curl( url_string, user_name=None, password=None ):
    """ Fetch url_string with pycurl and return the response body.

    Credentials are sent (as "user:password") only when both user_name and
    password are given.  Raises Exception("HTTP error <code>") when the HTTP
    status is anything other than 200.  The curl handle is always closed.
    """
    collector = CallbackClass()
    handle = pycurl.Curl()
    try:
        handle.setopt(handle.URL, url_string)
        handle.setopt(handle.WRITEFUNCTION, collector.body_callback)
        have_credentials = user_name is not None and password is not None
        if have_credentials:
            handle.setopt(handle.USERPWD, "%s:%s" % (user_name,password))
        handle.perform()
        http_status = handle.getinfo(pycurl.HTTP_CODE)
        if http_status != 200:
            raise Exception("HTTP error %s" % str(http_status))
    finally:
        handle.close()
    return collector.contents
| |
| # Set the clock forward enough to screw up thread sleeps |
def set_clock_forward():
    """ Set clock forward 1 day (for testing thread sleep).

    Stops ntp first, then sets the system clock with `date` to exactly one
    day after the current local time.  The previous hand-rolled arithmetic
    skipped to the 1st of the next month whenever the next day was past the
    28th, moving the clock by up to four days instead of one.
    Use restore_clock() to undo.
    """
    invoke_root_script(["/etc/init.d/ntp","stop"])
    tomorrow = datetime.datetime.today() + datetime.timedelta(days=1)
    # date accepts MMDDhhmmYYYY to set the clock.
    invoke_root_script(["date", tomorrow.strftime("%m%d%H%M%Y")])
| |
| # Set clock back to normal |
def restore_clock():
    """ Put the clock back to the time served by ntp.

    Runs ntpdate against time.metacarta.com, then starts the ntp service.
    """
    resync_command = ["/usr/sbin/ntpdate","time.metacarta.com"]
    ntp_start_command = ["/etc/init.d/ntp","start"]
    # Reset the clock first, then bring the ntp server back.
    invoke_root_script(resync_command)
    invoke_root_script(ntp_start_command)
| |
| # Count temporary files in temporary area that have a specified prefix |
def count_temporary_files( prefix_string ):
    """ Return how many entries listed by `ls -1 /var/tmp` start with prefix_string. """
    listing = invoke_root_script( [ "ls", "-1", "/var/tmp" ] )
    matching_names = [ entry for entry in listing.splitlines()
                       if entry.startswith( prefix_string ) ]
    return len(matching_names)
| |
| # Kick off a maintenance operation |
def run_maintenance( ):
    """ Run /usr/lib/metacarta/postgres-maintenance and check its output.

    Returns normally as soon as an output line contains
    "Postgresql maintenance completed".  If no such line is found, raises
    Exception: one message when no line mentioned VACUUM, another when a
    VACUUM was seen but completion was not reported.
    """
    output = invoke_root_script( [ "/usr/lib/metacarta/postgres-maintenance" ] )
    vacuum_seen = False
    for text in output.splitlines():
        if "VACUUM" in text:
            vacuum_seen = True
        if "Postgresql maintenance completed" in text:
            return
    if not vacuum_seen:
        raise Exception( "Maintenance did not perform a VACUUM; got %s" % output )
    raise Exception( "Maintenance operation did not complete successfully; output %s" % output)
| |
| # Find the metacarta-agents process id |
def find_daemon_pid( ):
    """ Return the pid of the AgentRun daemon, or None if it is not listed.

    Scans `ps -eo pid,command -w -w` output for the first line containing
    com.metacarta.agents.AgentRun and returns its first field as an int.
    """
    ps_output = invoke_root_script( [ "ps", "-eo", "pid,command", "-w", "-w" ] )
    for entry in ps_output.splitlines():
        if "com.metacarta.agents.AgentRun" in entry:
            # The pid is the first whitespace-separated column.
            return int(entry.split()[ 0 ])
    return None
| |
| # Find the daemon invocation command |
def confirm_daemon_switch( switch_string ):
    """ Check that the daemon was started with a given switch.

    Looks through `ps -eo command -w -w` output for the first line containing
    com.metacarta.agents.AgentRun and verifies that switch_string occurs in
    that command line.

    Raises Exception("Daemon process is not running") when no such process is
    listed (previously this surfaced as an AttributeError on None), and
    Exception naming the missing clause when the switch is absent.
    """
    process_list = invoke_root_script( [ "ps", "-eo", "command", "-w", "-w" ] )
    command = None
    for line in process_list.splitlines():
        if line.find("com.metacarta.agents.AgentRun") != -1:
            command = line
            break
    if command is None:
        raise Exception("Daemon process is not running")
    if command.find(switch_string) == -1:
        raise Exception("Expected command clause '%s', did not see it" % switch_string)
| |
| # Find the number of matching lsof lines for a regexp against the daemon process |
def count_lsof_daemon_lines( match_regexp ):
    """ Count `lsof -p <pid>` output lines for the AgentRun process that match
        match_regexp (searched anywhere in the line).

        Raises Exception("Daemon process is not running") when
        find_daemon_pid() returns None.
    """
    matcher = re.compile( match_regexp, 0 )

    daemon_pid = find_daemon_pid( )
    if daemon_pid is None:
        raise Exception("Daemon process is not running")

    lsof_output = invoke_root_script( [ "lsof", "-p", str(daemon_pid) ] )
    matching_lines = [ entry for entry in lsof_output.splitlines()
                       if matcher.search( entry ) is not None ]
    return len(matching_lines)
| |
# Find the virtual size and resident set size of the AgentRun process, as printed by ps (kilobytes, not bytes)
def calculate_daemon_memory( ):
    """ Return (vsize, rss) for the AgentRun process as integers.

    The values are the first two columns of the first
    `ps -eo vsize,rss,command -w -w` line that mentions
    com.metacarta.agents.AgentRun.  Returns (0,0) when no such line exists.
    """
    ps_output = invoke_root_script( [ "ps", "-eo", "vsize,rss,command", "-w", "-w" ] )
    for entry in ps_output.splitlines():
        if entry.find("com.metacarta.agents.AgentRun") == -1:
            continue
        columns = entry.split()
        return ( int(columns[ 0 ]), int(columns[ 1 ]) )

    return (0,0)
| |
| # Change CF logging setup; does NOT restart tomcat and metacarta-agents |
def configure_cf( desired_options ):
    """ Modifies the agents.conf file to set up desired logging behavior.

    desired_options is a dictionary mapping option names to the value strings
    that should be written for them.  NOTE WELL: only entries that already
    exist in /etc/metacarta/agents.conf are modified; nothing is added.  The
    option name is the exact text before the first '=' on the line (it is not
    stripped).  Comment lines, lines without '=' and unlisted options are
    copied unchanged; a trailing '#' comment on a modified line is kept.

    The new content is written to agents.conf.new and then renamed over the
    original.  Does not restart the tomcat or agents processes.  Finishes by
    calling clear_logs().
    """
    conf_path = "/etc/metacarta/agents.conf"
    new_conf_path = "/etc/metacarta/agents.conf.new"
    fh = open( conf_path, "r" )
    try:
        outputfh = open( new_conf_path, "w" )
        try:
            for line in fh:
                equals_index = line.find("=")
                if line.lstrip().startswith("#") or equals_index == -1:
                    outputfh.write(line)
                    continue
                option_name = line[0:equals_index]
                if option_name not in desired_options:
                    outputfh.write(line)
                    continue
                # Keep whatever follows the old value: a trailing comment if
                # there is one, otherwise just the line ending.
                comment_index = line.find("#",equals_index+1)
                if comment_index == -1:
                    comment_index = len(line.rstrip())
                outputfh.write(line[0:equals_index+1] + desired_options[option_name] + line[comment_index:])
        finally:
            outputfh.close()
    finally:
        fh.close()
    # os.rename() replaces an existing destination on POSIX, so there is no
    # need to remove agents.conf first (which left a window with no config
    # file, and lost it altogether if the rename then failed).
    os.rename( new_conf_path, conf_path )
    clear_logs()
| |
# Get the current position (inode, size) of the metacarta error log
def get_metacarta_log_pos(log_name="/var/log/metacarta/error.log"):
    """ Return a (inode, size) marker for the current end of log_name.

    The pair can later be handed to read_metacarta_log() as its position
    argument so that only newer lines are examined.
    """
    log_stat = os.stat( log_name )
    return (log_stat.st_ino, log_stat.st_size)
| |
| # Read the metacarta logs, looking for lines that match a regexp |
def read_metacarta_log( reg_exp, position, log_name="/var/log/metacarta/error.log" ):
    """ Return log lines matching reg_exp written after position.

    position is an (inode, offset) pair as returned by
    get_metacarta_log_pos().  If log_name still has that inode, it is read
    from offset.  Otherwise the log is assumed to have rolled once:
    "<log_name>.1" must carry the inode and is read from offset, followed by
    the whole of the new log_name.  Raises Exception when neither file has
    the recorded inode.
    """
    matcher = re.compile( reg_exp, 0 )
    inode,offset = position
    if os.stat( log_name ).st_ino == inode:
        return read_matching_lines_position( log_name, offset, matcher )
    rolled_name = "%s.1" % log_name
    if os.stat( rolled_name ).st_ino != inode:
        raise Exception("Log rolled but I can't figure out when")
    older_lines = read_matching_lines_position( rolled_name, offset, matcher )
    newer_lines = read_matching_lines_position( log_name, 0, matcher )
    return older_lines + newer_lines
| |
| # Read lines matching a regexp from a file starting at a position |
def read_matching_lines_position( file_name, position, regexp_pattern=None ):
    """ Return the lines of file_name, from byte offset position onward, that
    match regexp_pattern.

    regexp_pattern is a compiled pattern applied with search(); when it is
    None (the default) every line is returned.  Previously the None default
    raised inside the function and was silently turned into an empty result.

    A file that cannot be opened or read yields an empty list (there may be
    no log yet).  Only I/O errors are treated that way; other errors, such as
    a bad pattern object, now propagate instead of being swallowed.
    """
    try:
        fh = open( file_name, "r" )
        try:
            fh.seek(position,0)
            lines = fh.readlines()
        finally:
            fh.close()
    except (IOError, OSError):
        # No log
        return [ ]
    if regexp_pattern is None:
        return lines
    return [ line for line in lines if regexp_pattern.search( line ) is not None ]
| |
| # Clear logs |
def clear_logs():
    """ Delete /var/log/metacarta/java-agents/agents.log (rm -f) so later
    log reads are not confused by old entries. """
    agents_log = "/var/log/metacarta/java-agents/agents.log"
    invoke_root_script( [ "rm", "-f", agents_log ] )
| |
| # Read the log, searching for a particular regular expression, and return all lines |
| # that have that expression, in order. |
def read_log( reg_exp ):
    """ Return, in file order, every line of the java-agents agents.log that
    matches the regular expression reg_exp. """
    matcher = re.compile( reg_exp, 0 )
    agents_log = "/var/log/metacarta/java-agents/agents.log"
    return read_matching_lines_position( agents_log, 0, matcher )
| |
| # Shutdown tomcat |
def shutdown_tomcat():
    """ Stop tomcat by running its init script with "stop". """
    stop_command = [ "/etc/init.d/tomcat5.5", "stop" ]
    invoke_root_script( stop_command )
| |
| # Shutdown agents |
def shutdown_agents(timeout=40):
    """ Stop metacarta-agents and verify the shutdown was clean and quick.

    Raises Exception if the init script output contains "kill", or if the
    stop took longer than timeout seconds (checked after it has finished).
    """
    began = time.time()
    output = invoke_root_script( [ "/etc/init.d/metacarta-agents", "stop" ] )
    elapsed = time.time() - began
    if "kill" in output:
        raise Exception("metacarta-agents did not shut down cleanly!")
    if elapsed > timeout:
        raise Exception("Shutdown of metacarta-agents took more than %f seconds" % timeout)
| |
| # Call this method to start tomcat |
def start_tomcat():
    """ Start tomcat by running its init script with "start". """
    start_command = [ "/etc/init.d/tomcat5.5", "start" ]
    invoke_root_script( start_command )
| |
| # Call this method to start agents |
def start_agents():
    """ Start metacarta-agents by running its init script with "start". """
    start_command = [ "/etc/init.d/metacarta-agents", "start" ]
    invoke_root_script( start_command )
| |
| # Call this method to restart tomcat |
def restart_tomcat():
    """ Restart tomcat by running its init script with "restart". """
    restart_command = [ "/etc/init.d/tomcat5.5", "restart" ]
    invoke_root_script( restart_command )
| |
| # Call this method to restart agents |
def restart_agents(timeout=120):
    """ Restart metacarta-agents and verify it went cleanly and quickly.

    timeout is the maximum acceptable duration in seconds.  The default of
    120 is the limit that was previously hard-coded (2850's get so maxed out
    that 60 wasn't enough).

    Raises Exception if the init script output contains "kill", or if the
    restart took longer than timeout seconds.  The timeout message now
    reports the real limit; it used to say "one minute" while enforcing 120
    seconds.
    """
    start_time = time.time()
    output = invoke_root_script( [ "/etc/init.d/metacarta-agents", "restart" ] )
    elapsed = time.time() - start_time
    if output.find("kill") != -1:
        raise Exception("metacarta-agents did not shut down cleanly!")
    if elapsed > timeout:
        raise Exception("Restart of metacarta-agents took more than %f seconds" % timeout)
| |
| # Call this method to shut down ingestion |
def stop_leafblower():
    """ Shut down ingestion by delegating to LeafblowerHacks.stop_leafblower(). """
    # Imported here rather than at module level, as in the original.
    from sqatools import LeafblowerHacks as leafblower_hacks
    leafblower_hacks.stop_leafblower()
| |
| # Call this method to start ingestion |
def start_leafblower():
    """ Start ingestion by delegating to LeafblowerHacks.start_leafblower(). """
    # Imported here rather than at module level, as in the original.
    from sqatools import LeafblowerHacks as leafblower_hacks
    leafblower_hacks.start_leafblower()
| |
| # Call this method to set NTLMv1 mode for Share Connector |
def set_shareconnector_ntlmv1_mode():
    """ Switch the Share Connector to NTLMv1 via shareconnector_control. """
    ntlm_command = [ "/usr/bin/shareconnector_control", "set", "ntlmv1" ]
    invoke_root_script( ntlm_command )
| |
| # Call this method to restore default (NTLMv2) mode for Share Connector |
def set_shareconnector_default_mode():
    """ Switch the Share Connector back to its default, NTLMv2, via
    shareconnector_control. """
    ntlm_command = [ "/usr/bin/shareconnector_control", "set", "ntlmv2" ]
    invoke_root_script( ntlm_command )
| |
| # Get the status of the ntlm version switch for Share Connector |
def get_shareconnector_mode():
    """ Return the output of `shareconnector_control status`, as returned by
    invoke_root_script(). """
    status_output = invoke_root_script( [ "/usr/bin/shareconnector_control", "status" ] )
    return status_output
| |
| # This method deregisters a connector |
def deregister_connector( class_name ):
    """Unregister the connector class class_name.

    Used by tests that check what happens after a connector has been
    uninstalled while connections and jobs may remain."""
    unregister_command = [ "/usr/lib/metacarta/crawler-unregisterconnector", class_name ]
    invoke_script( unregister_command )
| |
| # This method reregisters a connector |
def register_connector( class_name, description ):
    """Register (or re-register) the connector class class_name under the
    given description.

    Used by tests that check what happens when an uninstalled connector is
    installed again."""
    register_command = [ "/usr/lib/metacarta/crawler-registerconnector", class_name, description ]
    invoke_script( register_command )
| |
| # This method deregisters an authority |
def deregister_authorityconnector( class_name ):
    """Unregister the authority connector class class_name.

    Used by tests that check what happens after an authority has been
    uninstalled while connections may remain."""
    unregister_command = [ "/usr/lib/metacarta/crawler-unregisterauthority", class_name ]
    invoke_script( unregister_command )
| |
| # This method reregisters an authority connector |
def register_authorityconnector( class_name, description ):
    """Register (or re-register) the authority connector class class_name
    under the given description.

    Used by tests that check what happens when an uninstalled authority is
    installed again."""
    register_command = [ "/usr/lib/metacarta/crawler-registerauthority", class_name, description ]
    invoke_script( register_command )
| |
| # This method checks what the authority webapp returns |
def ask_authority_webapp( user_name ):
    """Query the local authority webapp for user_name's ACLs.

    Returns a pair: (None, response_body) on success, or (error_text, None)
    when the request fails with urllib2.HTTPError.  Other exceptions
    propagate.  user_name is URL-quoted before being put in the query."""
    request_url = "http://localhost:8180/authorityservice/UserACLs?username=%s" % urllib.quote(user_name)
    try:
        response = urllib2.urlopen(request_url)
    except urllib2.HTTPError:
        # sys.exc_info()[1] is the exception instance being handled.
        return (str(sys.exc_info()[1]),None)
    return (None,response.read())
| |
| # Dump configuration to a file |
def export_configuration( filename ):
    """ Dump the crawler configuration to filename using
    backup-crawler-configuration. """
    backup_command = [ "/usr/lib/metacarta/backup-crawler-configuration", filename ]
    invoke_root_script( backup_command )
| |
| # Restore configuration from a file |
def import_configuration( filename ):
    """ Load the crawler configuration from filename using
    restore-crawler-configuration. """
    restore_command = [ "/usr/lib/metacarta/restore-crawler-configuration", filename ]
    invoke_root_script( restore_command )
| |
| # Call this method to prepare for the test. Turns off maintenance script, etc. |
def setup_connector_environment( ):
    """Prepare the appliance for connector tests.

    Currently this only disables the maintenance script (see
    disable_maintenance); further setup hooks belong here."""
    disable_maintenance()
    return None
| |
def disable_maintenance( ):
    """ Turn off scheduled postgres maintenance and wait out a running one.

    Moves the maintenance crontab out of /etc/cron.d to
    "local-metacarta-postgres-maintenance-crontab-copy" (a relative path, so
    it depends on where the mv command runs), then polls every 10 seconds
    until /var/run/metacarta/postgres-maintenance-in-progress can no longer
    be stat'ed.
    """
    park_crontab_command = [ "mv",
                             "/etc/cron.d/metacarta-postgres-maintenance-crontab",
                             "local-metacarta-postgres-maintenance-crontab-copy" ]
    invoke_root_script( park_crontab_command )

    try:
        while True:
            # os.stat raises once the in-progress marker has gone away.
            os.stat( "/var/run/metacarta/postgres-maintenance-in-progress" )
            time.sleep(10)
    except Exception:
        return
| |
| # Call this method to clean up an installed machine. Restores maintenance |
| # script, etc. |
def teardown_connector_environment( ):
    """Return the appliance to its normal connector configuration.

    Currently this only re-enables the maintenance script (see
    enable_maintenance)."""
    enable_maintenance()
    return None
| |
def enable_maintenance( ):
    """ Put the postgres maintenance crontab back in place.

    If /etc/cron.d/metacarta-postgres-maintenance-crontab is missing, the
    copy parked by disable_maintenance() is moved back.  Any remaining
    "local-metacarta-postgres-maintenance-crontab-copy" is then removed with
    rm -f.
    """
    crontab_path = "/etc/cron.d/metacarta-postgres-maintenance-crontab"
    parked_copy = "local-metacarta-postgres-maintenance-crontab-copy"
    if not os.path.exists( crontab_path ):
        invoke_root_script( [ "mv", parked_copy, crontab_path ] )

    invoke_root_script( [ "rm", "-f", parked_copy ] )
| |
| # This method MUST be called in order to use the connector framework. |
| # This also has to be added to the documentation because without it search will not work. |
def enable_connector_framework( ):
    """ Placeholder kept for callers; the body performs no action. """
    return None
| |
| # Set maximum document-size limit |
def set_max_document_size( size_value ):
    """ Write size_value, followed by a newline, to
    /etc/metacarta/ingest_reject_size (replacing any previous content). """
    limit_file = open( "/etc/metacarta/ingest_reject_size", "w" )
    try:
        limit_file.write( str(size_value) + "\n" )
    finally:
        limit_file.close( )
| |
| # Clear maximum document-size limit |
def clear_max_document_size( ):
    """ Remove /etc/metacarta/ingest_reject_size, clearing the size limit.

    A missing file is fine (there is simply no limit to clear).  Any other
    failure to remove it is raised rather than silently ignored, because the
    limit would otherwise stay in force unnoticed.
    """
    try:
        os.remove( "/etc/metacarta/ingest_reject_size" )
    except OSError:
        # sys.exc_info()[1] is the exception instance being handled.
        if sys.exc_info()[1].errno != errno.ENOENT:
            raise
| |
| # Create a serial file, for client certificate creation |
def create_serial_file( serial_file_name ):
    """ Create (or overwrite) serial_file_name containing the single line
    "01", for use when creating client certificates. """
    serial_file = open( serial_file_name, "w" )
    try:
        initial_serial = "01\n"
        serial_file.write( initial_serial )
    finally:
        serial_file.close( )
| |
| # Build a signed, duck-specific client certificate using an existing certificate authority. |
| # The certificate authority is specified by a public key and a private key, and a password. |
def create_client_certificate( ca_public_key_file, ca_private_key_file, serial_file, client_cert_file, password ):
    """ Create a client certificate signed by an existing certificate authority.

    First ssl_control builds a certificate request in "client.req"; the fixed
    subject answers are supplied as the second argument to
    invoke_root_script (presumably fed to the command's stdin -- confirm).
    Then `openssl x509 -req` signs the request with the given CA public and
    private key files and serial file, writing client_cert_file; password
    plus a newline is supplied the same way. """
    request_file = "client.req"
    request_answers = "US\nMassachusetts\nCambridge\nMetaCarta\nEngineering/QA\n\n\n\n"
    invoke_root_script( [ "ssl_control", "create-cert-req", request_file ], request_answers )
    # Now sign the request with openssl.
    signing_command = [ "openssl", "x509", "-req",
                        "-in", request_file,
                        "-CA", ca_public_key_file,
                        "-CAkey", ca_private_key_file,
                        "-CAserial", serial_file,
                        "-out", client_cert_file ]
    invoke_root_script( signing_command, password + "\n" )
| |
| # Clean out everything |
def reset_all( ):
    """ Clean out everything: purge what has been ingested
    (LeafblowerHacks.total_purge), then run the crawler and authorities
    ResetAll commands to get rid of all jobs, etc. """
    from sqatools import LeafblowerHacks as leafblower_hacks
    leafblower_hacks.total_purge()
    for reset_class in ( "com.metacarta.crawler.ResetAll",
                         "com.metacarta.authorities.ResetAll" ):
        invoke_crawler_command( reset_class )
| |
| # Define the standard output connection |
def define_gts_outputconnection( ):
    """ Create the standard output connection: name and description "GTS",
    class GTSConnector, ingesting to http://localhost:7031/HTTPIngest. """
    gts_config = [ "Ingestion URI=http://localhost:7031/HTTPIngest" ]
    define_outputconnection( "GTS", "GTS", "com.metacarta.agents.output.gts.GTSConnector", configparams=gts_config )
| |
| # Delete the standard output connection |
def delete_gts_outputconnection( ):
    """ Remove the standard output connection named "GTS". """
    gts_connection_name = "GTS"
    delete_outputconnection( gts_connection_name )
| |
| # Define an output connection |
def define_outputconnection( connectionname, connectiondescription, connectionclass,
                             poolmax="10", configparams=[] ):
    """Define an output connection.

    The name and description go through process_argument(); the class name
    and str(poolmax) are passed as they are, followed by every entry of
    configparams in order.  The command run is
    com.metacarta.crawler.DefineOutputConnection.  Nothing is returned."""
    fixed_arguments = [ process_argument(connectionname),
                        process_argument(connectiondescription),
                        connectionclass,
                        str(poolmax) ]
    invoke_crawler_command( "com.metacarta.crawler.DefineOutputConnection",
                            argument_list=fixed_arguments + list(configparams) )
| |
| # Define a repository connection |
def define_repositoryconnection( connectionname, connectiondescription, connectionclass, authorityname="",
                                 poolmax="10", configparams=[] ):
    """Define a repository connection.

    The name, description and authority name go through process_argument();
    the class name and str(poolmax) are passed as they are, followed by every
    entry of configparams in order.  The command run is
    com.metacarta.crawler.DefineRepositoryConnection.  Nothing is returned."""
    fixed_arguments = [ process_argument(connectionname),
                        process_argument(connectiondescription),
                        connectionclass,
                        process_argument(authorityname),
                        str(poolmax) ]
    invoke_crawler_command( "com.metacarta.crawler.DefineRepositoryConnection",
                            argument_list=fixed_arguments + list(configparams) )
| |
| # Define an authority connection |
def define_authorityconnection( connectionname, connectiondescription, connectionclass,
                                poolmax="10", configparams=[] ):
    """Define an authority connection.

    The name and description go through process_argument(); the class name
    and str(poolmax) are passed as they are, followed by every entry of
    configparams in order, exactly as define_outputconnection does.  The
    command run is com.metacarta.authorities.DefineAuthorityConnection.
    Nothing is returned.

    Two defects were fixed here: the loop used to call
    process_argument(listparams.append(item)), i.e. process_argument(None),
    for every config parameter; and poolmax was passed without str(), so an
    integer poolmax reached the command line as a non-string."""
    listparams = [ process_argument(connectionname),
                   process_argument(connectiondescription),
                   connectionclass,
                   str(poolmax) ]
    for item in configparams:
        listparams.append(item)
    invoke_crawler_command( "com.metacarta.authorities.DefineAuthorityConnection", argument_list=listparams )
| |
| |
| # Define a job |
def define_job( jobdescription, connectionname, xml, output_connection="GTS", output_xml="", type="specified", startmethod="windowbegin",
                recrawlinterval=0 ):
    """Define a job and return the result of the DefineJob command.

    The description, connection name, output connection name and both XML
    strings go through process_argument().  type and startmethod are passed
    as given.  recrawlinterval is multiplied by 60000 before being sent
    (i.e. treated as minutes and sent as milliseconds).  The remaining
    positional values are fixed literals."""
    recrawl_text = "%d" % (recrawlinterval * 1000*60)
    job_arguments = [ process_argument(jobdescription),
                      process_argument(connectionname),
                      process_argument(output_connection),
                      type,
                      startmethod,
                      "accurate",
                      recrawl_text,
                      "",
                      "",
                      "5",
                      "",
                      process_argument(xml),
                      process_argument(output_xml) ]
    return invoke_crawler_command( "com.metacarta.crawler.DefineJob", argument_list=job_arguments )
| |
| # Start a job |
def start_job( job_id ):
    """Start the job job_id (com.metacarta.crawler.StartJob) and return the
    command's result."""
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.StartJob", argument_list=job_arguments )
| |
| # Wait for job to be complete |
def wait_job_complete( job_id ):
    """Wait for the job job_id to become inactive
    (com.metacarta.crawler.WaitForJobInactive) and return the command's
    result."""
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitForJobInactive", argument_list=job_arguments )
| |
| # Wait for job to be deleted |
def wait_job_deleted( job_id ):
    """Wait for the job job_id to be removed
    (com.metacarta.crawler.WaitForJobDeleted) and return the command's
    result."""
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitForJobDeleted", argument_list=job_arguments )
| |
| # Delete a job |
def delete_job( job_id ):
    """Delete the existing job job_id (com.metacarta.crawler.DeleteJob) and
    return the command's result."""
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteJob", argument_list=job_arguments )
| |
| # Abort a job |
def abort_job( job_id ):
    """Abort the running job job_id (com.metacarta.crawler.AbortJob) and
    return the command's result."""
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.AbortJob", argument_list=job_arguments )
| |
| # Pause a job |
def pause_job( job_id ):
    """ Pause the running job job_id (com.metacarta.crawler.PauseJob) and
    return the command's result. """
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.PauseJob", argument_list=job_arguments )
| |
| # Resume a job |
def resume_job( job_id ):
    """ Resume the job job_id; this runs the crawler's RestartJob command
    (com.metacarta.crawler.RestartJob) and returns its result. """
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.RestartJob", argument_list=job_arguments )
| |
| # Wait for job to pause |
def wait_job_paused( job_id ):
    """ Wait for the job job_id to pause (com.metacarta.crawler.WaitJobPaused)
    and return the command's result. """
    job_arguments = [ process_argument(job_id) ]
    return invoke_crawler_command( "com.metacarta.crawler.WaitJobPaused", argument_list=job_arguments )
| |
| # Delete an output connection |
def delete_outputconnection( connection_name ):
    """Delete the output connection connection_name
    (com.metacarta.crawler.DeleteOutputConnection) and return the command's
    result."""
    connection_arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteOutputConnection", argument_list=connection_arguments )
| |
| # Delete a repository connection |
def delete_repositoryconnection( connection_name ):
    """Delete the repository connection connection_name
    (com.metacarta.crawler.DeleteRepositoryConnection) and return the
    command's result."""
    connection_arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.crawler.DeleteRepositoryConnection", argument_list=connection_arguments )
| |
| # Delete an authority connection |
def delete_authorityconnection( connection_name ):
    """Delete the authority connection connection_name
    (com.metacarta.authorities.DeleteAuthorityConnection) and return the
    command's result."""
    connection_arguments = [ process_argument(connection_name) ]
    return invoke_crawler_command( "com.metacarta.authorities.DeleteAuthorityConnection", argument_list=connection_arguments )
| |
def set_scheduled_time( jobid, minutesFromNow, intervalMinutes=None ):
    """Add a schedule time for a job.

    The trigger moment is the current local time plus minutesFromNow
    minutes.  It is sent to com.metacarta.crawler.AddScheduledTime as
    day-of-month, lower-case month name, year, an hour label such as "12am"
    or "3pm", and the minute, with every day of the week listed.
    intervalMinutes is sent as a decimal string, or as an empty string when
    it is None.  Returns the command's result."""
    month_names = ( "january", "february", "march", "april", "may", "june",
                    "july", "august", "september", "october", "november", "december" )
    # Indexed by 24-hour clock hour (0-23).
    hour_labels = ( "12am", "1am", "2am", "3am", "4am", "5am", "6am", "7am", "8am", "9am", "10am", "11am",
                    "12pm", "1pm", "2pm", "3pm", "4pm", "5pm", "6pm", "7pm", "8pm", "9pm", "10pm", "11pm" )
    trigger = time.localtime( time.time() + minutesFromNow * 60 )
    if intervalMinutes is None:
        interval_text = ""
    else:
        interval_text = "%d" % intervalMinutes
    schedule_arguments = [ jobid,
                           interval_text,
                           "monday,tuesday,wednesday,thursday,friday,saturday,sunday",
                           "%d" % trigger.tm_mday,
                           month_names[trigger.tm_mon - 1],
                           "%d" % trigger.tm_year,
                           hour_labels[trigger.tm_hour],
                           "%d" % trigger.tm_min ]
    return invoke_crawler_command( "com.metacarta.crawler.AddScheduledTime", argument_list=schedule_arguments )
| |
def change_job_doc_spec( jobid, docspec ):
    """Replace the document specification of job jobid with docspec
    (com.metacarta.crawler.ChangeJobDocSpec); both values are passed as
    given.  Returns the command's result."""
    spec_arguments = [ jobid, docspec ]
    return invoke_crawler_command( "com.metacarta.crawler.ChangeJobDocSpec", argument_list=spec_arguments )
| |
def change_auth_spec( authname, spec ):
    """Replace the specification of authority authname with spec
    (com.metacarta.authorities.ChangeAuthSpec); both values are passed as
    given.  Returns the command's result."""
    spec_arguments = [ authname, spec ]
    return invoke_crawler_command( "com.metacarta.authorities.ChangeAuthSpec", argument_list=spec_arguments )
| |
def get_everyone_sid( ):
    """ Return the EveryoneSID value of the currently selected adtools module. """
    active_adtools = initialize_adtools()
    return active_adtools.EveryoneSID
| |
def get_ad_user_sid( user ):
    """ Return get_sid(user) from the currently selected adtools module. """
    active_adtools = initialize_adtools()
    return active_adtools.get_sid( user )
| |
def conditionally_add_ad_user( ad_domain_info, user, password ):
    """ Make sure an AD user exists and that its password never expires.

        The user is looked up with query_entity_in_ldap(); if that raises
        ADToolsException the user is created with add_ad_user().
        The appliance must be joined to the domain before this will work.
    """
    active_adtools = initialize_adtools()
    try:
        active_adtools.query_entity_in_ldap( user )
    except active_adtools.ADToolsException:
        add_ad_user( ad_domain_info, user, password )
    # Users may hang around between runs, so no-expire is forced every time,
    # whether or not the user was just created (passwords were once
    # expiring, and tests should enforce this setting).
    active_adtools.set_no_password_expire( ad_domain_info.domain_controller_ambassador, ad_domain_info.ad_domain, user)
| |
def add_ad_user( ad_domain_info, user, password ):
    """Create user with password in the AD domain described by
    ad_domain_info (its domain_controller_ambassador and ad_domain are
    used); create_user is called with confirm_user=False."""
    active_adtools = initialize_adtools()
    active_adtools.create_user( user, password, ad_domain_info.domain_controller_ambassador, ad_domain_info.ad_domain, confirm_user=False )
| |
def delete_ad_user( ad_domain_info, user ):
    """Delete user through the domain controller ambassador given in
    ad_domain_info."""
    active_adtools = initialize_adtools()
    active_adtools.delete_user(user, ad_domain_info.domain_controller_ambassador)
| |
| def configure_ad( ad_domain_info, join_multidomain = False ): |
# Joins the appliance to Active Directory using the adtools module chosen by
# initialize_adtools(): ad_md_setup_join() when join_multidomain is true,
# join_ad_with_defaults() (admin user = the part of realm_admin before "@")
# otherwise.  The code also adds the host's short name (first label of
# SQAhelpers.reverse_resolve()) to adtools.CompatibilityGroup, and runs
# auth_control to add "connector_framework" authorization for
# search-web-ui and secure-soap-search.
# NOTE(review): the indentation of this block cannot be recovered from the
# text as it stands, so it is unclear whether the add_to_group call and the
# auth_control call belong to the else branch or run for both join modes.
# Confirm against the original file before reformatting this function.
| """Configure appliance to use AD authentication""" |
| adtools = initialize_adtools() |
| |
| # if multi-domain join requested then join all the child domains too |
| # Disable precheck of admin user; this is because this feature is broken in ntlmv2 domains |
| if join_multidomain: |
| adtools.ad_md_setup_join( ad_domain_info, False, precheck_admin_user=False ) |
| else: |
| adtools.join_ad_with_defaults( ad_domain_info.realm_admin_password, admin_user=ad_domain_info.realm_admin.split("@")[0], precheck_admin_user=False ) |
| # Added so the 76 domain works |
| adtools.add_to_group(SQAhelpers.reverse_resolve().split('.')[0], |
| adtools.CompatibilityGroup, |
| ad_domain_info.domain_controller_ambassador) |
| # Set auth_control again to specify connector framework |
| invoke_root_script( [ "auth_control", |
| "authorization", |
| "search-web-ui,secure-soap-search", |
| "--add", |
| "connector_framework" ] ) |
| |
def turn_off_ad( ad_domain_info, leave_multidomain = False):
    """Disable AD authentication by leaving the domain.

    Both modes pass the realm admin password and precheck_admin_user=False
    (the precheck is broken in ntlmv2 domains).  The multi-domain leave also
    passes disable_machine_acct=False, delete_machine_acct=True and
    already_left=0; the single-domain leave instead passes admin_user, the
    part of realm_admin before "@"."""
    active_adtools = initialize_adtools()
    leave_options = { "realm_admin_password": ad_domain_info.realm_admin_password,
                      "precheck_admin_user": False }
    if leave_multidomain:
        leave_options["disable_machine_acct"] = False
        leave_options["delete_machine_acct"] = True
        leave_options["already_left"] = 0
    else:
        leave_options["admin_user"] = ad_domain_info.realm_admin.split("@")[0]
    active_adtools.leave_ad( **leave_options )
| |
def create_crawler_user( user, password ):
    """Add user:password to the ingest_users basic-auth database, for use
    by the crawler UI.  Does nothing when user is None."""
    if user is not None:
        credentials = user+":"+password
        invoke_root_script( [ "basic_auth_control", "add", "ingest_users", credentials ] )
| |
def delete_crawler_user( user ):
    """Remove user from the ingest_users basic-auth database.  Does nothing
    when user is None."""
    if user is not None:
        invoke_root_script( [ "basic_auth_control", "remove", "ingest_users", user ] )
| |
def add_basic_auth_user( user, password ):
    """Add user:password to the my_ui_users basic-auth database.

    "yes" plus a newline is given as the second argument to
    invoke_root_script (presumably answering a confirmation prompt --
    confirm)."""
    credentials = user+":"+password
    add_command = [ "basic_auth_control", "add", "my_ui_users", credentials ]
    invoke_root_script( add_command, "yes\n" )
| |
def delete_basic_auth_user( user ):
    """Remove user from the my_ui_users basic-auth database.

    "yes" plus a newline is given as the second argument to
    invoke_root_script (presumably answering a confirmation prompt --
    confirm)."""
    remove_command = [ "basic_auth_control", "remove", "my_ui_users", user ]
    invoke_root_script( remove_command, "yes\n" )
| |
def configure_basic_auth( ):
    """Make basic auth the authenticator, then wait for apache.

    Creates the my_ui_users database, selects basic_auth with that database
    for search-web-ui, soap-search and secure-soap-search, adds
    connector_framework authorization for the same services, and finally
    calls wait_for_apache()."""
    services = "search-web-ui,soap-search,secure-soap-search"
    setup_commands = (
        [ "basic_auth_control", "createdb", "my_ui_users" ],
        [ "auth_control", "auth", services, "basic_auth", "my_ui_users" ],
        [ "auth_control", "authorization", services, "--add", "connector_framework" ],
    )
    for setup_command in setup_commands:
        invoke_root_script( setup_command )
    wait_for_apache( )
| |
def turn_off_basic_auth( ):
    """Undo configure_basic_auth: point secure-soap-search back at the
    ui_users database, remove authentication from the other services, wait for
    apache, and destroy the my_ui_users database."""
    restore_secure_soap = [ "auth_control", "auth", "secure-soap-search",
                            "basic_auth", "ui_users" ]
    disable_others = [ "auth_control", "auth", "search-web-ui,soap-search", "none" ]
    destroy_database = [ "basic_auth_control", "destroydb", "my_ui_users", "--force" ]

    invoke_root_script( restore_secure_soap )
    invoke_root_script( disable_others )
    wait_for_apache( )
    # The database is only destroyed once apache is answering again;
    # "yes" is fed to the command on stdin.
    invoke_root_script( destroy_database, "yes\n" )
| |
def invoke_crawler_command( classname, argument_list=[], input=None, additional_switches=[], additional_classpath=None ):
    """ Run a crawler java class through crawler-testing-package/executejava.

    The metacarta-pullagent-test.jar is always placed first on the extra
    classpath, followed by additional_classpath when one is given.  The
    command is handed to invoke_script, whose output is returned.
    """
    classpath = "crawler-testing-package/metacarta-pullagent-test.jar"
    if additional_classpath:
        classpath = classpath + ":%s" % additional_classpath

    # Layout: executejava -classpath <cp> [switches...] <classname> [arguments...]
    command = [ "crawler-testing-package/executejava", "-classpath", classpath ]
    command = command + additional_switches
    command = command + [ classname ]
    command = command + argument_list
    return invoke_script( command, input=input )
| |
def invoke_script( argumentlist, input=None, stdin_encoding=None, stdout_encoding="utf-8", stderr_encoding=None ):
    """Run a command as the tomcat55 user (via sudo) and return its stdout.

    argumentlist    -- program name followed by its arguments.  None entries
                       are replaced by "" (with a warning and a stack trace);
                       the caller's list itself is left untouched.
    input           -- optional text sent to the program's stdin.
    stdin_encoding  -- encoding used for input; the default codec when None.
    stdout_encoding -- encoding used to decode stdout; default codec when None.
    stderr_encoding -- encoding used to decode stderr; default codec when None.

    Raises Exception, carrying the decoded stderr text, when the program exits
    with a non-zero status.
    """
    # For some reason the argument list can have null entries; map them to "".
    # This is done on a copy so that the caller's list is never modified.
    sanitized = []
    for argument in argumentlist:
        if argument is None:
            print("Warning: None argument in invoke_script, printing stack trace")
            traceback.print_stack()
            argument = ""
        sanitized.append(argument)

    programname = sanitized[0]

    fullargumentlist = [ "sudo", "-u", "tomcat55" ] + sanitized
    # Echo the command that is about to run
    print(" ".join([quote_escape(i) for i in fullargumentlist]))

    program = subprocess.Popen(fullargumentlist,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    if input is not None:
        if stdin_encoding is not None:
            input = input.encode(stdin_encoding)
        else:
            input = input.encode()
    (outputtext, errortext) = program.communicate(input)
    retcode = program.wait()

    if stderr_encoding is not None:
        errortext = errortext.decode(stderr_encoding)
    else:
        errortext = errortext.decode()
    if stdout_encoding is not None:
        outputtext = outputtext.decode(stdout_encoding)
    else:
        outputtext = outputtext.decode()

    if retcode != 0:
        raise Exception((u"Error response from %s: %d, message = [" % (programname,retcode)) + errortext + u"]")
    # The log line is written as UTF-8 encoded bytes so that non-ASCII program
    # output can be printed.
    print((u"Program %s output = '" % (programname) + outputtext + u"'").encode("utf-8"))
    return outputtext
| |
def invoke_root_script( argumentlist, input=None, allow_errors=False ):
    """Run a command directly and return its decoded stdout.

    argumentlist -- program name followed by its arguments.
    input        -- optional text sent to the program's stdin.
    allow_errors -- when False (the default) a non-zero exit status raises
                    Exception carrying the decoded stderr text.
    """
    programname = argumentlist[0]

    # Build the echoed form of the command: every word quoted and escaped,
    # each followed by a single space.
    echoed_words = [ ]
    if sqautils.check_username_configured() != "root":
        echoed_words.append("sudo")
    for word in argumentlist:
        echoed_words.append(word)
    actualcommand = "".join([quote_escape(word) + " " for word in echoed_words])
    print(actualcommand)

    # NOTE(review): the echoed command carries a "sudo" prefix for non-root
    # users, but the process below is launched from argumentlist exactly as
    # given, without sudo -- confirm whether that is intended.
    program = subprocess.Popen(argumentlist,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdin_data = input
    if stdin_data is not None:
        stdin_data = stdin_data.encode()
    (raw_output, raw_errors) = program.communicate(stdin_data)
    retcode = program.wait()
    outputtext = raw_output.decode()
    errortext = raw_errors.decode()

    if retcode != 0 and allow_errors == False:
        raise Exception((u"Error response from %s: %d, message = [" % (programname,retcode)) + errortext + u"]")
    # The log line is written as UTF-8 encoded bytes
    print((u"Program %s output = '" % (programname) + outputtext + u"'").encode("utf-8"))
    return outputtext
| |
def process_argument( mystring ):
    """Map None to the empty string; return any other value unchanged."""
    if mystring is not None:
        return mystring
    return ''
| |
def quote_escape( mystring ):
    """Wrap a string in double quotes, backslash-escaping every double quote,
    backslash and dollar sign inside it.  None yields an empty quoted string."""
    if mystring is None:
        return '""'
    # '*' is deliberately not escaped: escaping it led to fields containing
    # '\*' instead of '*' when adding memex documents to the repository.
    needs_backslash = ('"', '\\', '$')
    pieces = []
    for character in mystring:
        if character in needs_backslash:
            pieces.append('\\' + character)
        else:
            pieces.append(character)
    return '"' + ''.join(pieces) + '"'
| |
| # Waits until apache is up -- stolen from active_directory_tool |
| def wait_for_apache( ): |
| """Wait for apache to respond to port 80""" |
| http_port = 80 |
| for x in range(0,100): |
| try: |
| try: |
| signal.alarm(15) |
| s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
| s.connect(("localhost", http_port)) |
| finally: |
| signal.alarm(0) |
| except IOError, e: |
| if e.errno == errno.EINTR: |
| raise Exception("timeout talking to port %s" % http_port) |
| else: |
| raise |
| except socket.error, e: |
| if re.search("Connection refused", str(e)): |
| time.sleep(1) |
| continue |
| else: |
| raise |
| return |
| raise Exception("Unable to connect to port %d. Check apache error log." % \ |
| http_port) |
| |
def regexp_encode( thestring ):
    """Encode a string so that regexp will match it exactly.

    Every regular expression metacharacter in thestring is prefixed with a
    backslash; all other characters are copied through unchanged.  Returns
    the encoded string.
    """
    # Characters with a special meaning in a pattern.  The original
    # implementation only covered '.', '?' and '\', so names containing e.g.
    # '+', '*' or parentheses were not matched literally.
    metacharacters = ".^$*+?{}[]\\|()"
    pieces = []
    for thechar in thestring:
        if thechar in metacharacters:
            pieces.append("\\" + thechar)
        else:
            pieces.append(thechar)
    return "".join(pieces)
| |
def search_exists_check( keywords, collectionname, example, win_host=None, username=None, password=None ):
    """Check that at least one search result matches the regular expression
    given in example.

    The search is attempted up to four times, one second apart, because the
    SHN and HN may get restarted out from under the query.  The price is that
    a genuine failure takes about four seconds longer to report, and other
    timing related issues may not be caught.

    Raises Exception when no attempt returns a matching document.
    """
    attempts_left = 4
    while attempts_left > 0:
        attempts_left -= 1
        matches = TestDocs.search_documents_user(keywords,
                                                 collection=collectionname,
                                                 method="soapheadnode",
                                                 username=username,
                                                 password=password,
                                                 win_host=win_host)

        # matches are url-encoded, full file names (including /root), so the
        # pattern has to be written against url-encoded names.
        pattern = re.compile(example)
        for candidate in matches:
            # The pattern is anchored at the start of each result
            if pattern.match( candidate ) is not None:
                return
        time.sleep(1)

    raise Exception("Expected document %s not returned when searching" % example)
| |
def search_nonexists_check( keywords, collectionname, example, win_host=None, username=None, password=None ):
    """Check that no search result matches the regular expression given in
    example.

    Raises Exception, listing the actual results, when a match is found.
    """
    # The SHN and HN may get restarted out from under the query, in which
    # case this check can succeed even though the underlying query failed.
    # It is performed once only; repeating it would give more confidence
    # that the matching document really is absent.
    results = TestDocs.search_documents_user(keywords,
                                             collection=collectionname,
                                             method="soapheadnode",
                                             username=username,
                                             password=password,
                                             win_host=win_host)

    # results are url-encoded, full file names (including /root), so the
    # pattern has to be written against url-encoded names.
    pattern = re.compile(example)
    for candidate in results:
        # The pattern is anchored at the start of each result
        if pattern.match( candidate ) is None:
            continue
        raise Exception("Unexpected document %s was returned when searching: Actual results =[%s]" % (example,"".join(results)))
| |
def search_check( keywords, collectionname, docs, win_host=None, username=None, password=None ):
    """Make sure the set of documents returned for a search against the
    specified keywords is exactly the specified docs.

    The search is attempted up to four times.  An unexpected document is
    tolerated once (on the first attempt only) and triggers a retry after two
    seconds; a missing document triggers a retry after one second.

    Raises Exception when an unexpected document shows up after the one
    allowed retry, or when an expected document is still missing after the
    last attempt.
    """
    # Reingested documents can cause duplicate documents because of a race on
    # new slice start, so one retry is allowed when a document that should
    # not be there is discovered.
    retry_unexpected = True

    # Keep looping until we find what we are looking for.  This tends to
    # cause false successes for an empty expected set, and slower failures
    # when the test really fails.
    for counter in range(4):
        matches = TestDocs.search_documents_user(keywords,
                                                 collection=collectionname,
                                                 method="soapheadnode",
                                                 username=username,
                                                 password=password,
                                                 win_host=win_host)

        # Stop at the first returned document that is not expected
        saw_unexpected = False
        for founddoc in matches:
            if founddoc not in docs:
                saw_unexpected = True
                break

        if saw_unexpected:
            if not retry_unexpected:
                raise Exception("Unexpected document %s returned when searching; results =[%s]" % (founddoc,"".join(matches)))
            print("Retrying search because unexpected document %s was present." % (founddoc))
            # Only one retry is allowed, so clear the flag; sleep a couple of
            # seconds to give the old slice time to shut down, then search
            # again.
            retry_unexpected = False
            time.sleep(2.0)
            continue
        # No unexpected documents: the one-time retry allowance is used up
        retry_unexpected = False

        # matches are url-encoded, full file names (including /root), so the
        # expected names have to be url-encoded as well.
        all_present = True
        for quotedoc in docs:
            if quotedoc not in matches:
                all_present = False
                break
        if all_present:
            return
        time.sleep(1)

    raise Exception("Expected document %s not returned when searching; actual results =[%s]" % (quotedoc,"".join(matches)))
| |
def wait_for_ingest( timeout=3600 ):
    """Delegate to docs.wait_for_ingestion, passing the timeout through."""
    docs.wait_for_ingestion( timeout=timeout )
| |
| # Create a file system repository connection via the UI |
def define_filesystem_repository_connection_ui( username, password, connection_name, connection_description,
                                                throttles=None,
                                                max_connections=None ):
    """ Create a file system repository connection by driving the crawler UI.

    The throttles argument is an array of tuples.  Each tuple represents a
    throttle and is of the form (regexp,description,avg-fetch-rate).
    max_connections, when given, is entered in the "maxconnections" field.

    Raises Exception when the saved page does not report connection_name.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> connection list -> "add a connection" editor
    browser.find_window("").find_link("List repository connections").click( )
    browser.find_window("").find_link("Add a connection").click( )

    # "Name" tab
    window = browser.find_window("")
    form = window.find_form("editconnection")
    name_field = form.find_textarea("connname")
    description_field = form.find_textarea("description")
    name_field.set_value( connection_name )
    description_field.set_value( connection_description )

    # "Type" tab: choose the file system connector, then continue
    window.find_link("Type tab").click()
    window = browser.find_window("")
    form = window.find_form("editconnection")
    form.find_selectbox("classname").select_value( "com.metacarta.crawler.connectors.filesystem.FileConnector" )
    window.find_button("Continue to next page").click( )
    window = browser.find_window("")

    # "Throttling" tab
    window.find_link("Throttling tab").click()
    window = browser.find_window("")
    form = window.find_form("editconnection")

    if throttles is not None:
        for regexp, description, rate in throttles:
            # Fill in one throttle and add it; the page reloads afterwards
            regexp_field = form.find_textarea("throttle")
            desc_field = form.find_textarea("throttledesc")
            value_field = form.find_textarea("throttlevalue")
            regexp_field.set_value( regexp )
            if description is not None:
                desc_field.set_value( description )
            value_field.set_value( str(rate) )
            window.find_button("Add throttle").click()
            window = browser.find_window("")
            form = window.find_form("editconnection")

    if max_connections is not None:
        form.find_textarea("maxconnections").set_value( str(max_connections) )

    # Save, then confirm that the resulting page names the new connection
    window.find_button("Save this connection").click( )
    window = browser.find_window("")
    saved_name = window.find_match("<!--connection=(.*)-->",1)
    if saved_name != connection_name:
        raise Exception("Created connection doesn't match")
| |
| # Delete a file system repository connection via the UI |
def delete_filesystem_repository_connection_ui( username, password, connection_name ):
    """Delete a file system repository connection via the UI; this simply
    forwards to delete_repository_connection_ui."""
    delete_repository_connection_ui( username,
                                     password,
                                     connection_name )
| |
| # Define a standard job using the UI |
def define_filesystem_job_ui( username,
                              password,
                              job_name,
                              connection_name,
                              startpoints_and_matches,
                              collection_name=None,
                              document_template=None,
                              hop_filters=None,
                              hop_mode=None,
                              type="specified",
                              startmethod="windowbegin",
                              recrawlinterval=0 ):
    """Define a file system job by driving the crawler UI; returns the job id
    reported by the saved page.

    connection_name is the name of the filesystem connection.
    startpoints_and_matches is an array, each element of which is a tuple
    consisting of the start point path and an array of match specifications.
    Each match specification is a tuple consisting of a string (either
    "include" or "exclude"), a type (either "file" or "directory"), and a
    match value (such as "*.txt").
    Legal values for type are: "specified" or "continuous".
    Legal values for start method are: "windowbegin", "windowinside", or
    "disable".
    Hop filters are an array of tuples, each one ( filter_name, filter_value ).
    Hop mode has the legal values "accurate", "nodelete", or "neverdelete".

    Raises Exception for an illegal startmethod, type or hop_mode value.
    """
    # Form values the UI expects for each symbolic setting
    start_method_codes = { "windowbegin": 0, "windowinside": 1, "disable": 2 }
    schedule_type_codes = { "specified": 1, "continuous": 0 }
    hop_mode_codes = { "accurate": 0, "nodelete": 1, "neverdelete": 2 }

    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job list -> "add a job" editor
    browser.find_window("").find_link("List jobs").click( )
    browser.find_window("").find_link("Add a job").click( )

    # "Name" tab: the job name goes into the description field
    window = browser.find_window("")
    form = window.find_form("editjob")
    form.find_textarea("description").set_value( job_name )

    # "Connection" tab
    window.find_link("Connection tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    if startmethod not in start_method_codes:
        raise Exception("Illegal start method value: '%s'" % startmethod )
    form.find_selectbox("startmethod").select_value( str(start_method_codes[startmethod]) )
    form.find_selectbox("connectionname").select_value( connection_name )
    # The output connection is always "GTS"
    form.find_selectbox("outputname").select_value( "GTS" )
    window.find_button("Continue to next screen").click( )
    window = browser.find_window("")
    form = window.find_form("editjob")

    # "Scheduling" tab
    window.find_link("Scheduling tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    if type not in schedule_type_codes:
        raise Exception("Illegal type value: '%s'" % type )
    form.find_selectbox("scheduletype").select_value( str(schedule_type_codes[type]) )
    # The recrawl interval is only entered for continuous jobs
    if type == "continuous":
        form.find_textarea("recrawlinterval").set_value( str(recrawlinterval) )

    # "Hop Filters" tab
    window.find_link("Hop Filters tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    if hop_filters is not None:
        for filter_name, filter_value in hop_filters:
            # A None filter value is entered as an empty field
            if filter_value is None:
                filter_value = ""
            form.find_textarea("hopmax_"+filter_name).set_value(str(filter_value))
    if hop_mode is not None:
        if hop_mode not in hop_mode_codes:
            raise Exception("Illegal mode %s" % hop_mode)
        form.find_radiobutton("hopcountmode",str(hop_mode_codes[hop_mode])).select()

    # "Collections" tab
    window.find_link("Collections tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    if collection_name is not None:
        form.find_textarea("gts_collectionname").set_value( collection_name )

    # "Template" tab
    window.find_link("Template tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    if document_template is not None:
        form.find_textarea("gts_documenttemplate").set_value( document_template )

    # "Paths" tab: add every start point, then that start point's matches
    window.find_link("Paths tab").click()
    window = browser.find_window("")
    form = window.find_form("editjob")
    for path_index, (path, matches) in enumerate(startpoints_and_matches):
        form.find_textarea("specpath").set_value( path )
        window.find_button("Add new path").click( )
        window = browser.find_window("")
        form = window.find_form("editjob")
        for include_exclude, match_type, match_value in matches:
            assert include_exclude in ("include", "exclude")
            assert match_type in ("file", "directory")
            # The form fields of a path are numbered by the path's position
            form.find_selectbox("specflavor_%d" % path_index).select_value(include_exclude)
            form.find_selectbox("spectype_%d" % path_index).select_value(match_type)
            form.find_textarea("specmatch_%d" % path_index).set_value(match_value)
            window.find_button("Add new match for path #%d" % path_index).click( )
            window = browser.find_window("")
            form = window.find_form("editjob")

    # Finally, save the job and read its id out of the resulting page
    window.find_button("Save this job").click( )
    return browser.find_window("").find_match("<!--jobid=(.*)-->",1)
| |
| # Delete a job using the UI |
def delete_job_ui( username,
                   password,
                   jobid ):
    """Delete the specified job using the UI.

    No attempt is made to ensure that the job actually can be deleted; if it
    is running then this operation will fail.

    Raises Exception when the "List jobs" link or the job's delete link
    cannot be found.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job list
    list_link = browser.find_window("").find_link("List jobs")
    if list_link is None:
        raise Exception("Can't find list link for jobs")
    list_link.click( )

    # Job list -> delete this job
    delete_link = browser.find_window("").find_link("Delete job "+jobid)
    if delete_link is None:
        raise Exception("Can't find delete link for job %s" % jobid)
    delete_link.click( )

    # The refreshed list must no longer offer a view link for the job
    browser.find_window("").check_no_match("View job "+jobid)
| |
| # Start a job using the UI |
def start_job_ui( username,
                  password,
                  jobid ):
    """ Kick off a job through the UI.

    This does not confirm that the job has started, due to the difficulty of
    getting the timing right; it only clicks the job's start link.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job management page
    browser.find_window("").find_link("Manage jobs").click( )

    # Job management page -> start link of this job
    browser.find_window("").find_link("Start job "+jobid).click( )
| |
| # View repository connection via the UI (and check for 'working' status) |
def view_repository_connection_ui( username, password, connection_name, match_string="Connection working" ):
    """Open the view page of a repository connection through the UI and look
    for match_string (by default the 'working' status text) on it."""
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> repository connection list
    browser.find_window("").find_link("List repository connections").click( )

    # Connection list -> view page of this connection
    browser.find_window("").find_link("View "+connection_name).click( )

    # Check the connection status shown on the view page
    browser.find_window("").find_match(match_string)
| |
| # Delete a repository connection via the UI |
def delete_repository_connection_ui( username, password, connection_name ):
    """Delete a repository connection by clicking its delete link in the UI."""
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> repository connection list
    browser.find_window("").find_link("List repository connections").click( )

    # Connection list -> delete link of this connection
    browser.find_window("").find_link("Delete "+connection_name).click( )

    # Only checks that the connection list came back, i.e. that the result
    # is not an error screen.
    browser.find_window("").find_match("List of Repository Connections")
| |
| # Delete an authority connection via the UI |
def delete_authority_connection_ui( username, password, connection_name ):
    """Delete an authority connection by clicking its delete link in the UI."""
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> authority list
    browser.find_window("").find_link("List authorities").click( )

    # Authority list -> delete link of this connection
    browser.find_window("").find_link("Delete "+connection_name).click( )

    # Only checks that the authority list came back, i.e. that the result is
    # not an error screen.
    browser.find_window("").find_match("List of Authority Connections")
| |
| |
| # Pause a job using the UI |
def pause_job_ui( username,
                  password,
                  jobid ):
    """ Pause a job through the UI.

    This does not confirm that the job has paused, due to the difficulty of
    getting the timing right; it only clicks the job's pause link.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job management page
    browser.find_window("").find_link("Manage jobs").click( )

    # Job management page -> pause link of this job
    browser.find_window("").find_link("Pause job "+jobid).click( )
| |
| # Resume a job using the UI |
def resume_job_ui( username,
                   password,
                   jobid ):
    """ Resume a job through the UI.

    This does not confirm that the job has resumed, due to the difficulty of
    getting the timing right; it only clicks the job's resume link.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job management page
    browser.find_window("").find_link("Manage jobs").click( )

    # Job management page -> resume link of this job
    browser.find_window("").find_link("Resume job "+jobid).click( )
| |
| # Get a job's status string from the UI |
def get_job_status_ui( username, password, jobid ):
    """ Return the status text shown for the given job id on the UI's job
    management page.
    """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job management page
    browser.find_window("").find_link("Manage jobs").click( )

    # The cell tagged with the job id is followed by the cell whose text is
    # returned (group 2 of the pattern).
    status_pattern = '<td class="columncell"><!--jobid=%s-->([^<]*)</td><td class="columncell">([^<]*)</td>' % jobid
    return browser.find_window("").find_match( status_pattern, group=2)
| |
def find_job_by_name_ui( username, password, jobname, connectionname ):
    """ Look up a job by its name and connection in the UI and return the job
    id found on the job management page. """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job list; look there for a table row holding both the job
    # name and the connection name.
    browser.find_window("").find_link("List jobs").click( )
    job_list = browser.find_window("")
    row_pattern = "<tr[^>]*>.*<td[^>]*>%s</td[^>]*>.*<td[^>]*>%s</td[^>]*>.*</tr[^>]*>" % (jobname,connectionname)
    job_list.find_match_no_newlines(row_pattern)

    # Job list -> job management page, where the id precedes the job name.
    # The no-newlines matcher is used because the markup to match can span
    # several lines.
    job_list.find_link("Manage jobs").click( )
    id_pattern = "<!--jobid=([0123456789]*)-->%s<" % (jobname)
    return browser.find_window("").find_match_no_newlines(id_pattern, group=1)
| |
def find_connection_by_name_ui( username, password, connectionname ):
    """ Look in the UI's repository connection list for the view link of the
    named connection. """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> repository connection list
    browser.find_window("").find_link("List repository connections").click( )

    # Look for the view link that carries the connection's name
    browser.find_window("").find_link("View %s" % connectionname)
| |
def find_status_by_job_name_ui( username, password, jobname, jobstatus, totalqueue, activequeue, remainingqueue ):
    """ Look on the job management page for a row that shows the given job
    name, status and queue values, and return the job id from that row. """
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Main page -> job management page
    browser.find_window("").find_link("Manage jobs").click( )

    # One table row: the cell tagged with the job id and holding the job
    # name, followed (not necessarily directly) by cells holding the status
    # and the three queue values.  The no-newlines matcher is used because a
    # row can span several lines.
    row_pattern = ( "<tr[^>]*>"
                    ".*<td[^>]*><!--jobid=([0123456789]*)-->%s</td[^>]*>"
                    ".*<td[^>]*>%s</td[^>]*>"
                    ".*<td[^>]*>%s</td[^>]*>"
                    ".*<td[^>]*>%s</td[^>]*>"
                    ".*<td[^>]*>%s</td[^>]*>"
                    ".*</tr[^>]*>" ) % (jobname,jobstatus,totalqueue,activequeue,remainingqueue)
    return browser.find_window("").find_match_no_newlines(row_pattern, group=1)
| |
| # Parse a date of the form 03-25-2008 |
def parse_date( date_value ):
    """Split a date of the form 03-25-2008 (month-day-year) and return it as
    the integer tuple (year, month, day)."""
    pieces = date_value.split("-")
    month_text, day_text, year_text = pieces[0], pieces[1], pieces[2]
    return (int(year_text), int(month_text), int(day_text))
| |
| # Parse a time of the form 10:11:20.523 |
def parse_time( time_value ):
    """Split a time of the form 10:11:20.523 and return it as the tuple
    (hour, minute, second); hour and minute are ints, second is a float."""
    pieces = time_value.split(":")
    hour_text, minute_text, second_text = pieces[0], pieces[1], pieces[2]
    return (int(hour_text), int(minute_text), float(second_text))
| |
| # Parse a time value from a report into seconds since epoch. |
| # Time values look something like this: 03-25-2008 10:11:20.523 |
| # Note there is no timezone, so the value must be parsed in terms of the appliance's |
| # default timezone. |
def parse_date_time( time_value ):
    """Convert a report timestamp such as "03-25-2008 10:11:20.523" into
    seconds since the epoch.

    The value carries no timezone, so it is interpreted by time.mktime as
    local time.  The fractional part of the seconds is dropped.
    """
    pieces = time_value.split(" ")
    date_part = parse_date(pieces[0])
    hour, minute, second = parse_time(pieces[1])
    # Weekday, day of year and DST flag are passed as -1
    return time.mktime(date_part + (hour, minute, int(second), -1, -1, -1))
| |
| # Helper method: Decode html found in result data. This removes nobr's, a tags, and converts br's to newlines |
def decode_html( raw_data ):
    """Reduce the html of a result cell to plain text.

    Every tag whose text starts with "<a" is cut out, nobr tags and "</a>"
    are removed, "<br/>" becomes a newline, and surrounding whitespace is
    stripped.

    Raises Exception when a "<a" has no closing ">" after it.
    """
    kept = []
    cursor = 0
    tag_start = raw_data.find("<a",cursor)
    while tag_start != -1:
        # Keep the text in front of the tag, then skip past the tag's ">"
        kept.append(raw_data[cursor:tag_start])
        tag_end = raw_data.find(">",tag_start)
        if tag_end == -1:
            raise Exception("Can't find end of '<a' tag")
        cursor = tag_end + 1
        tag_start = raw_data.find("<a",cursor)
    kept.append(raw_data[cursor:])

    text = "".join(kept)
    substitutions = ( ("<nobr>", ""),
                      ("</nobr>", ""),
                      ("<nobr/>", ""),
                      ("<br/>", "\n"),
                      ("</a>", "") )
    for markup, replacement in substitutions:
        text = text.replace(markup, replacement)
    return text.strip()
| |
| # Helper method to parse a single table row. Returns data as an array. |
def process_table_row( data, start_position ):
    """Parse a single table row and return the decoded text of its <td>
    cells as an array.

    The row is taken to end at the first "</tr>" at or after start_position;
    only cells that open before that point are collected.

    Raises Exception when the row end, a cell's ">" or a cell's "</td>" is
    missing, or when a cell's "</td>" lies beyond the end of the row.
    """
    row_end = data.find("</tr>",start_position)
    if row_end == -1:
        raise Exception( "Couldn't find end of row in '%s' at position %d" % (data,start_position) )

    cells = []
    cursor = start_position
    cell_open = data.find("<td",cursor)
    while cell_open != -1 and cell_open <= row_end:
        # The cell content starts right after the ">" closing the <td ...> tag
        content_start = data.find(">",cell_open)
        if content_start == -1:
            raise Exception("Missing td endbracket")
        content_start += 1
        content_end = data.find("</td>",content_start)
        if content_end == -1:
            raise Exception("Missing </td> tag")
        if content_end > row_end:
            raise Exception("Found </tr> before </td>")
        cells.append( decode_html(data[content_start:content_end]) )
        # Resume the scan at the final ">" of this cell's "</td>"
        cursor = content_end + 4
        cell_open = data.find("<td",cursor)
    return cells
| |
| # Helper method to parse a single result row. Returns data as a dictionary, with one element for each <td> cell |
def process_data_row( data, headers, start_position ):
    """ Parse one result row into a dictionary keyed by header.

    The row's cells are obtained from process_table_row(); cell i is stored under
    headers[i].  Raises Exception when the number of cells differs from the number
    of headers.
    """
    cells = process_table_row( data, start_position )
    if len(cells) == len(headers):
        # Pair each header with the cell in the same column
        return dict(zip(headers, cells))
    raise Exception( "Row data does not agree with header data in '%s' at position %d" % (data,start_position))
| |
| # Helper method to parse an entire resultset. Returns result as an array of dictionaries. |
def parse_tableresult( data ):
    """ Parse a whole report table into an array of dictionaries.

    The header row is the first '<tr class="headerrow">'; its cells become the dictionary
    keys.  Data rows are then consumed alternately: the next "evendatarow", then the next
    "odddatarow", and so on, stopping as soon as the expected kind of row is not found.
    Raises Exception if there is no header row.
    """
    header_position = data.find('<tr class="headerrow">')
    if header_position == -1:
        raise Exception( "Missing report header row in data '%s'" % data )
    headers = process_table_row( data, header_position )

    # Rows alternate between these two markers, starting with the even one
    row_markers = ( "<tr class=\"evendatarow\">", "<tr class=\"odddatarow\">" )
    which_marker = 0
    search_from = header_position
    rows = []
    while True:
        row_position = data.find(row_markers[which_marker], search_from)
        if row_position == -1:
            break
        rows.append( process_data_row( data, headers, row_position ) )
        search_from = row_position
        which_marker = 1 - which_marker

    return rows
| |
| # Sanity check a report page |
def report_sanity_check( window, more):
    """ Check the paging controls and row range of a report page.

    window is the page to inspect; it must provide find_button(label) and
    find_match(regexp, group).  A button is treated as absent when find_button() raises.
    more says whether further result pages are expected.

    Raises Exception when:
      - a "Previous page" button is present;
      - a "Next page" button is present although more is false, or absent although more is true;
      - the displayed row range does not start at "0";
      - the displayed row range does not end with "END" although more is false.
    Returns nothing on success.
    """
    def button_present(label):
        # Only ordinary errors mean "button not found"; KeyboardInterrupt and SystemExit
        # must not be swallowed the way a bare except would.
        try:
            window.find_button(label)
        except Exception:
            return False
        return True

    # This is always expected to be the first page, so there is nothing to go back to
    if button_present("Previous page"):
        raise Exception("Previous page button showed up when it shouldn't have")

    # The "Next page" button must appear exactly when more results are expected
    next_present = button_present("Next page")
    if next_present and not more:
        raise Exception("Next page button showed up when it shouldn't have")
    if not next_present and more:
        raise Exception("Next page button didn't show up but it should have")

    row_range = window.find_match("<td class=\"description\"><nobr>Rows:</nobr></td><td class=\"value\">([0-9]*-([0-9]*|END))</td>", 1)

    values = row_range.split("-")
    if values[0] != "0":
        raise Exception("Row range should have begun at zero - instead saw %s" % values[0])
    if values[1] != "END" and not more:
        raise Exception("Row range should have ended with END - instead saw %s" % values[1])
| |
| # Format activity list in a manner acceptable to the API methods |
def format_activity_list(activities_list):
    """ Join the activity names into the single comma-separated argument the API scripts take """
    separator = ","
    return separator.join(activities_list)
| |
| # Format time value in a manner acceptable to the API methods |
def format_time(ms_since_epoch):
    """ Render a time in ms since epoch as the string argument the API scripts take.
    None is rendered as the empty string; anything else is rendered with str().
    """
    if ms_since_epoch is None:
        return ""
    # An empty str() result is returned as-is, which is the same "" the None case yields
    return str(ms_since_epoch)
| |
| # Format window size in minutes for the API methods |
def format_window_size(minutes):
    """ Render a window size in minutes as a string argument.
    None, or a value whose str() is empty, falls back to "5".
    """
    fallback = "5"
    if minutes is None:
        return fallback
    rendered = str(minutes)
    if rendered == "":
        return fallback
    return rendered
| |
| # Split a comma-delimited line into multiple columns, unescaping in the process |
def split_api_result_line(result_line):
    """ Split a comma-delimited line into multiple columns, unescaping in the process.

    A backslash escapes the character after it: that character is copied literally into
    the current column, so "\\," yields a comma inside a column and "\\\\" yields a single
    backslash.  An unescaped comma ends the current column.  A lone backslash at the very
    end of the line has nothing to escape and is kept as a literal backslash.
    Returns the array of column strings; an empty line yields one empty column.
    """
    columns = []
    current_chars = []
    line_length = len(result_line)
    # An explicit index is needed because an escape consumes two characters at once;
    # reassigning the loop variable of a for/range loop does not skip anything.
    index = 0
    while index < line_length:
        result_char = result_line[index]
        if result_char == '\\' and index + 1 < line_length:
            index += 1
            current_chars.append(result_line[index])
        elif result_char == ',':
            columns.append("".join(current_chars))
            current_chars = []
        else:
            current_chars.append(result_char)
        index += 1
    columns.append("".join(current_chars))
    return columns
| |
| # Digest a report API call result into an array of dictionaries corresponding to the results returned |
def process_api_result(result, column_list):
    """ Turn the text output of an API script into an array of dictionaries.

    Each line of result is split with split_api_result_line(); the i-th column is stored,
    stripped of surrounding whitespace, under column_list[i].  Columns beyond
    len(column_list) are ignored; a line with fewer columns raises IndexError.
    """
    records = []
    for line in result.splitlines():
        fields = split_api_result_line(line)
        record = dict( (column_list[position], fields[position].strip())
                       for position in range(len(column_list)) )
        records.append(record)
    return records
| |
| # List jobs using the API and return the results. |
def list_jobs_api( ):
    """ List jobs using the API.
    Runs the crawler-listjobs script through invoke_script() and returns one dictionary
    per output line, keyed by the column names below.
    """
    job_columns = [ "identifier",
                    "description",
                    "connection",
                    "startmode",
                    "runmode",
                    "hopcountmode",
                    "priority",
                    "rescaninterval",
                    "expirationinterval",
                    "reseedinterval",
                    "documenttemplate" ]
    script_output = invoke_script( ["/usr/lib/metacarta/crawler-listjobs"] )
    return process_api_result(script_output, job_columns)
| |
| # List job statuses using the API and return the results |
def list_job_statuses_api( ):
    """ List job statuses using the API.
    Runs the crawler-listjobstatuses script through invoke_script() and returns one
    dictionary per output line, keyed by the column names below.
    """
    status_columns = [ "identifier",
                       "description",
                       "status",
                       "inqueue",
                       "outstanding",
                       "processed",
                       "starttime",
                       "endtime",
                       "errortext" ]
    script_output = invoke_script( ["/usr/lib/metacarta/crawler-listjobstatuses"] )
    return process_api_result(script_output, status_columns)
| |
| # Get job collections using the API and return the results |
def get_job_collections_api( job_id ):
    """ Get the collections of a job using the API.
    Runs the crawler-getjobcollections script for job_id through invoke_script() and
    returns one dictionary per output line, each with the single key "collection".
    """
    command = [ "/usr/lib/metacarta/crawler-getjobcollections", job_id ]
    script_output = invoke_script( command )
    return process_api_result(script_output, ["collection"])
| |
| # Get job schedule using the API and return the results |
def get_job_schedule_api( job_id ):
    """ Get the schedule of a job using the API.
    Runs the crawler-getjobschedule script for job_id through invoke_script() and
    returns one dictionary per output line, keyed by the column names below.
    """
    schedule_columns = [ "daysofweek",
                         "years",
                         "months",
                         "days",
                         "hours",
                         "minutes",
                         "timezone",
                         "duration" ]
    command = [ "/usr/lib/metacarta/crawler-getjobschedule", job_id ]
    script_output = invoke_script( command )
    return process_api_result(script_output, schedule_columns)
| |
| # Run a simple history report using the API and return the results. |
def run_simple_history_report_api( connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        start_result_row=0, max_result_count=10000 ):
    """ Run a simple history report using the API and return an array of dictionaries,
    one per output line of the crawler-runsimplehistory script.

    start_time and end_time are formatted with format_time(), so None becomes an empty
    argument; the original documentation says the defaults are start time one hour ago and
    end time now, which is decided by the script and cannot be confirmed from here.
    NOTE(review): entity_regexp and result_regexp are passed through unchanged, including
    None - confirm invoke_script() accepts that.
    """
    command = [ "/usr/lib/metacarta/crawler-runsimplehistory",
                connection_name,
                format_activity_list(activities_list),
                format_time(start_time),
                format_time(end_time),
                entity_regexp,
                result_regexp,
                # Seventh argument is always sent empty; its meaning is defined by the script
                "",
                str(start_result_row),
                str(max_result_count) ]
    script_output = invoke_script( command )

    # Decode the result
    report_columns = [ "identifier", "activity", "start_time", "elapsed_time",
                       "result_code", "result_desc", "byte_count" ]
    return process_api_result(script_output, report_columns)
| |
| # Run a simple history report from the UI and return the results. |
def run_simple_history_report_ui( username, password, connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        max_result_count=10000, more=False ):
    """ Run a simple history report through the crawler UI and return the scraped rows.

    Drives a VirtualBrowser session: loads the crawler main page, follows the
    "Simple history" link, picks connection_name, fills in the report form, executes the
    query, checks the paging controls with report_sanity_check(window, more) and parses the
    result table with parse_tableresult().

    start_time and end_time are each either None (the corresponding selectors are left
    untouched) or a 5-tuple (hour, minute, day, month, year).  A None inside a tuple selects
    the empty option "" for that selector.  The original documentation says that the default
    is start time one hour ago and end time now, and that a None anywhere in a tuple means
    the current time for the whole tuple; that is UI behavior not visible here.
    Returns an array of dictionaries, one per data row, keyed by the header cells.
    """

    # Open a virtual browser session on the crawler UI main page
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Follow the report link from the main page
    main_page = browser.find_window("")
    main_page.find_link("Simple history").click( )

    # First report page: choose the connection, then continue
    connection_page = browser.find_window("")
    connection_form = connection_page.find_form("report")
    connection_form.find_selectbox("reportconnection").select_value( connection_name )
    connection_page.find_button("Continue").click( )

    # Second report page: fill in all the other report parameters
    report_page = browser.find_window("")
    report_form = report_page.find_form("report")
    activity_box = report_form.find_selectbox("reportactivities")
    for activity_name in activities_list:
        activity_box.multi_select_value( activity_name )

    # Optional entity and result-code filters
    if entity_regexp is not None:
        report_form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        report_form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Start and end time selectors, in (hour, minute, day, month, year) order
    time_selections = (
        ( start_time, ( "reportstarthour", "reportstartminute", "reportstartday",
                        "reportstartmonth", "reportstartyear" ) ),
        ( end_time, ( "reportendhour", "reportendminute", "reportendday",
                      "reportendmonth", "reportendyear" ) ),
        )
    for time_tuple, field_names in time_selections:
        if time_tuple is None:
            continue
        hour, minute, day, month, year = time_tuple
        for field_name, field_value in zip( field_names, (hour, minute, day, month, year) ):
            if field_value is None:
                option = ""
            else:
                option = str(field_value)
            report_form.find_selectbox(field_name).select_value( option )

    report_form.find_textarea("rowcount").set_value( str(max_result_count) )

    # Fire off the query
    report_page.find_button("Execute this query").click( )

    # Make sure the response is consistent, then scrape out the report data
    result_page = browser.find_window("")
    report_sanity_check( result_page, more )
    return parse_tableresult( result_page.get_data() )
| |
| # Run a max activity history report using the API and return the results. |
def run_max_activity_history_report_api( connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        start_result_row=0, max_result_count=10000,
                        entity_bin_regexp=None, window_size_minutes=None ):
    """ Run a max activity history report using the API and return an array of dictionaries,
    one per output line of the crawler-runmaxactivityhistory script.

    start_time and end_time are formatted with format_time() (None becomes an empty
    argument) and window_size_minutes with format_window_size() (None becomes "5").  The
    original documentation says the time defaults are start one hour ago and end now; that
    is decided by the script and cannot be confirmed from here.
    NOTE(review): entity_regexp, result_regexp and entity_bin_regexp are passed through
    unchanged, including None - confirm invoke_script() accepts that.
    """
    command = [ "/usr/lib/metacarta/crawler-runmaxactivityhistory",
                connection_name,
                format_activity_list(activities_list),
                format_time(start_time),
                format_time(end_time),
                entity_regexp,
                result_regexp,
                # Seventh argument is always sent empty; its meaning is defined by the script
                "",
                entity_bin_regexp,
                format_window_size(window_size_minutes),
                str(start_result_row),
                str(max_result_count) ]
    script_output = invoke_script( command )

    # Decode the result
    report_columns = [ "identifier_bucket", "starttime_ms", "endtime_ms", "activity_count" ]
    return process_api_result(script_output, report_columns)
| |
| # Run a max activity history report from the UI and return the results. |
def run_max_activity_history_report_ui( username, password, connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        max_result_count=10000, more=False,
                        entity_bin_regexp=None, window_size_minutes=None ):
    """ Run a max activity history report through the crawler UI and return the scraped rows.

    Drives a VirtualBrowser session: loads the crawler main page, follows the
    "Maximum activity" link, picks connection_name, fills in the report form, executes the
    query, checks the paging controls with report_sanity_check(window, more) and parses the
    result table with parse_tableresult().

    start_time and end_time are each either None (the corresponding selectors are left
    untouched) or a 5-tuple (hour, minute, day, month, year).  A None inside a tuple selects
    the empty option "" for that selector.  The original documentation says that the default
    is start time one hour ago and end time now, and that a None anywhere in a tuple means
    the current time for the whole tuple; that is UI behavior not visible here.
    entity_bin_regexp and window_size_minutes are only entered when they are not None.
    Returns an array of dictionaries, one per data row, keyed by the header cells.
    """

    # Open a virtual browser session on the crawler UI main page
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Follow the report link from the main page
    main_page = browser.find_window("")
    main_page.find_link("Maximum activity").click( )

    # First report page: choose the connection, then continue
    connection_page = browser.find_window("")
    connection_form = connection_page.find_form("report")
    connection_form.find_selectbox("reportconnection").select_value( connection_name )
    connection_page.find_button("Continue").click( )

    # Second report page: fill in all the other report parameters
    report_page = browser.find_window("")
    report_form = report_page.find_form("report")
    activity_box = report_form.find_selectbox("reportactivities")
    for activity_name in activities_list:
        activity_box.multi_select_value( activity_name )

    # Optional entity and result-code filters
    if entity_regexp is not None:
        report_form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        report_form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Start and end time selectors, in (hour, minute, day, month, year) order
    time_selections = (
        ( start_time, ( "reportstarthour", "reportstartminute", "reportstartday",
                        "reportstartmonth", "reportstartyear" ) ),
        ( end_time, ( "reportendhour", "reportendminute", "reportendday",
                      "reportendmonth", "reportendyear" ) ),
        )
    for time_tuple, field_names in time_selections:
        if time_tuple is None:
            continue
        hour, minute, day, month, year = time_tuple
        for field_name, field_value in zip( field_names, (hour, minute, day, month, year) ):
            if field_value is None:
                option = ""
            else:
                option = str(field_value)
            report_form.find_selectbox(field_name).select_value( option )

    # Optional bucket description and interval
    if entity_bin_regexp is not None:
        report_form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if window_size_minutes is not None:
        report_form.find_textarea("reportinterval").set_value( str(window_size_minutes) )

    report_form.find_textarea("rowcount").set_value( str(max_result_count) )

    # Fire off the query
    report_page.find_button("Execute this query").click( )

    # Make sure the response is consistent, then scrape out the report data
    result_page = browser.find_window("")
    report_sanity_check( result_page, more )
    return parse_tableresult( result_page.get_data() )
| |
| # Run a max bandwidth history report using the API and return the results. |
def run_max_bandwidth_history_report_api( connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        start_result_row=0, max_result_count=10000,
                        entity_bin_regexp=None, window_size_minutes=None ):
    """ Run a max bandwidth history report using the API and return an array of dictionaries,
    one per output line of the crawler-runmaxbandwidthhistory script.

    start_time and end_time are formatted with format_time() (None becomes an empty
    argument) and window_size_minutes with format_window_size() (None becomes "5").  The
    original documentation says the time defaults are start one hour ago and end now; that
    is decided by the script and cannot be confirmed from here.
    NOTE(review): entity_regexp, result_regexp and entity_bin_regexp are passed through
    unchanged, including None - confirm invoke_script() accepts that.
    """
    command = [ "/usr/lib/metacarta/crawler-runmaxbandwidthhistory",
                connection_name,
                format_activity_list(activities_list),
                format_time(start_time),
                format_time(end_time),
                entity_regexp,
                result_regexp,
                # Seventh argument is always sent empty; its meaning is defined by the script
                "",
                entity_bin_regexp,
                format_window_size(window_size_minutes),
                str(start_result_row),
                str(max_result_count) ]
    script_output = invoke_script( command )

    # Decode the result
    report_columns = [ "identifier_bucket", "starttime_ms", "endtime_ms", "byte_count" ]
    return process_api_result(script_output, report_columns)
| |
| # Run a max bandwidth history report from the UI and return the results. |
def run_max_bandwidth_history_report_ui( username, password, connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        max_result_count=10000, more=False,
                        entity_bin_regexp=None, window_size_minutes=None ):
    """ Run a max bandwidth history report through the crawler UI and return the scraped rows.

    Drives a VirtualBrowser session: loads the crawler main page, follows the
    "Maximum bandwidth" link, picks connection_name, fills in the report form, executes the
    query, checks the paging controls with report_sanity_check(window, more) and parses the
    result table with parse_tableresult().

    start_time and end_time are each either None (the corresponding selectors are left
    untouched) or a 5-tuple (hour, minute, day, month, year).  A None inside a tuple selects
    the empty option "" for that selector.  The original documentation says that the default
    is start time one hour ago and end time now, and that a None anywhere in a tuple means
    the current time for the whole tuple; that is UI behavior not visible here.
    entity_bin_regexp and window_size_minutes are only entered when they are not None.
    Returns an array of dictionaries, one per data row, keyed by the header cells.
    """

    # Open a virtual browser session on the crawler UI main page
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Follow the report link from the main page
    main_page = browser.find_window("")
    main_page.find_link("Maximum bandwidth").click( )

    # First report page: choose the connection, then continue
    connection_page = browser.find_window("")
    connection_form = connection_page.find_form("report")
    connection_form.find_selectbox("reportconnection").select_value( connection_name )
    connection_page.find_button("Continue").click( )

    # Second report page: fill in all the other report parameters
    report_page = browser.find_window("")
    report_form = report_page.find_form("report")
    activity_box = report_form.find_selectbox("reportactivities")
    for activity_name in activities_list:
        activity_box.multi_select_value( activity_name )

    # Optional entity and result-code filters
    if entity_regexp is not None:
        report_form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        report_form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Start and end time selectors, in (hour, minute, day, month, year) order
    time_selections = (
        ( start_time, ( "reportstarthour", "reportstartminute", "reportstartday",
                        "reportstartmonth", "reportstartyear" ) ),
        ( end_time, ( "reportendhour", "reportendminute", "reportendday",
                      "reportendmonth", "reportendyear" ) ),
        )
    for time_tuple, field_names in time_selections:
        if time_tuple is None:
            continue
        hour, minute, day, month, year = time_tuple
        for field_name, field_value in zip( field_names, (hour, minute, day, month, year) ):
            if field_value is None:
                option = ""
            else:
                option = str(field_value)
            report_form.find_selectbox(field_name).select_value( option )

    # Optional bucket description and interval
    if entity_bin_regexp is not None:
        report_form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if window_size_minutes is not None:
        report_form.find_textarea("reportinterval").set_value( str(window_size_minutes) )

    report_form.find_textarea("rowcount").set_value( str(max_result_count) )

    # Fire off the query
    report_page.find_button("Execute this query").click( )

    # Make sure the response is consistent, then scrape out the report data
    result_page = browser.find_window("")
    report_sanity_check( result_page, more )
    return parse_tableresult( result_page.get_data() )
| |
| # Run a result histogram history report using the API and return the results. |
def run_result_histogram_history_report_api( connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        start_result_row=0, max_result_count=10000,
                        entity_bin_regexp=None, result_bin_regexp=None ):
    """ Run a result histogram report using the API and return an array of dictionaries,
    one per output line of the crawler-runresulthistory script.

    start_time and end_time are formatted with format_time(), so None becomes an empty
    argument; the original documentation says the defaults are start time one hour ago and
    end time now, which is decided by the script and cannot be confirmed from here.
    NOTE(review): entity_regexp, result_regexp, entity_bin_regexp and result_bin_regexp are
    passed through unchanged, including None - confirm invoke_script() accepts that.
    """
    command = [ "/usr/lib/metacarta/crawler-runresulthistory",
                connection_name,
                format_activity_list(activities_list),
                format_time(start_time),
                format_time(end_time),
                entity_regexp,
                result_regexp,
                # Seventh argument is always sent empty; its meaning is defined by the script
                "",
                entity_bin_regexp,
                result_bin_regexp,
                str(start_result_row),
                str(max_result_count) ]
    script_output = invoke_script( command )

    # Decode the result
    report_columns = [ "identifier_bucket", "resultcode_bucket", "event_count" ]
    return process_api_result(script_output, report_columns)
| |
| # Run a result histogram history report from the UI and return the results. |
def run_result_histogram_history_report_ui( username, password, connection_name,
                        activities_list,
                        start_time=None, end_time=None,
                        entity_regexp=None, result_regexp=None,
                        max_result_count=10000, more=False,
                        entity_bin_regexp=None, result_bin_regexp=None ):
    """ Run a result histogram report through the crawler UI and return the scraped rows.

    Drives a VirtualBrowser session: loads the crawler main page, follows the
    "Result histogram" link, picks connection_name, fills in the report form, executes the
    query, checks the paging controls with report_sanity_check(window, more) and parses the
    result table with parse_tableresult().

    start_time and end_time are each either None (the corresponding selectors are left
    untouched) or a 5-tuple (hour, minute, day, month, year).  A None inside a tuple selects
    the empty option "" for that selector.  The original documentation says that the default
    is start time one hour ago and end time now, and that a None anywhere in a tuple means
    the current time for the whole tuple; that is UI behavior not visible here.
    entity_bin_regexp and result_bin_regexp are only entered when they are not None.
    Returns an array of dictionaries, one per data row, keyed by the header cells.
    """

    # Open a virtual browser session on the crawler UI main page
    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )
    browser.load_main_window( "http://localhost/crawler/index.jsp" )

    # Follow the report link from the main page
    main_page = browser.find_window("")
    main_page.find_link("Result histogram").click( )

    # First report page: choose the connection, then continue
    connection_page = browser.find_window("")
    connection_form = connection_page.find_form("report")
    connection_form.find_selectbox("reportconnection").select_value( connection_name )
    connection_page.find_button("Continue").click( )

    # Second report page: fill in all the other report parameters
    report_page = browser.find_window("")
    report_form = report_page.find_form("report")
    activity_box = report_form.find_selectbox("reportactivities")
    for activity_name in activities_list:
        activity_box.multi_select_value( activity_name )

    # Optional entity and result-code filters
    if entity_regexp is not None:
        report_form.find_textarea("reportentitymatch").set_value( entity_regexp )
    if result_regexp is not None:
        report_form.find_textarea("reportresultcodematch").set_value( result_regexp )

    # Start and end time selectors, in (hour, minute, day, month, year) order
    time_selections = (
        ( start_time, ( "reportstarthour", "reportstartminute", "reportstartday",
                        "reportstartmonth", "reportstartyear" ) ),
        ( end_time, ( "reportendhour", "reportendminute", "reportendday",
                      "reportendmonth", "reportendyear" ) ),
        )
    for time_tuple, field_names in time_selections:
        if time_tuple is None:
            continue
        hour, minute, day, month, year = time_tuple
        for field_name, field_value in zip( field_names, (hour, minute, day, month, year) ):
            if field_value is None:
                option = ""
            else:
                option = str(field_value)
            report_form.find_selectbox(field_name).select_value( option )

    # Optional entity and result-code bucket descriptions
    if entity_bin_regexp is not None:
        report_form.find_textarea("reportbucketdesc").set_value( entity_bin_regexp )
    if result_bin_regexp is not None:
        report_form.find_textarea("reportresultdesc").set_value( result_bin_regexp )

    report_form.find_textarea("rowcount").set_value( str(max_result_count) )

    # Fire off the query
    report_page.find_button("Execute this query").click( )

    # Make sure the response is consistent, then scrape out the report data
    result_page = browser.find_window("")
    report_sanity_check( result_page, more )
    return parse_tableresult( result_page.get_data() )
| |
# Integer code for each standard document state name.
# NOTE(review): presumably the option values used by the document status UI form - confirm.
document_state_dictionary = {
    "never_been_processed" : 0,
    "processed_at_least_once" : 1,
    }

# Integer code for each standard document status name (same caveat as above).
document_status_dictionary = {
    "no_longer_active" : 0,
    "in_progress" : 1,
    "being_expired" : 2,
    "being_deleted" : 3,
    "available_for_processing" : 4,
    "available_for_expiration" : 5,
    "not_yet_processable" : 6,
    "not_yet_expirable" : 7,
    "waiting_forever" : 8,
    }
| |
| # Format the job id list |
def format_job_list(job_list):
    """ Join the job identifiers into the single comma-separated argument the API methods take """
    separator = ","
    return separator.join(job_list)
| |
| # Format a time offset in minutes |
def format_offset_minutes(time_offset_minutes):
    """ Format a time offset given in minutes as a string of whole milliseconds.

    None is formatted as "0".  Any other value is converted with float(), multiplied by
    60000 and truncated toward zero with int(), so fractional minutes are accepted and
    numeric strings work too.
    """
    if time_offset_minutes is None:
        return "0"
    # A plain int literal is enough: the product is a float either way, and int() yields
    # an arbitrary-precision integer.  The old "L" suffix is not valid Python 3 syntax.
    milliseconds_per_minute = 60000
    return str(int(float(time_offset_minutes) * milliseconds_per_minute))
| |
# Standard document state name -> state name understood by the API scripts
api_state_map = {
    "never_been_processed" : "neverprocessed",
    "processed_at_least_once" : "previouslyprocessed",
    }

# Format a state list
def format_state_list(document_states):
    """ Translate standard state strings into API state strings and join them with commas.
    Raises KeyError for a state that is not in api_state_map.
    """
    api_states = [ api_state_map[state_name] for state_name in document_states ]
    return ",".join(api_states)
| |
# Standard document status name -> status name understood by the API scripts
api_status_map = {
    "no_longer_active" : "inactive",
    "in_progress" : "processing",
    "being_expired" : "expiring",
    "being_deleted" : "deleting",
    "available_for_processing" : "readyforprocessing",
    "available_for_expiration" : "readyforexpiration",
    "not_yet_processable" : "waitingforprocessing",
    "not_yet_expirable" : "waitingforexpiration",
    "waiting_forever" : "waitingforever",
    }

# Format a status list
def format_status_list(document_statuses):
    """ Translate standard status strings into API status strings and join them with commas.
    Raises KeyError for a status that is not in api_status_map.
    """
    api_statuses = [ api_status_map[status_name] for status_name in document_statuses ]
    return ",".join(api_statuses)
| |
| # Run a document status report using the API and return the results. |
def run_document_status_api( connection_name,
                        job_list,
                        time_offset_minutes=None,
                        document_states=[ "never_been_processed", "processed_at_least_once" ],
                        document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                                "available_for_processing", "available_for_expiration",
                                "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                        identifier_regexp=None,
                        start_result_row=0, max_result_count=10000 ):
    """ Run a document queue status report using the API and return an array of dictionaries,
    one per output line of the crawler-rundocumentstatus script.

    document_states is an array of strings whose legal values are "never_been_processed"
    and "processed_at_least_once".  document_statuses is an array of strings whose legal
    values are "no_longer_active", "in_progress", "being_expired", "being_deleted",
    "available_for_processing", "available_for_expiration", "not_yet_processable",
    "not_yet_expirable" and "waiting_forever".  Both are translated to the API names by
    format_state_list() and format_status_list(); the default lists are only read here,
    never modified.  time_offset_minutes is converted to milliseconds by
    format_offset_minutes().
    NOTE(review): identifier_regexp is passed through unchanged, including None - confirm
    invoke_script() accepts that.
    """
    command = [ "/usr/lib/metacarta/crawler-rundocumentstatus",
                connection_name,
                format_job_list(job_list),
                format_offset_minutes(time_offset_minutes),
                format_state_list(document_states),
                format_status_list(document_statuses),
                identifier_regexp,
                # Always sent empty; its meaning is defined by the script
                "",
                str(start_result_row),
                str(max_result_count) ]
    script_output = invoke_script( command )

    # Decode the result
    report_columns = [ "doc_identifier", "job_description", "document_state", "document_status",
                       "when_scheduled", "action_to_take", "remaining_retrycount", "retrylimit_time" ]
    return process_api_result(script_output, report_columns)
| |
| |
# Run a document status report through the crawler UI and return the scraped results.
def run_document_status_ui( username, password, connection_name,
                job_list,
                time_offset_minutes=None,
                document_states=[ "never_been_processed", "processed_at_least_once" ],
                document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                        "available_for_processing", "available_for_expiration",
                        "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                identifier_regexp=None,
                max_result_count=10000, more=False ):
    """ Drive the crawler UI with a virtual browser to run a document queue status report.

        Returns the result of parse_tableresult on the final page: an array of dictionaries,
        each dictionary having fields that correspond to the header data from the report.

        job_list must contain at least one job; otherwise an Exception is raised (after the
        connection has already been selected in the UI).
        document_statuses is an array of strings, whose legal values are:
        "no_longer_active", "in_progress", "being_expired", "being_deleted",
        "available_for_processing", "available_for_expiration", "not_yet_processable",
        "not_yet_expirable", "waiting_forever".
        document_states is an array of strings, whose legal values are:
        "never_been_processed" and "processed_at_least_once".
        Passing None for time_offset_minutes, document_states, document_statuses or
        identifier_regexp leaves the corresponding form field untouched.
        more is handed to report_sanity_check together with the result window.
    """

    browser = VirtualBrowser.VirtualBrowser( username=username, password=password )

    # Load the main page and follow the "Document status" link
    browser.load_main_window( "http://localhost/crawler/index.jsp" )
    browser.find_window("").find_link("Document status").click( )

    # First form page: pick the connection and continue
    page = browser.find_window("")
    report_form = page.find_form("report")
    report_form.find_selectbox("statusconnection").select_value( connection_name )
    page.find_button("Continue").click( )

    # Second form page: pick the jobs and continue
    page = browser.find_window("")
    report_form = page.find_form("report")
    if job_list is None or len(job_list) == 0:
        raise Exception("There must be some jobs!")
    jobs_box = report_form.find_selectbox("statusjobs")
    for job_name in job_list:
        jobs_box.multi_select_value( job_name )
    page.find_button("Continue").click( )

    # Third form page: fill in the remaining report parameters
    page = browser.find_window("")
    report_form = page.find_form("report")

    # Schedule time offset, if requested
    if time_offset_minutes is not None:
        report_form.find_textarea( "status_schedule_offset" ).set_value( str(time_offset_minutes) )

    # Document states, translated through document_state_dictionary
    if document_states is not None:
        states_box = report_form.find_selectbox("statusdocumentstates")
        for state_name in document_states:
            states_box.multi_select_value( str(document_state_dictionary[ state_name ]) )

    # Document statuses, translated through document_status_dictionary
    if document_statuses is not None:
        statuses_box = report_form.find_selectbox("statusdocumentstatuses")
        for status_name in document_statuses:
            statuses_box.multi_select_value( str(document_status_dictionary[ status_name ]) )

    # Identifier match expression, if requested
    if identifier_regexp is not None:
        report_form.find_textarea("statusidentifiermatch").set_value( identifier_regexp )

    # Maximum number of rows to report
    report_form.find_textarea("rowcount").set_value(str(max_result_count))

    # Run the query
    page.find_button("Execute this query").click( )

    # Check the result page for consistency, then scrape the report table out of it
    result_page = browser.find_window("")
    report_sanity_check( result_page, more )
    return parse_tableresult( result_page.get_data() )
| |
# Run a queue status report through the API script and return the decoded rows.
def run_queue_status_api( connection_name,
                job_list,
                time_offset_minutes=None,
                document_states=[ "never_been_processed", "processed_at_least_once" ],
                document_statuses=[ "no_longer_active", "in_progress", "being_expired", "being_deleted",
                        "available_for_processing", "available_for_expiration",
                        "not_yet_processable", "not_yet_expirable", "waiting_forever" ],
                identifier_regexp=None,
                start_result_row=0, max_result_count=10000,
                bucket_regexp=None ):
    """ Run a queue status report by invoking crawler-runqueuestatus.

        Returns whatever process_api_result produces for the script output: an array of
        dictionaries, one per report row, keyed by "id_bucket", "inactive_count",
        "processing_count", "expiring_count", "deleting_count", "process_ready_count",
        "expire_ready_count", "process_waiting_count", "expire_waiting_count" and
        "waiting_forever_count".

        document_statuses is an array of strings, whose legal values are:
        "no_longer_active", "in_progress", "being_expired", "being_deleted",
        "available_for_processing", "available_for_expiration", "not_yet_processable",
        "not_yet_expirable", "waiting_forever".
        document_states is an array of strings, whose legal values are:
        "never_been_processed" and "processed_at_least_once".
        identifier_regexp and bucket_regexp are handed to the script unchanged;
        start_result_row and max_result_count are passed as strings.
    """
    column_names = [ "id_bucket", "inactive_count", "processing_count", "expiring_count", "deleting_count",
                     "process_ready_count", "expire_ready_count", "process_waiting_count",
                     "expire_waiting_count", "waiting_forever_count" ]

    # Assemble the script arguments in the positional order the script is invoked with
    command = [ "/usr/lib/metacarta/crawler-runqueuestatus", connection_name ]
    command.append( format_job_list(job_list) )
    command.append( format_offset_minutes(time_offset_minutes) )
    command.append( format_state_list(document_states) )
    command.append( format_status_list(document_statuses) )
    command.append( identifier_regexp )
    # An empty string always occupies the slot between the identifier and bucket regexps
    # NOTE(review): the meaning of this slot is not visible from here - confirm against the script
    command.append( "" )
    command.append( bucket_regexp )
    command.append( str(start_result_row) )
    command.append( str(max_result_count) )

    script_output = invoke_script( command )

    # Decode the result
    return process_api_result( script_output, column_names )
| |
# Build a time value for a report given a time in seconds since epoch, using the current timezone
def build_report_time(seconds_since_epoch):
    """ Convert seconds since epoch into a (hours, minutes, days, month, year) tuple in local time.
        The day of the month and the month are zero-based (the 1st is 0, January is 0);
        hours, minutes and year are returned as time.localtime reports them.
    """
    local = time.localtime(seconds_since_epoch)
    # tm_mday and tm_mon start at 1, so shift both down by one
    return (local.tm_hour, local.tm_min, local.tm_mday - 1, local.tm_mon - 1, local.tm_year)
| |
# Build a time value for an API report given a time in seconds since epoch
def build_api_time(seconds_since_epoch):
    """ Convert seconds since epoch into a string holding whole milliseconds since epoch.
        Any fraction of a millisecond is dropped by the int() conversion.
    """
    milliseconds = int(seconds_since_epoch * 1000)
    return str(milliseconds)
| |
| # Miscellaneous file system test helpers. Dave put these here because he wanted his new tests to be able to use them; they really are pretty test-specific though. |
| |
# Copy a folder to a (new) area
def copy_folder( source, target ):
    """ Run "mkdir -p target" followed by "cp -r source target", both through invoke_root_script. """
    for command in ( [ "mkdir", "-p", target ], [ "cp", "-r", source, target ] ):
        invoke_root_script( command )
| |
# Remove a folder
def delete_folder( target ):
    """ Run "rm -rf target" through invoke_root_script. """
    removal_command = [ "rm", "-rf", target ]
    invoke_root_script( removal_command )
| |
| def preclean( username, print_errors=True ): |
| ''' Clean up everything we might have done during the execution of this test. |
| This will include all jobs and ingested documents. ''' |
| |
| try: |
| reset_all() |
| except Exception, e: |
| if print_errors: |
| print "Error resetting all jobs" |
| print e |
| |
| # Remove test documents first |
| for folder in [ "/root/crawlarea", "/root/crawlarea2" ]: |
| try: |
| delete_folder( folder ) |
| except Exception, e: |
| if print_errors: |
| print "Error removing %s" % folder |
| print e |
| |
| try: |
| delete_crawler_user( username ) |
| except Exception, e: |
| if print_errors: |
| print "Error removing crawler user" |
| print e |
| |
| try: |
| teardown_connector_environment( ) |
| except Exception, e: |
| if print_errors: |
| print "Error cleaning up debs" |
| print e |
| |
| try: |
| # Since one of the tests deregisters the filesystem connector, reregister it here to be sure it exists. |
| register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector") |
| except Exception, e: |
| if print_errors: |
| print "Error reregistering file system connector" |
| print e |