#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pg
import re
import os
import sys
import time
import socket
import shutil
import urllib2
import traceback
import ConnectorHelpers
import VirtualBrowser
from threading import Thread
import sqatools.LicenseMakerClient
from sqatools import sqautils
sys.path.append("/usr/lib/metacarta")
import MetaCartaVersion
# Dump output of lsof into log
def dump_debug():
print "Starting dump of lsof"
ConnectorHelpers.invoke_root_script( [ "lsof" ] )
print "Starting dump of ps -aux"
ConnectorHelpers.invoke_root_script( [ "ps", "-aux" ] )
print "Done dump"
# Copy a folder to a (new) area
def copy_folder( source, target ):
ConnectorHelpers.invoke_root_script( [ "mkdir", "-p", target ] )
ConnectorHelpers.invoke_root_script( [ "cp", "-r", source, target ] )
# Remove a folder
def delete_folder( target ):
ConnectorHelpers.invoke_root_script( [ "rm", "-rf", target ] )
# Run the spinner for looking for database/external lock deadlocks
def run_lock_spinner( import_file_name ):
ConnectorHelpers.invoke_crawler_command( "com.metacarta.crawler.JobStartSpinner", argument_list=[ ConnectorHelpers.process_argument(import_file_name) ] )
# Extract a password from a conf file
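# (The file is expected to contain a line of the form 'com.metacarta.ingest.password=<value>'.)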
def extract_password(file_name):
fd = open(file_name,"r")
try:
for line in fd.readlines():
index = line.find("com.metacarta.ingest.password=")
if index == 0:
# Strip off the newline at the end
return line[len("com.metacarta.ingest.password="):len(line)-1]
finally:
fd.close()
raise Exception("Password not found!")
class run_ingestion_test_server_thread(Thread):
def __init__ (self, response):
Thread.__init__(self)
self.response = response
self.setDaemon(True)
def run(self):
try:
# Start the ingestion test server
# Warning: This will hang until shut down!
while True:
output = ConnectorHelpers.invoke_root_script( ["nc", "-l", "localhost", "7031"] )
# Because system health checker doesn't die when advertised, we *may* need to restart nc
if output.find("MetaCarta-Verbose-Response:") == -1:
break
except Exception, e:
self.response += str(e)
# Disable the ability of the appliance to receive data from localhost port 7031
def startup_fake_ingestion_service(response):
""" Use nc to simulate a busted ingestion system """
# Start nc, by invoking in a different thread
thread = run_ingestion_test_server_thread(response)
thread.start()
# Sleep until we think nc is listening
time.sleep(20)
print "Ingestion test server successfully started up"
return thread
# Re-enable the ability of the appliance to receive data from localhost port 7031
def shutdown_fake_ingestion_service(thread=None, response=None):
""" Undo changes from disable """
# Send a shutdown signal. If this is nc listening, it should cause it to exit but not respond. If nc has already exited, it should timeout and get an exception.
# Otherwise, there should be a real response
try:
socket.setdefaulttimeout(10)
ConnectorHelpers.invoke_curl("http://localhost:7031/services/HTTPIngest/?STATUS")
socket.setdefaulttimeout(1000000)
except:
# Must already have been shut down; continue
pass
socket.setdefaulttimeout(1000000)
print "Ingest system fakeout successfully shut down"
# If there's a thread we know about, let it exit, and report any errors
if thread:
thread.join()
if response:
if len(response) > 0:
raise Exception("Ingestion fakeout server had problems: %s" % response)
# This class runs the second kind of ingestion fakeout we try, which returns an HTTP error status (500 by default) on every request instead of timing out
class run_non_timeout_ingestion_test_server_thread(Thread):
def __init__ (self, response, mode):
Thread.__init__(self)
self.response = response
self.mode = mode
self.setDaemon(True)
def run(self):
try:
# Start the ingestion test server
# Warning: This will hang until shut down!
ConnectorHelpers.invoke_root_script( ["python", "ingestion_fakeout_server.py", self.mode] )
except Exception, e:
self.response += str(e)
# Run the real (non-timeout) fakeout service
def startup_non_timeout_fake_ingestion_service(response,mode="500"):
""" Use our own script to simulate a busted ingestion system that returns 500 on every request """
# Start the fakeout server, by invoking it in a different thread
thread = run_non_timeout_ingestion_test_server_thread(response,mode)
thread.start()
# Loop until we think server is listening
while True:
error_seen = True
try:
ConnectorHelpers.invoke_curl("http://localhost:7031/checkalive")
error_seen = False
except:
pass
if error_seen:
time.sleep(1)
else:
break
print "Non-timeout ingestion test server successfully started up"
return thread
# Re-enable the ability of the appliance to receive data from localhost port 7031
def shutdown_non_timeout_fake_ingestion_service(thread=None, response=None):
""" Send shutdown signal, and wait for system to exit """
# Send a shutdown signal. If the fakeout server is still listening, this should cause it to exit. If it has
# already exited, the request should time out and we'll get an exception, which we ignore.
try:
socket.setdefaulttimeout(10)
ConnectorHelpers.invoke_curl("http://localhost:7031/shutdown")
socket.setdefaulttimeout(1000000)
except:
# Must already have been shut down; continue
pass
socket.setdefaulttimeout(1000000)
print "Non-timeout ingest system fakeout successfully shut down"
# If there's a thread we know about, let it exit, and report any errors
if thread:
thread.join()
if response:
if len(response) > 0:
raise Exception("Non-timeout ingestion fakeout server had problems: %s" % response)
# Stop health checkers
def stop_health_checker():
ConnectorHelpers.invoke_root_script( [ "/etc/init.d/system_health_monitor", "stop" ] )
# Health monitor doesn't really stop in synch with the above, so wait a while to be sure
time.sleep(60)
# Start health checkers
def start_health_checker():
ConnectorHelpers.invoke_root_script( [ "/etc/init.d/system_health_monitor", "start" ] )
# Edit file system repository connection via the UI (for BPA spinner test)
def resave_filesystem_repository_connection_ui( username, password, connection_name ):
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for repository connection management and click it
window = vb.find_window("")
link = window.find_link("List repository connections")
link.click( )
# Now, find the delete link for this connection
window = vb.find_window("")
link = window.find_link("Edit "+connection_name)
link.click( )
# Find the "save" button
window = vb.find_window("")
save_button = window.find_button("Save this connection")
save_button.click( )
# See if the connector saved
window = vb.find_window("")
found_connection_name = window.find_match("<!--connection=(.*)-->",1)
if found_connection_name != connection_name:
raise Exception("Edited connection doesn't match")
# Crawl user credentials
username = "testingest"
password = "testingest"
# A document template we can use to verify that document templating works
document_template_text = '<template>\n' + \
'<filter tagger_name="geo">\n' + \
'<end_regex>-{10}</end_regex>\n' + \
'</filter>\n' + \
'</template>\n'
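# The template defines a single filter for the "geo" tagger whose end_regex is ten dashes ("-{10}");
# it is meant to be passed to the job-definition helpers via their document_template argument.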
def count_java_heap_dumps( ):
""" Count the number of heap dumps in the /common/metacarta/java-heap-dumps directory """
results = ConnectorHelpers.invoke_root_script( [ "ls","-1","/common/metacarta/java-heap-dumps" ] )
return len(results.splitlines())
def start_database( ):
""" Start the database """
ConnectorHelpers.invoke_root_script(["/etc/init.d/postgresql-8.3", "start"])
time.sleep(15)
def stop_database( ):
""" Stop the database """
ConnectorHelpers.invoke_root_script(["/etc/init.d/postgresql-8.3", "stop"])
def system_health_check( ):
""" Return True if the authority status comes out as a one line error, or
False if it comes out as "skipping", or an exception if anything else.
"""
text = ConnectorHelpers.invoke_root_script(["/usr/bin/check_system_health"], allow_errors=True)
for line in text.splitlines():
if line.find("Authority check already in progress") != -1:
return False
if line.find("Exception checking on authorities") != -1:
if line.find("[Error getting connection]") != -1:
return True
raise Exception("Expected check_system_health to return a single-line status for authority checks, instead saw %s" % text)
signatures = [ "Schema upgrade in progress",
"Schema incorrect for table",
"Extra field for table",
"Field definition incorrect for table",
"Indexes incorrect for table",
"Index definition incorrect for table",
"Unexpected index definition for table",
"Index definition for table" ]
def gather_schema_errors( ):
""" Return a list of errors that seem to be schema-related """
rval = []
text = ConnectorHelpers.invoke_root_script(["/usr/bin/check_system_health"], allow_errors=True)
for line in text.splitlines():
# Look for the pertinent signatures in this line
for signature in signatures:
if line.find(signature) != -1:
rval += [ line ]
break
return rval
# Create an outofmemory repository connection via the UI
def define_outofmemory_repository_connection_ui( username, password, connection_name, connection_description,
throttles=None,
max_connections=None,
failure_mode=None):
""" The throttles argument is an array of tuples. Each tuple represents a throttle and is of the form (regexp,description,avg-fetch-rate).
"""
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for repository connection management and click it
window = vb.find_window("")
link = window.find_link("List repository connections")
link.click( )
# Click "add a connection"
window = vb.find_window("")
link = window.find_link("Add a connection")
link.click( )
# Find the right form elements and set them
window = vb.find_window("")
form = window.find_form("editconnection")
# "Name" tab
namefield = form.find_textarea("connname")
descriptionfield = form.find_textarea("description")
namefield.set_value( connection_name )
descriptionfield.set_value( connection_description )
# "Type" tab
link = window.find_link("Type tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
connectortypefield = form.find_selectbox("classname")
connectortypefield.select_value( "com.metacarta.crawler.connectors.outofmemory.OutOfMemoryConnector" )
# Click the "Continue" button
continue_button = window.find_button("Continue to next page")
continue_button.click( )
window = vb.find_window("")
# "Throttling" tab
link = window.find_link("Throttling tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if throttles != None:
for throttle in throttles:
regexp,description,rate = throttle
# Add a throttle with the specified parameters
regexpfield = form.find_textarea("throttle")
descfield = form.find_textarea("throttledesc")
valuefield = form.find_textarea("throttlevalue")
regexpfield.set_value( regexp )
if description != None:
descfield.set_value( description )
valuefield.set_value( str(rate) )
add_button = window.find_button("Add throttle")
add_button.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if max_connections != None:
form.find_textarea("maxconnections").set_value( str(max_connections) )
# "Failure Mode" tab
link = window.find_link("Failure Mode tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if failure_mode != None:
form.find_selectbox("failuremode").select_value(failure_mode)
# Now, save this page
save_button = window.find_button("Save this connection")
save_button.click( )
# See if the connector saved
window = vb.find_window("")
found_connection_name = window.find_match("<!--connection=(.*)-->",1)
if found_connection_name != connection_name:
raise Exception("Created connection doesn't match")
# Define a standard job using the UI
def define_outofmemory_job_ui( username,
password,
job_name,
connection_name,
startpoints_and_matches,
collection_name=None,
document_template=None,
hop_filters=None,
hop_mode=None,
type="specified",
startmethod="windowbegin",
recrawlinterval=0 ):
"""connection_name is the name of the filesystem connection. startpoints_and_matches
is an array, each element of which is a tuple. The tuple consists of the start point
path, and an array of match specifications. Each match specification is a tuple
consisting of a string (either "include" or "exclude"), a type (either "file" or "directory"),
and a match value (such as "*.txt").
Legal values for type are: "specified" or "continuous"
Legal values for start method are: "windowbegin", "windowinside", or "disable".
Hop filters are an array of tuples, each one ( filter_name, filter_value ).
Hop mode has the legal values "accurate", "nodelete", or "neverdelete".
"""
# We should be able to use the filesystem connector job creation UI here
return ConnectorHelpers.define_filesystem_job_ui(username,password,job_name,connection_name,
startpoints_and_matches,collection_name=collection_name,document_template=document_template,
hop_filters=hop_filters,hop_mode=hop_mode,type=type,startmethod=startmethod,recrawlinterval=recrawlinterval)
def preclean( print_errors=True ):
''' Clean up everything we might have done during the execution of this test.
This will include all jobs and ingested documents. '''
# Restore ingestion system
try:
shutdown_fake_ingestion_service()
except Exception, e:
if print_errors:
print "Error restoring ingestion system"
print e
try:
shutdown_non_timeout_fake_ingestion_service()
except Exception, e:
if print_errors:
print "Error restoring ingestion system"
print e
try:
ConnectorHelpers.start_leafblower()
except Exception, e:
if print_errors:
print "Error starting leafblower"
print e
try:
start_health_checker()
except Exception, e:
if print_errors:
print "Error starting health checker"
print e
# Set clock back to actual time, if needed
try:
ConnectorHelpers.restore_clock()
except Exception, e:
if print_errors:
print "Error restoring clock"
print e
# Start database if it is stopped
try:
start_database( )
except Exception, e:
if print_errors:
print "Error starting database"
print e
# Restore schema if it has been altered
print "Restoring schema."
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
# First, get hold of the column definitions for intrinsiclink
schema_query = "SELECT pg_attribute.attname AS field_col," + \
"CASE pg_type.typname WHEN 'int2' THEN 'smallint' WHEN 'int4' THEN 'int'" + \
" WHEN 'int8' THEN 'bigint' WHEN 'varchar' THEN 'varchar(' || pg_attribute.atttypmod-4 || ')'" + \
" WHEN 'float8' THEN 'double'" + \
" WHEN 'text' THEN 'longtext'" + \
" WHEN 'bpchar' THEN 'char(' || pg_attribute.atttypmod-4 || ')'" + \
" ELSE pg_type.typname END AS type_col," + \
"CASE WHEN pg_attribute.attnotnull THEN 'no' ELSE 'yes' END AS null_col," + \
"CASE pg_type.typname WHEN 'varchar' THEN substring(pg_attrdef.adsrc from '^(.*).*$') ELSE pg_attrdef.adsrc END AS Default " + \
"FROM pg_class INNER JOIN pg_attribute ON (pg_class.oid=pg_attribute.attrelid) INNER JOIN pg_type ON (pg_attribute.atttypid=pg_type.oid) " + \
"LEFT JOIN pg_attrdef ON (pg_class.oid=pg_attrdef.adrelid AND pg_attribute.attnum=pg_attrdef.adnum) " + \
"WHERE pg_class.relname='%s' AND pg_attribute.attnum>=1 AND NOT pg_attribute.attisdropped " + \
"ORDER BY pg_attribute.attnum"
schema_results = db.query(schema_query % "intrinsiclink").dictresult()
seen_isnew = False
for row in schema_results:
field_name = row["field_col"]
if field_name == "isnew":
seen_isnew = True
elif field_name == "wasnew":
# Delete this column!
db.query("ALTER TABLE intrinsiclink DROP COLUMN wasnew")
if seen_isnew == False:
# Create isnew column
db.query("ALTER TABLE intrinsiclink ADD COLUMN isnew CHAR(1) NULL")
index_query = "SELECT pg_catalog.pg_get_indexdef(i.indexrelid, 0, true) AS indexdef FROM pg_catalog.pg_class c, pg_catalog.pg_class c2, pg_catalog.pg_index i " + \
"WHERE c.relname = '%s' AND c.oid = i.indrelid AND i.indexrelid = c2.oid"
index_results = db.query(index_query % "intrinsiclink").dictresult()
seen_dropindex = False
for definition in index_results:
indexdef = definition["indexdef"]
if indexdef.find("(jobid, childidhash, isnew)") != -1:
seen_dropindex = True
elif indexdef.find("(isnew)") != -1 and indexdef.find("temporaryindex") != -1:
# Drop this index
db.query("DROP INDEX temporaryindex")
if seen_dropindex == False:
# Recreate missing index
db.query("CREATE INDEX i123 ON intrinsiclink (jobid,childidhash,isnew)")
finally:
db.close()
print "Done restoring schema"
# Start agents if it is down
try:
ConnectorHelpers.start_agents()
except Exception, e:
if print_errors:
print "Error starting agents service"
print e
try:
ConnectorHelpers.reset_all()
except Exception, e:
if print_errors:
print "Error resetting all jobs"
print e
# Remove saved crawl configuration files, if any
for file in [ "test_crawl_1.conf", "test_crawl_2.conf", "test_crawl_3.conf" ]:
try:
os.unlink( file )
except Exception, e:
if print_errors:
print "Error removing %s" % file
print e
# Remove test documents first
for folder in [ "/root/crawlarea", "/root/crawlarea2" ]:
try:
delete_folder( folder )
except Exception, e:
if print_errors:
print "Error removing %s" % folder
print e
try:
sqatools.LicenseMakerClient.revoke_license()
except Exception, e:
if print_errors:
print "Error revoking license"
print e
try:
ConnectorHelpers.delete_crawler_user( username )
except Exception, e:
if print_errors:
print "Error removing crawler user"
print e
try:
ConnectorHelpers.teardown_connector_environment( )
except Exception, e:
if print_errors:
print "Error cleaning up debs"
print e
try:
# Since one of the tests deregisters the filesystem connector, reregister it here to be sure it exists.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
except Exception, e:
if print_errors:
print "Error reregistering file system connector"
print e
# Main
if __name__ == '__main__':
print "Precleaning!"
preclean( print_errors=False )
print "Clearing metacarta logs"
log_pos = ConnectorHelpers.get_metacarta_log_pos( )
agents_log_pos = ConnectorHelpers.get_metacarta_log_pos( log_name="/var/log/metacarta/java-agents/agents.log" )
print "Setup Connector Environment."
ConnectorHelpers.setup_connector_environment()
print "Setting up file area."
copy_folder("/root/testfiles","/root/crawlarea")
copy_folder("/root/testfiles2","/root/crawlarea")
ConnectorHelpers.create_crawler_user( username, password )
# PHASE 0: Checking whether reset-crawler script seems to work
print "Trying reset-crawler command..."
ConnectorHelpers.invoke_root_script( [ "/usr/lib/metacarta/reset-crawler" ] )
sqautils.wait_for_service("tomcat")
saw_exception = False
try:
ConnectorHelpers.invoke_script( [ "/usr/lib/metacarta/reset-crawler" ] )
except:
saw_exception = True
if saw_exception == False:
raise Exception("Running /usr/lib/metacarta/reset-crawler as non-root should have failed but didn't!")
# PHASE 0.1: See if the security on the database is OK
try:
ConnectorHelpers.invoke_root_script( [ "/usr/bin/psql", "--port", "5432", "-U", "metacarta", "-c", "\"SELECT * FROM jobs;\"" ], input="incorrect\n" )
succeeded = True
except:
succeeded = False
if succeeded:
raise Exception("Was able to talk with psql on port 5432 with incorrect password!")
print "Checking schema checker."
# PHASE 0.2: Try out check_system_health after mucking with the schema
ConnectorHelpers.shutdown_agents( )
schema_errors = gather_schema_errors( )
# Initially there should be no schema errors
if len(schema_errors) != 0:
raise Exception("Unexpected schema errors detected! %s" % schema_errors)
# For all the schema alterations, be sure to do it on a table that is only used by metacarta-agents!
# I've chosen intrinsiclink for this purpose. Its normal schema is:
#Column | Type | Modifiers
#--------------+------------------------+-----------
#isnew | character(1) |
#linktype | character varying(255) |
#childidhash | character varying(40) |
#parentidhash | character varying(40) | not null
#jobid | bigint | not null
#Indexes:
#"i1237996140680" UNIQUE, btree (jobid, linktype, parentidhash, childidhash)
#"i1237996140678" btree (jobid, childidhash, isnew)
#"i1237996140679" btree (jobid, parentidhash)
#Foreign-key constraints:
#"intrinsiclink_jobid_fkey" FOREIGN KEY (jobid) REFERENCES jobs(id) ON DELETE RESTRICT
# It *should* be empty, after a successful preclean
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
# Alter table to have an additional unexpected column
db.query("ALTER TABLE intrinsiclink ADD COLUMN foobar VARCHAR(20) NOT NULL")
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After adding a column, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
db.query("ALTER TABLE intrinsiclink DROP COLUMN foobar")
# Alter table to have a column substitution
db.query("ALTER TABLE intrinsiclink ADD COLUMN wasnew CHAR(1) NULL")
db.query("ALTER TABLE intrinsiclink DROP COLUMN isnew")
# Not only does this cause the loss of a column, but it also causes the loss of an index. So we see 2 errors...
schema_errors = gather_schema_errors( )
if len(schema_errors) != 2:
raise Exception("After substituting a column, expected 2 schema errors, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
db.query("ALTER TABLE intrinsiclink ADD COLUMN isnew CHAR(1) NULL")
db.query("ALTER TABLE intrinsiclink DROP COLUMN wasnew")
# At this point, we will have still lost the index on the isnew column, so we can test an index delete here
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After deleting an index, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Add an incorrect index
db.query("CREATE INDEX temporaryindex ON intrinsiclink (isnew)")
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After index substitution, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Add in the correct index again
db.query("CREATE INDEX i123 ON intrinsiclink (jobid,childidhash,isnew)")
# Still should get 1 schema error because we have one extra index
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("With addition index, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Drop the bad index
db.query("DROP INDEX temporaryindex")
# Now, schema test should be OK
schema_errors = gather_schema_errors( )
if len(schema_errors) != 0:
raise Exception("It looks like the test is screwed up; schema test failed after restoration")
finally:
db.close()
ConnectorHelpers.start_agents( )
# Create a standard GTS output connection
ConnectorHelpers.define_gts_outputconnection( )
print "Dump and restore empty configuration"
# Check what happens when we dump and restore an empty configuration (21045)
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != 0:
raise Exception("Expecting zero jobs, instead found %d" % len(job_list))
ConnectorHelpers.export_configuration( "test_crawl_1.conf" )
# There should be no connector-related configuration to blow away at this point!
#ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_1.conf" )
# Check that there are still zero jobs
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != 0:
raise Exception("Expecting zero jobs, instead found %d" % len(job_list))
print "Dump and restore configuration that has lots of jobs, with funky characters too"
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "OneDocumentTest", "One Document Test" )
job_name_list = {}
job_count = 50
for job_index in range(job_count):
# I never intend to actually crawl this, so it can be utterly screwy and that's OK
job_name = u"One Document Test Job \u00d8 %d" % job_index
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
job_name,
"OneDocumentTest",
[ ( "/root/crawlarea", [ ( "include", "file", "f007.txt" ), ( "include", "directory", "*" ) ] ) ] )
job_name_list[job_name] = job_name
ConnectorHelpers.export_configuration( "test_crawl_3.conf" )
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != job_count:
raise Exception("Expecting %d jobs, instead found %d" % (job_count,len(job_list)))
# Blow away the config
ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_3.conf" )
# Check that there are still 50 jobs
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != job_count:
raise Exception("Expecting %d jobs, instead found %d" % (job_count,len(job_list)))
# Now, check to see that these jobs adhere to specifications
for job_record in job_list:
job_id = job_record["identifier"]
job_name = job_record["description"]
if not job_name_list.has_key(job_name):
raise Exception(u"One of the restored jobs does not have a recognized name! %s" % job_name)
ConnectorHelpers.delete_job(job_id)
# Test performance hack; get all the deletes started and then wait for the deletes to all complete.
for job_record in job_list:
job_id = job_record["identifier"]
ConnectorHelpers.wait_job_deleted(job_id)
ConnectorHelpers.delete_repositoryconnection("OneDocumentTest")
print "Verify that the help link exists and seems correct."
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username, password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for help
window = vb.find_window("")
link = window.find_link("Help")
if link == None:
raise Exception("Could not find help link in UI navigation")
if link.url != "/documentation/ConnectorGuide.pdf":
raise Exception("The help link was wrong: Saw '%s'" % link.url)
print "Run two jobs with identical documents at the same time, and make sure we can restart metacarta-agents during this process."
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection",throttles=[("",None,"20000")] )
# Define job
job_id_1 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 1",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
job_id_2 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 2",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id_1 )
ConnectorHelpers.start_job_ui( username, password, job_id_2 )
# Immediately restart metacarta-agents. If this is broken, agents won't ever come back up, and the test will time out.
ConnectorHelpers.restart_agents()
ConnectorHelpers.wait_job_complete( job_id_1 )
ConnectorHelpers.wait_job_complete( job_id_2 )
# Get job status. Sometimes we get this far but the status is messed up (job is done but there's an active document still)
result = ConnectorHelpers.list_job_statuses_api()
if len(result) != 2:
raise Exception("Expected two jobs, found %d" % len(result))
if result[0]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[0]["status"])
if result[1]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[1]["status"])
if result[0]["outstanding"] != str(0):
raise Exception("Expected active documents to be 0, instead found '%s'" % result[0]["outstanding"])
if result[1]["outstanding"] != str(0):
raise Exception("Expected active documents to be 0, instead found '%s'" % result[1]["outstanding"])
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
# Clean up both jobs simultaneously too.
ConnectorHelpers.delete_job_ui( username, password, job_id_1 )
ConnectorHelpers.delete_job_ui( username, password, job_id_2 )
ConnectorHelpers.wait_job_deleted( job_id_1 )
ConnectorHelpers.wait_job_deleted( job_id_2 )
ConnectorHelpers.delete_repository_connection_ui( username, password, "FileSystem" )
print "Force the crawler to run out of memory, and see that it shuts down."
# Shutdown health checker first; otherwise we can get locks stuck, and this would be messy to clean up.
stop_health_checker()
# Wait long enough so we can be sure there are no outstanding connector-related health activities going on.
time.sleep(30)
old_heap_dumps = count_java_heap_dumps()
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# Give it some time to shut itself down
time.sleep(60)
# Verify that metacarta-agents is indeed gone
if ConnectorHelpers.find_daemon_pid( ) != None:
raise Exception("metacarta-agents should have aborted, but it's still running!")
# Clean up locks enough so that we don't die trying to abort the job.
ConnectorHelpers.shutdown_tomcat( )
ConnectorHelpers.invoke_script(["/usr/lib/metacarta/core-lockclean"])
# Now, abort the job. This must happen before reset-crawler, because otherwise we might just run out of memory again.
ConnectorHelpers.abort_job( job_id )
# It stopped. Now, we have to reset locks because the OOM may have messed them up. This will start services back up too.
ConnectorHelpers.invoke_root_script(["/usr/lib/metacarta/reset-crawler"])
# Start health checker
start_health_checker()
# The job should now abort properly
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
new_heap_dumps = count_java_heap_dumps()
if new_heap_dumps != old_heap_dumps + 1:
raise Exception("Expected there to be %d heap dumps, instead found %d" % (old_heap_dumps+1,new_heap_dumps))
print "Check that a delayed seeding phase does not permit job to abort until done."
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test", failure_mode="seedingdelay" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# When the above is clicked, the UI immediately gives feedback that the job is starting. Unfortunately you cannot tell the difference in the UI between the "READYFORSTARTUP" state and the
# "STARTINGUP" state. Aborts function differently in each state.
# To sidestep this issue, we wait for 20 seconds after we see the 'starting up' in the UI, in order to be pretty certain the job has entered the "STARTINGUP" state.
# Once truly in the "STARTINGUP" state, the job is guaranteed to stay in that state for at least 2 minutes.
while True:
start_began_time = time.time()
job_state = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if job_state == "Starting up":
time.sleep(20)
break
if job_state == "Running":
raise Exception("Test problem: saw 'running' state without seeing 'starting up' phase")
time.sleep(10)
# Now, abort the job
ConnectorHelpers.abort_job( job_id )
# The job should NOT abort right away!! Indeed, we should see the job stay in the "aborting" state for roughly two minutes. Since this is approximate, we'll wait
# only 60 seconds before checking the job state; it'd better not stop aborting by then!
time.sleep(60-(time.time()-start_began_time))
job_state = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if job_state != "Aborting":
raise Exception("Expected job to stay in the Aborting state for an extended period of time, when interrupted during startup phase")
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
print "Seeing whether broken pipe ingestion errors get handled correctly"
# We need a file that's at least large enough to cause packet transmission without a flush. The test documents are all too small by themselves - so build a big one....
f_out = open("/root/crawlarea/bigfile.txt","w")
try:
for iteration in range(10000):
f_in = open("/root/crawlarea/testfiles/f001.txt","r")
try:
for line in f_in.readlines():
f_out.write(line)
finally:
f_in.close()
finally:
f_out.close()
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test", failure_mode="ingestiondelay" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# If the code is working properly, broken pipe errors will be treated as 400's.
# If not, the job will retry documents indefinitely, and the test will fail for that reason.
ConnectorHelpers.wait_job_complete( job_id )
# Get rid of big file
os.unlink("/root/crawlarea/bigfile.txt")
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
print "Ingest using broken ingestion system."
# Set up an ingestion of exactly one document
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "OneDocumentTest", "One Document Test" )
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"One Document Test Job",
"OneDocumentTest",
[ ( "/root/crawlarea", [ ( "include", "file", "f007.txt" ), ( "include", "directory", "*" ) ] ) ] )
stop_health_checker()
ConnectorHelpers.stop_leafblower()
# Set up dummy ingest listener
response = ""
this_thread = startup_fake_ingestion_service(response)
print "Looking at debug info after nc-based fake ingestion service started"
dump_debug()
# Start the job. The job would normally run for many hours, because we would need to wait for the retries to give up, so I'm going to abort it after some short period of time.
ConnectorHelpers.start_job_ui( username, password, job_id )
time.sleep(60)
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# The connector framework should have logged a -2 error for the first ingest activity!
results = ConnectorHelpers.run_simple_history_report_api( "OneDocumentTest", [ "document ingest (GTS)" ] )
# There should have been one or more ingestion attempts
if len(results) == 0:
raise Exception("No ingestion attempts were reported! Expected at least one.")
# Check to be sure that at least one received a -2 error
saw_proper_error = False
for result in results:
if int(result["result_code"]) == -2:
saw_proper_error = True
break
if not saw_proper_error:
raise Exception("Did not see expected -2 error in ingest history results")
shutdown_fake_ingestion_service(this_thread,response)
print "Looking at debug info after nc-based fake ingestion service stopped"
dump_debug()
# Next, try an ingestion service that just returns 500 errors
response = ""
this_thread = startup_non_timeout_fake_ingestion_service(response)
print "Looking at debug info after homegrown fake ingestion service started"
dump_debug()
# Start the job, wait for a time, then abort it. This should run for a long time, and generate a few warnings in the log, which we'll check for later.
ConnectorHelpers.start_job_ui( username, password, job_id )
time.sleep(60)
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
shutdown_non_timeout_fake_ingestion_service(this_thread,response)
print "Looking at debug info after homegrown fake ingestion service stopped"
dump_debug()
# Check for at least one error in the log of form:
# "Error 500 from ingestion request; ingestion will be retried again later"
lines = ConnectorHelpers.read_metacarta_log( "Error 500 from ingestion request; ingestion will be retried again later", agents_log_pos, log_name="/var/log/metacarta/java-agents/agents.log" )
if len(lines) == 0:
raise Exception("Did not see expected ingestion request retry message in log!")
# Delete the job, without anything listening on 7031. This will cause a number of deleted documents to be queued.
ConnectorHelpers.delete_job( job_id )
for iteration in range(5):
# Wait a little while so they *do* get queued,
time.sleep(10)
# Now, stop metacarta-agents. This should replicate bug 29943.
ConnectorHelpers.shutdown_agents()
# Look for inconsistencies in the database
# We should never see rows in intrinsiclink that refer to non-existent jobs
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
bad_results = db.query("select count(*) as mycount from intrinsiclink t0 where not exists(select 'x' from jobs t1 where t0.jobid=t1.id)").dictresult()
found_count = None
for result in bad_results:
found_count = int(result["mycount"])
if found_count != 0:
raise Exception("Detected schema inconsistency! Job is gone, but %d rows in intrinsiclink table refer to it." % found_count)
finally:
db.close()
# No schema inconsistency: restart agents
ConnectorHelpers.start_agents()
# Startup time is at least 45 seconds, because the secure random number generator is invoked during startup if httpposter is invoked.
time.sleep(45)
# No problems detected: restart leafblower, and let job finish deleting
ConnectorHelpers.start_leafblower()
start_health_checker()
ConnectorHelpers.wait_job_deleted( job_id )
# Check once again that there are no dangling intrinsiclink rows!!
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
bad_results = db.query("select count(*) as mycount from intrinsiclink t0 where not exists(select 'x' from jobs t1 where t0.jobid=t1.id)").dictresult()
found_count = None
for result in bad_results:
found_count = int(result["mycount"])
if found_count != 0:
raise Exception("Detected schema inconsistency! Job is gone, but %d rows in intrinsiclink table refer to it." % found_count)
finally:
db.close()
ConnectorHelpers.delete_repository_connection_ui( username, password, "OneDocumentTest" )
print "Non-typical connection name test."
# PHASE 0.9: Try creating and removing some odd connection names. This is not definitive, as we must rely on the correctness of the virtual browser to get
# the correct results...
# ConnectorHelpers.define_repositoryconnection( "FileSystem",
# "FileSystem Connection",
# "com.metacarta.crawler.connectors.filesystem.FileConnector" )
# Do this via the UI
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "Test%2BTest", "Odd Connection" )
ConnectorHelpers.delete_repository_connection_ui( username, password, "Test%2BTest" )
# PHASE 1: Ingestion
print "Ingestion Test."
# Define repository connection
# ConnectorHelpers.define_repositoryconnection( "FileSystem",
# "FileSystem Connection",
# "com.metacarta.crawler.connectors.filesystem.FileConnector" )
# Do via the UI, with one stupid throttle (to test that part of the UI)
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection",throttles=[("",None,"20000")] )
# Spinner test to make sure we aren't leaking file descriptors from tomcat for the BPA callout.
# If the leak is present, 3 handles are leaked per iteration out of a maximum of 1024, so the loop below would start failing partway through.
for counter in range(1,1024):
resave_filesystem_repository_connection_ui( username, password, "FileSystem" )
# Define job
# doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
# job_id = ConnectorHelpers.define_job( "Test job",
# "FileSystem",
# doc_spec_xml )
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
# ConnectorHelpers.start_job( job_id )
ConnectorHelpers.start_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
# Check job definition and job status via API
result = ConnectorHelpers.list_jobs_api()
if len(result) != 1:
raise Exception("Expected one job, found %d" % len(result))
if result[0]["identifier"] != job_id:
raise Exception("Expected job identifier to be %s, instead found %s" % (job_id,result[0]["identifier"]))
if result[0]["description"] != "Test job":
raise Exception("Expected job description to be 'Test job', instead found '%s'" % result[0]["description"])
if result[0]["connection"] != "FileSystem":
raise Exception("Expected job connection to be 'FileSystem', instead found '%s'" % result[0]["connection"])
result = ConnectorHelpers.list_job_statuses_api()
if len(result) != 1:
raise Exception("Expected one job, found %d" % len(result))
if result[0]["identifier"] != job_id:
raise Exception("Expected job identifier to be %s, instead found %s" % (job_id,result[0]["identifier"]))
if result[0]["description"] != "Test job":
raise Exception("Expected job description to be 'Test job', instead found '%s'" % result[0]["description"])
if result[0]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[0]["status"])
# Success: done
print "Done ingestion test."
# PHASE 2: Document Change Detection
print "Document Change Test."
o = open( "/root/crawlarea/testfiles/f002.txt", "w" )
o.write("Now this document is at 50N 75E, and the keyword is platypus")
o.close()
o = open( "/root/crawlarea/testfiles/f004.txt", "w" )
o.write("No longer about drinking establishments at 23N 15W")
o.close()
# Added 7/21/2008: Set clock forward 18 months, and wait long enough so that all current Thread.sleep()'s (if present)
# will wake up, and go back to sleep.
ConnectorHelpers.set_clock_forward()
time.sleep(60)
# Restore the clock, because we should not be ACTIVELY doing anything
# with the daemon while the clock is wrong.
ConnectorHelpers.restore_clock()
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Look for state of index being right
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
print "Done Document Change Test."
# PHASE 3: Document Delete Detection
print "Document Delete Test."
os.remove( "/root/crawlarea/testfiles/f003.txt" )
os.remove( "/root/crawlarea/testfiles/f005.txt" )
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ ] )
ConnectorHelpers.search_check( [ "city" ], None, [ ] )
print "Done Document Delete Test."
# PHASE 4: Document Addition Detection
print "Document Add Test."
o = open( "/root/crawlarea/testfiles/f009.txt", "w" )
o.write("Now this document is at 50N 75E, and the keyword is albemarle")
o.close()
o = open( "/root/crawlarea/testfiles/f010.txt", "w" )
o.write("No longer about golfcarts at 23N 15W")
o.close()
o = open( "/root/crawlarea/testfiles/f011.txt", "w" )
o.write("------------\n")
o.write("No sodapop should show up for 12N 72W")
o.close()
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], None, [ "root/crawlarea/testfiles/f011.txt" ] )
print "Done Document Add Test."
# PHASE 4.5: Run all the reports via the API and check the results for being sensible
simple_result = ConnectorHelpers.run_simple_history_report_api( "FileSystem",
[ "job start", "job end" ] )
if len(simple_result) != 8:
raise Exception("Expected 8 job start/job end events, found %d" % len(simple_result))
max_bandwidth_result = ConnectorHelpers.run_max_bandwidth_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()" )
if len(max_bandwidth_result) != 1:
raise Exception("Expected 1 result row from bandwidth report, found %d" % len(max_bandwidth_result))
max_activity_result = ConnectorHelpers.run_max_activity_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()" )
if len(max_activity_result) != 1:
raise Exception("Expected 1 result row from activity report, found %d" % len(max_activity_result))
result_report = ConnectorHelpers.run_result_histogram_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()", result_bin_regexp="()" )
if len(result_report) != 1:
raise Exception("Expected 1 result row from result histogram report, found %d" % len(result_report))
document_status = ConnectorHelpers.run_document_status_api( "FileSystem",
[ job_id ] )
expected_queue_length = 15
if len(document_status) != expected_queue_length:
raise Exception("Expected %d documents in queue, found %d" % (expected_queue_length,len(document_status)))
queue_status = ConnectorHelpers.run_queue_status_api( "FileSystem",
[ job_id ], bucket_regexp="()" )
if len(queue_status) != 1:
raise Exception("Expected 1 result row from queue status report, found %d" % len(queue_status))
if int(queue_status[0]["inactive_count"]) != expected_queue_length:
raise Exception("Expected %d inactive queued documents, found %d" % (expected_queue_length,int(queue_status[0]["inactive_count"])))
# PHASE 5: Delete Job
print "Job Delete Test."
# ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.delete_job_ui( username, password, job_id )
print "...job delete request sent"
ConnectorHelpers.wait_job_deleted( job_id )
print "...job has vanished"
# Make sure the documents all went away
ConnectorHelpers.search_check( [ "reference" ], None, [] )
ConnectorHelpers.search_check( [ "good" ], None, [] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [] )
ConnectorHelpers.search_check( [ "pub" ], None, [] )
ConnectorHelpers.search_check( [ "city" ], None, [] )
ConnectorHelpers.search_check( [ "interesting" ], None, [] )
ConnectorHelpers.search_check( [ "smelly" ], None, [] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [] )
ConnectorHelpers.search_check( [ "sodapop" ], None, [] )
print "Done Job Delete Test."
# PHASE 6: Scheduled Ingestion
print "Scheduled Ingestion Test."
# Define job again
# doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Set the schedule. One minute is too short; might miss the window.
# We need to be sure we hit the window. We can sanity-check the timing, without having the test fail in obscure ways, by calculating the time interval ourselves.
min_scheduled_time_begin = time.time() + 120.0
# For the max time, give an additional minute's slop, because various CF threads don't run all the time
max_scheduled_time_begin = min_scheduled_time_begin + 60.0 + 60.0
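# In other words, the scheduled run is expected to begin somewhere in the window [now + 120s, now + 240s].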
# Sets the time for the test to run to the current time, plus 2 minutes. Since the current time might be (say) 3 m 59 s, and the execute time would then be
# 6 m, the actual interval may well be as little as 2 minutes.
ConnectorHelpers.set_scheduled_time( job_id, 3 )
# Dump the configuration
ConnectorHelpers.export_configuration( "test_crawl_2.conf" )
# Blow away all connector-related stuff
ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_2.conf" )
# Check to be sure we didn't miss the window!
if time.time() >= min_scheduled_time_begin:
raise Exception("Test invalid: Test setup exceeded limits, so scheduling won't fire")
# Everything should be back and working, as if we hadn't blown everything away and restored it. The only thing we must do is find the job_id, since it has changed.
job_id = ConnectorHelpers.find_job_by_name_ui( username, password, "Test job", "FileSystem" )
# Sleep until we are sure it should have fired
sleep_amt = max_scheduled_time_begin - time.time()
if sleep_amt > 0:
time.sleep(sleep_amt)
# Wait for job inactive
ConnectorHelpers.wait_job_complete( job_id )
# Make sure we can find our stuff
ConnectorHelpers.wait_for_ingest( )
# Look for state of index being right
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
# Try out the API call for getting the schedule
result = ConnectorHelpers.get_job_schedule_api( job_id )
if len(result) != 1:
raise Exception("Expected one schedule record, instead found %d" % len(result))
result[0]["daysofweek"]
result[0]["years"]
result[0]["months"]
result[0]["days"]
result[0]["hours"]
result[0]["minutes"]
result[0]["timezone"]
result[0]["duration"]
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 7: Time Window Ingestion
print "Time Window Ingestion Test."
# This test requires enough documents to keep crawler busy for >1 minute,
# which I don't have yet - so skip for now. MHL
print "Done Time Window Ingestion Test."
# PHASE 7.1: Check that pathological situations in scheduler don't mess us up.
# Stop metacarta-agents
ConnectorHelpers.shutdown_agents()
# Create the job
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Set up the job schedule: run in Jan,Feb,Mar,April,May,June,July,Aug,Sept,Oct,Nov,Dec, but don't set any other info
ConnectorHelpers.invoke_crawler_command( "com.metacarta.crawler.AddScheduledTime", argument_list=[ job_id,
"",
"",
"",
"january,february,march,april,may,june,july,august,september,october,november,december",
"",
"",
"" ] )
# Screw with the 'last job run' timestamp in the database, to set it to a
# known magic value, e.g. 12:00AM June 29, 2009 GMT: 1246233600000
# This time is carefully picked because it must be greater than the 28th of the month, and yet after we've advanced to midnight
# we must still be in the same month; the next advance will thus be to go by days towards the first of the next month, which is
# what would fail.
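# For reference, the magic value can be reproduced (a sketch, assuming a stock Python 2
# interpreter with the standard calendar module):
#   import calendar
#   calendar.timegm((2009, 6, 29, 0, 0, 0)) * 1000   # -> 1246233600000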
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
db.query("update jobs set lastchecktime=1246233600000 where id=%s" % job_id)
finally:
db.close()
# Start metacarta-agents
ConnectorHelpers.start_agents()
# Wait until we're sure the scheduler has had a chance to look at the record in question
time.sleep(30)
# Try to shut down metacarta-agents; it should succeed if fixed; otherwise it will time out.
ConnectorHelpers.shutdown_agents()
# Now, clean up job
ConnectorHelpers.start_agents()
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 8: Crawl from seeds
print "Crawl From Seeds Test."
# define sample job with two sets of seeds
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint><startpoint path="/root/crawlarea2"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username, password, "Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ), ( "/root/crawlarea2", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Now, crawl
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Modify document specification to remove testfiles2 area
doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea/testfiles"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
ConnectorHelpers.change_job_doc_spec( job_id, doc_spec_xml )
# Rerun
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
print "Done Crawl From Seeds Test."
# PHASE 9: Used to be "crawl everything crawled before", but that's no longer meaningful, since
# it's now the connector that determines how the crawler behaves.
# PHASE 10: Adaptive crawling test
print "Adaptive Crawl Test."
# define sample job with two sets of seeds
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint><startpoint path="/root/crawlarea2"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username, password, "Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ), ( "/root/crawlarea2", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
type="continuous",
recrawlinterval=2 )
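# Note: type="continuous" keeps the job running indefinitely; recrawlinterval (2 here, presumably in minutes) controls how soon
# documents become eligible for re-fetch, so the file modified below should be re-ingested within a couple of minutes.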
# Now, crawl
ConnectorHelpers.start_job( job_id )
# Job will not end, so we simply need to wait one minute.
time.sleep(1 * 60)
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Now, change a document and see if the recrawl happens
o = open( "/root/crawlarea/testfiles/f002.txt", "w" )
o.write("Now this document is at 49N 75E, and the keyword is castle")
o.close()
# Simply wait to see if the reingest occurs (it should after about 1 min)
time.sleep(2 * 60)
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [] )
ConnectorHelpers.search_check( [ "castle" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 11: Ingest into collection test
print "Collection Ingestion Test (also with unregistered connector)."
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
collection_name="Zena",
document_template=document_template_text )
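# document_template_text (defined earlier in this script) is expected to eliminate at least one document during ingestion;
# the empty "sodapop" search below is the check for that.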
# Unregister the file system connector, and restart the services. This should not prevent us from starting the job! But when the job starts it should do
# nothing until we reregister the connector.
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# Recycle the services to be sure there are no already-created handles lying around
ConnectorHelpers.restart_tomcat()
ConnectorHelpers.restart_agents()
time.sleep(60)
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id )
# Wait a while. The job should start, but go nowhere.
time.sleep(60)
# Check the status in the UI, by fetching the job status page
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Starting up":
raise Exception("Expected to see 'Starting up' status, but saw '%s'" % jobstatus)
# OK, reregister the connector now. This should cause the job to wake up and actually start. We have to be quick, though, so that we can deregister it again at exactly the right moment.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
# Wait for the job to leave the "Starting up" state; deregister the connector again the moment that happens
it_started = False
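# Poll the UI job status once per second, for up to 30 seconds, until the job reports "Running".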
for retry in range(30):
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus == "Running":
it_started = True
break
if jobstatus != "Starting up":
raise Exception( "Expecting job to start, but wound up with status '%s' instead" % jobstatus )
time.sleep(1)
if not it_started:
raise Exception( "Job did not start as expected when connector was reregistered" )
# Deregister the connector again
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# We should have immediately entered the "Running, no connector" state
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Running, no connector":
raise Exception("Expected to see 'Running, no connector' status, but saw '%s'" % jobstatus)
# Pause the job
ConnectorHelpers.pause_job_ui( username, password, job_id )
# Check the status to see if we indeed paused.
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Paused":
raise Exception("Expected to see 'Paused' status, but saw '%s'" % jobstatus)
# Resume the job
ConnectorHelpers.resume_job_ui( username, password, job_id )
# Check the status to see if we indeed resumed.
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Running, no connector":
raise Exception("Expected to see 'Running, no connector' status, but saw '%s'" % jobstatus)
# OK, reregister the connector once again now. This should cause the job to wake up and finish.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested, except for the one that should have been eliminated by the document template
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Create a second job that ingests the same documents using a different collection (part of the test for 24171)
job_id_2 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 2",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
collection_name="Boomer",
document_template=document_template_text )
# Run this job to completion. It should replace all the collections with the new one...
ConnectorHelpers.start_job( job_id_2 )
ConnectorHelpers.wait_job_complete( job_id_2 )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Confirm that all the documents have been reingested, but into the new collection.
ConnectorHelpers.search_check( [ "reference" ], "Boomer", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Boomer", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Boomer", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Boomer", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Boomer", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Boomer", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Boomer", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Boomer", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Boomer", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Boomer", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Boomer", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Now, delete the second job. This should *not* delete the documents, because they are shared with the first job; the "Boomer"
# collection should still have documents in it.
ConnectorHelpers.delete_job( job_id_2 )
ConnectorHelpers.wait_job_deleted( job_id_2 )
ConnectorHelpers.search_check( [ "reference" ], "Boomer", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Boomer", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Boomer", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Boomer", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Boomer", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Boomer", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Boomer", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Boomer", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Boomer", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Boomer", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Boomer", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Rerun the *first* job. This should detect the fact that the collection needs to change, and the documents should thus be updated.
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# All should have reverted to how it was prior to job_2 being introduced!
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# For added fun, make sure we can delete a job when the connector has been removed!
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# Recycle the services to be sure there are no already-created handles lying around
ConnectorHelpers.restart_tomcat()
ConnectorHelpers.restart_agents()
time.sleep(60)
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# Job cleanup should leave nothing around
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ ] )
print "Done Collection Ingestion Test."
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
# OK, reregister the connector now. This should have no other effect than getting the system back to normal.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
# PHASE 12: Throttling and report generation test
# With this test, we will establish a fetch-rate throttle, and then crawl with it to verify that the crawler obeys the average fetch rate restrictions.
# For this to work, the file system connector must also provide a "fetch" activity that we can run reports against.
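# Each throttle tuple below is (bin regexp, description, rate); the empty regexp presumably matches every bin, limiting all
# fetches to an average of two per minute.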
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection", throttles=[ ("", "Limit fetch rate to two per minute", "2") ] )
# Create and run the job. The job should take about 5 minutes to run, given the throttle settings.
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Now, delete the job
ConnectorHelpers.delete_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# Run some history reports from the UI. These reports should confirm an average read rate of 2 documents per minute, and should contain all expected events.
simple_results = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem", [ "read document" ] )
if len(simple_results) != 15:
raise Exception("Expecting 15 simple report result rows; got %d" % len(simple_results))
max_activity_results = ConnectorHelpers.run_max_activity_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()" )
if len(max_activity_results) != 1:
raise Exception("Expecting 1 row in max activity report; got %d" % len(max_activity_results))
rate_column = float(max_activity_results[0]["Highest Activity Rate [per min]"])
if rate_column > 3.0:
raise Exception("Maximum fetch rate exceeded the 1-sigma limit of 3.0 documents per minute; got %f" % rate_column)
max_bandwidth_results = ConnectorHelpers.run_max_bandwidth_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()" )
if len(max_bandwidth_results) != 1:
raise Exception("Expecting 1 row in max bandwidth report; got %d" % len(max_bandwidth_results))
result_histogram = ConnectorHelpers.run_result_histogram_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()", result_bin_regexp="(.*)" )
if len(result_histogram) != 1:
raise Exception("Expecting 1 row from result histogram; got %d" % len(result_histogram))
if result_histogram[0]["Result Class"] != "ok":
raise Exception("Expected only 'ok' results, got '%s'" % result_histogram[0]["Result Class"])
if result_histogram[0]["Event Count"] != "15":
raise Exception("Expected EventCount to be 15, was %s" % result_histogram[0]["Event Count"])
# We need to make sure that a report screen where a connection has already been chosen still behaves sensibly after that connection goes away.
# Open up a virtual browser window accordingly, and set up the virtual browser instance.
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
vb.load_main_window( "http://localhost/crawler/index.jsp" )
window = vb.find_window("")
link = window.find_link("Simple history")
link.click( )
window = vb.find_window("")
form = window.find_form("report")
form.find_selectbox("reportconnection").select_value( "FileSystem" )
window.find_button("Continue").click( )
window = vb.find_window("")
form = window.find_form("report")
activities_select = form.find_selectbox("reportactivities")
activities_select.multi_select_value( "read document" )
# Fire off the query
window.find_button("Execute this query").click( )
# Make sure we could fire off the query again if we wanted
window = vb.find_window("")
form = window.find_form("report")
go_button = window.find_button("Execute this query")
# Remove the connection.
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
# Make sure when we press "Go", something reasonable happens
go_button.click()
window = vb.find_window("")
# Report form should be present
form = window.find_form("report")
# Go button should be gone, but we should have a "continue" button back...
window.find_button("Continue")
print "Done Report Tests."
print "Spin to detect deadlock condition"
# This uses the crawl configuration saved during the scheduling test. We really don't care much about it beyond the queries that will be fired
# off, so it's fine to rerun it as long as we use ResetAll to clean up whatever configuration garbage is left around at the end.
run_lock_spinner( "test_crawl_2.conf" )
ConnectorHelpers.reset_all()
ConnectorHelpers.define_gts_outputconnection( )
# Next phase: Run postgresql maintenance script
print "Running maintenance script"
ConnectorHelpers.run_maintenance()
print "Done with maintenance script test"
print "Testing for error page script injection"
response = ConnectorHelpers.invoke_curl( "http://localhost/crawler/error.jsp?text=%3Cscript%3Ealert(%27test%27)%3C/script%3E&target=%27%3E%3Cscript%3Ealert(%27test%27)%3C/script%3E", user_name=username, password=password )
if response.find("<script>alert('test')</script>") != -1:
raise Exception("Script injection seems to have taken place into error.jsp! Response = %s" % response)
print "Done with error page injection test"
# Delete standard GTS output
ConnectorHelpers.delete_gts_outputconnection( )
ConnectorHelpers.delete_crawler_user( username )
delete_folder("/root/crawlarea")
sqatools.LicenseMakerClient.revoke_license()
ConnectorHelpers.teardown_connector_environment( )
# Last check: Be sure that there are no errors in the metacarta log due to postgresql connections being dropped
print "Checking log for postgresql EOF errors"
lines = ConnectorHelpers.read_metacarta_log( "unexpected EOF on client connection", log_pos )
if len(lines) > 0:
raise Exception("Found %d EOF errors in postgresql log output!" % len(lines) )
print "Testing check-system-health when database is down"
stop_database()
# Run system_health_check repeatedly until we no longer see the 'skipping' message about authorities
while True:
if system_health_check( ):
break
start_database()
ConnectorHelpers.start_agents( )
print "Basic ConnectorFramework tests PASSED"