#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pg
import re
import os
import sys
import time
import socket
import shutil
import urllib2
import traceback
import ConnectorHelpers
import VirtualBrowser
from threading import Thread
import sqatools.LicenseMakerClient
from sqatools import sqautils
sys.path.append("/usr/lib/metacarta")
import MetaCartaVersion
# Dump output of lsof into log
def dump_debug():
print "Starting dump of lsof"
ConnectorHelpers.invoke_root_script( [ "lsof" ] )
print "Starting dump of ps -aux"
ConnectorHelpers.invoke_root_script( [ "ps", "-aux" ] )
print "Done dump"
# Copy a folder to a (new) area
def copy_folder( source, target ):
ConnectorHelpers.invoke_root_script( [ "mkdir", "-p", target ] )
ConnectorHelpers.invoke_root_script( [ "cp", "-r", source, target ] )
# Remove a folder
def delete_folder( target ):
ConnectorHelpers.invoke_root_script( [ "rm", "-rf", target ] )
# Run the spinner for looking for database/external lock deadlocks
def run_lock_spinner( import_file_name ):
ConnectorHelpers.invoke_crawler_command( "com.metacarta.crawler.JobStartSpinner", argument_list=[ ConnectorHelpers.process_argument(import_file_name) ] )
# Extract a password from a conf file
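# (The file is expected to contain a line of the form 'com.metacarta.ingest.password=<value>'.)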
def extract_password(file_name):
fd = open(file_name,"r")
try:
for line in fd.readlines():
index = line.find("com.metacarta.ingest.password=")
if index == 0:
# Strip off the newline at the end
return line[len("com.metacarta.ingest.password="):len(line)-1]
finally:
fd.close()
raise Exception("Password not found!")
class run_ingestion_test_server_thread(Thread):
def __init__ (self, response):
Thread.__init__(self)
self.response = response
self.setDaemon(True)
def run(self):
try:
# Start the ingestion test server
# Warning: This will hang until shut down!
while True:
output = ConnectorHelpers.invoke_root_script( ["nc", "-l", "localhost", "7031"] )
# Because system health checker doesn't die when advertised, we *may* need to restart nc
if output.find("MetaCarta-Verbose-Response:") == -1:
break
except Exception, e:
self.response += str(e)
# Disable the ability of the appliance to receive data from localhost port 7031
def startup_fake_ingestion_service(response):
""" Use nc to simulate a busted ingestion system """
# Start nc, by invoking in a different thread
thread = run_ingestion_test_server_thread(response)
thread.start()
# Sleep until we think nc is listening
time.sleep(20)
print "Ingestion test server successfully started up"
return thread
# Re-enable the ability of the appliance to receive data from localhost port 7031
def shutdown_fake_ingestion_service(thread=None, response=None):
""" Undo changes from disable """
# Send a shutdown signal. If this is nc listening, it should cause it to exit but not respond. If nc has already exited, it should timeout and get an exception.
# Otherwise, there should be a real response
try:
socket.setdefaulttimeout(10)
ConnectorHelpers.invoke_curl("http://localhost:7031/services/HTTPIngest/?STATUS")
socket.setdefaulttimeout(1000000)
except:
# Must already have been shut down; continue
pass
socket.setdefaulttimeout(1000000)
print "Ingest system fakeout successfully shut down"
# If there's a thread we know about, let it exit, and report any errors
if thread:
thread.join()
if response:
if len(response) > 0:
raise Exception("Ingestion fakeout server had problems: %s" % response)
# This class runs the second kind of ingestion fakeout we try, which returns an HTTP error status (500 by default) on every request instead of timing out
class run_non_timeout_ingestion_test_server_thread(Thread):
def __init__ (self, response, mode):
Thread.__init__(self)
self.response = response
self.mode = mode
self.setDaemon(True)
def run(self):
try:
# Start the ingestion test server
# Warning: This will hang until shut down!
ConnectorHelpers.invoke_root_script( ["python", "ingestion_fakeout_server.py", self.mode] )
except Exception, e:
self.response += str(e)
# Run the real (non-timeout) fakeout service
def startup_non_timeout_fake_ingestion_service(response,mode="500"):
""" Use our own script to simulate a busted ingestion system that returns 500 on every request """
# Start the fakeout server, by invoking it in a different thread
thread = run_non_timeout_ingestion_test_server_thread(response,mode)
thread.start()
# Loop until we think server is listening
while True:
error_seen = True
try:
ConnectorHelpers.invoke_curl("http://localhost:7031/checkalive")
error_seen = False
except:
pass
if error_seen:
time.sleep(1)
else:
break
print "Non-timeout ingestion test server successfully started up"
return thread
# Re-enable the ability of the appliance to receive data from localhost port 7031
def shutdown_non_timeout_fake_ingestion_service(thread=None, response=None):
""" Send shutdown signal, and wait for system to exit """
# Send a shutdown signal. If the fakeout server is still listening, this should cause it to exit. If it has
# already exited, the request should time out and we'll get an exception, which we ignore.
try:
socket.setdefaulttimeout(10)
ConnectorHelpers.invoke_curl("http://localhost:7031/shutdown")
socket.setdefaulttimeout(1000000)
except:
# Must already have been shut down; continue
pass
socket.setdefaulttimeout(1000000)
print "Non-timeout ingest system fakeout successfully shut down"
# If there's a thread we know about, let it exit, and report any errors
if thread:
thread.join()
if response:
if len(response) > 0:
raise Exception("Non-timeout ingestion fakeout server had problems: %s" % response)
# Stop health checkers
def stop_health_checker():
ConnectorHelpers.invoke_root_script( [ "/etc/init.d/system_health_monitor", "stop" ] )
# Health monitor doesn't really stop in synch with the above, so wait a while to be sure
time.sleep(60)
# Start health checkers
def start_health_checker():
ConnectorHelpers.invoke_root_script( [ "/etc/init.d/system_health_monitor", "start" ] )
# Edit file system repository connection via the UI (for BPA spinner test)
def resave_filesystem_repository_connection_ui( username, password, connection_name ):
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for repository connection management and click it
window = vb.find_window("")
link = window.find_link("List repository connections")
link.click( )
# Now, find the delete link for this connection
window = vb.find_window("")
link = window.find_link("Edit "+connection_name)
link.click( )
# Find the "save" button
window = vb.find_window("")
save_button = window.find_button("Save this connection")
save_button.click( )
# See if the connector saved
window = vb.find_window("")
found_connection_name = window.find_match("<!--connection=(.*)-->",1)
if found_connection_name != connection_name:
raise Exception("Edited connection doesn't match")
# Crawl user credentials
username = "testingest"
password = "testingest"
# A document template we can use to verify that document templating works
document_template_text = '<template>\n' + \
'<filter tagger_name="geo">\n' + \
'<end_regex>-{10}</end_regex>\n' + \
'</filter>\n' + \
'</template>\n'
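# The template defines a single filter for the "geo" tagger whose end_regex is ten dashes ("-{10}");
# it is meant to be passed to the job-definition helpers via their document_template argument.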
def count_java_heap_dumps( ):
""" Count the number of heap dumps in the /common/metacarta/java-heap-dumps directory """
results = ConnectorHelpers.invoke_root_script( [ "ls","-1","/common/metacarta/java-heap-dumps" ] )
return len(results.splitlines())
def start_database( ):
""" Start the database """
ConnectorHelpers.invoke_root_script(["/etc/init.d/postgresql-8.3", "start"])
time.sleep(15)
def stop_database( ):
""" Stop the database """
ConnectorHelpers.invoke_root_script(["/etc/init.d/postgresql-8.3", "stop"])
def system_health_check( ):
""" Return True if the authority status comes out as a one line error, or
False if it comes out as "skipping", or an exception if anything else.
"""
text = ConnectorHelpers.invoke_root_script(["/usr/bin/check_system_health"], allow_errors=True)
for line in text.splitlines():
if line.find("Authority check already in progress") != -1:
return False
if line.find("Exception checking on authorities") != -1:
if line.find("[Error getting connection]") != -1:
return True
raise Exception("Expected check_system_health to return a single-line status for authority checks, instead saw %s" % text)
signatures = [ "Schema upgrade in progress",
"Schema incorrect for table",
"Extra field for table",
"Field definition incorrect for table",
"Indexes incorrect for table",
"Index definition incorrect for table",
"Unexpected index definition for table",
"Index definition for table" ]
def gather_schema_errors( ):
""" Return a list of errors that seem to be schema-related """
rval = []
text = ConnectorHelpers.invoke_root_script(["/usr/bin/check_system_health"], allow_errors=True)
for line in text.splitlines():
# Look for the pertinent signatures in this line
for signature in signatures:
if line.find(signature) != -1:
rval += [ line ]
break
return rval
# Create an outofmemory repository connection via the UI
def define_outofmemory_repository_connection_ui( username, password, connection_name, connection_description,
throttles=None,
max_connections=None,
failure_mode=None):
""" The throttles argument is an array of tuples. Each tuple represents a throttle and is of the form (regexp,description,avg-fetch-rate).
"""
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for repository connection management and click it
window = vb.find_window("")
link = window.find_link("List repository connections")
link.click( )
# Click "add a connection"
window = vb.find_window("")
link = window.find_link("Add a connection")
link.click( )
# Find the right form elements and set them
window = vb.find_window("")
form = window.find_form("editconnection")
# "Name" tab
namefield = form.find_textarea("connname")
descriptionfield = form.find_textarea("description")
namefield.set_value( connection_name )
descriptionfield.set_value( connection_description )
# "Type" tab
link = window.find_link("Type tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
connectortypefield = form.find_selectbox("classname")
connectortypefield.select_value( "com.metacarta.crawler.connectors.outofmemory.OutOfMemoryConnector" )
# Click the "Continue" button
continue_button = window.find_button("Continue to next page")
continue_button.click( )
window = vb.find_window("")
# "Throttling" tab
link = window.find_link("Throttling tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if throttles != None:
for throttle in throttles:
regexp,description,rate = throttle
# Add a throttle with the specified parameters
regexpfield = form.find_textarea("throttle")
descfield = form.find_textarea("throttledesc")
valuefield = form.find_textarea("throttlevalue")
regexpfield.set_value( regexp )
if description != None:
descfield.set_value( description )
valuefield.set_value( str(rate) )
add_button = window.find_button("Add throttle")
add_button.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if max_connections != None:
form.find_textarea("maxconnections").set_value( str(max_connections) )
# "Failure Mode" tab
link = window.find_link("Failure Mode tab")
link.click()
window = vb.find_window("")
form = window.find_form("editconnection")
if failure_mode != None:
form.find_selectbox("failuremode").select_value(failure_mode)
# Now, save this page
save_button = window.find_button("Save this connection")
save_button.click( )
# See if the connector saved
window = vb.find_window("")
found_connection_name = window.find_match("<!--connection=(.*)-->",1)
if found_connection_name != connection_name:
raise Exception("Created connection doesn't match")
# Define a standard job using the UI
def define_outofmemory_job_ui( username,
password,
job_name,
connection_name,
startpoints_and_matches,
collection_name=None,
document_template=None,
hop_filters=None,
hop_mode=None,
type="specified",
startmethod="windowbegin",
recrawlinterval=0 ):
"""connection_name is the name of the filesystem connection. startpoints_and_matches
is an array, each element of which is a tuple. The tuple consists of the start point
path, and an array of match specifications. Each match specification is a tuple
consisting of a string (either "include" or "exclude"), a type (either "file" or "directory"),
and a match value (such as "*.txt").
Legal values for type are: "specified" or "continuous"
Legal values for start method are: "windowbegin", "windowinside", or "disable".
Hop filters are an array of tuples, each one ( filter_name, filter_value ).
Hop mode has the legal values "accurate", "nodelete", or "neverdelete".
"""
# We should be able to use the filesystem connector job creation UI here
return ConnectorHelpers.define_filesystem_job_ui(username,password,job_name,connection_name,
startpoints_and_matches,collection_name=collection_name,document_template=document_template,
hop_filters=hop_filters,hop_mode=hop_mode,type=type,startmethod=startmethod,recrawlinterval=recrawlinterval)
def preclean( print_errors=True ):
''' Clean up everything we might have done during the execution of this test.
This will include all jobs and ingested documents. '''
# Restore ingestion system
try:
shutdown_fake_ingestion_service()
except Exception, e:
if print_errors:
print "Error restoring ingestion system"
print e
try:
shutdown_non_timeout_fake_ingestion_service()
except Exception, e:
if print_errors:
print "Error restoring ingestion system"
print e
try:
ConnectorHelpers.start_leafblower()
except Exception, e:
if print_errors:
print "Error starting leafblower"
print e
try:
start_health_checker()
except Exception, e:
if print_errors:
print "Error starting health checker"
print e
# Set clock back to actual time, if needed
try:
ConnectorHelpers.restore_clock()
except Exception, e:
if print_errors:
print "Error restoring clock"
print e
# Start database if it is stopped
try:
start_database( )
except Exception, e:
if print_errors:
print "Error starting database"
print e
# Restore schema if it has been altered
print "Restoring schema."
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
# First, get hold of the column definitions for intrinsiclink
schema_query = "SELECT pg_attribute.attname AS field_col," + \
"CASE pg_type.typname WHEN 'int2' THEN 'smallint' WHEN 'int4' THEN 'int'" + \
" WHEN 'int8' THEN 'bigint' WHEN 'varchar' THEN 'varchar(' || pg_attribute.atttypmod-4 || ')'" + \
" WHEN 'float8' THEN 'double'" + \
" WHEN 'text' THEN 'longtext'" + \
" WHEN 'bpchar' THEN 'char(' || pg_attribute.atttypmod-4 || ')'" + \
" ELSE pg_type.typname END AS type_col," + \
"CASE WHEN pg_attribute.attnotnull THEN 'no' ELSE 'yes' END AS null_col," + \
"CASE pg_type.typname WHEN 'varchar' THEN substring(pg_attrdef.adsrc from '^(.*).*$') ELSE pg_attrdef.adsrc END AS Default " + \
"FROM pg_class INNER JOIN pg_attribute ON (pg_class.oid=pg_attribute.attrelid) INNER JOIN pg_type ON (pg_attribute.atttypid=pg_type.oid) " + \
"LEFT JOIN pg_attrdef ON (pg_class.oid=pg_attrdef.adrelid AND pg_attribute.attnum=pg_attrdef.adnum) " + \
"WHERE pg_class.relname='%s' AND pg_attribute.attnum>=1 AND NOT pg_attribute.attisdropped " + \
"ORDER BY pg_attribute.attnum"
schema_results = db.query(schema_query % "intrinsiclink").dictresult()
seen_isnew = False
for row in schema_results:
field_name = row["field_col"]
if field_name == "isnew":
seen_isnew = True
elif field_name == "wasnew":
# Delete this column!
db.query("ALTER TABLE intrinsiclink DROP COLUMN wasnew")
if seen_isnew == False:
# Create isnew column
db.query("ALTER TABLE intrinsiclink ADD COLUMN isnew CHAR(1) NULL")
index_query = "SELECT pg_catalog.pg_get_indexdef(i.indexrelid, 0, true) AS indexdef FROM pg_catalog.pg_class c, pg_catalog.pg_class c2, pg_catalog.pg_index i " + \
"WHERE c.relname = '%s' AND c.oid = i.indrelid AND i.indexrelid = c2.oid"
index_results = db.query(index_query % "intrinsiclink").dictresult()
seen_dropindex = False
for definition in index_results:
indexdef = definition["indexdef"]
if indexdef.find("(jobid, childidhash, isnew)") != -1:
seen_dropindex = True
elif indexdef.find("(isnew)") != -1 and indexdef.find("temporaryindex") != -1:
# Drop this index
db.query("DROP INDEX temporaryindex")
if seen_dropindex == False:
# Recreate missing index
db.query("CREATE INDEX i123 ON intrinsiclink (jobid,childidhash,isnew)")
finally:
db.close()
print "Done restoring schema"
# Start agents if it is down
try:
ConnectorHelpers.start_agents()
except Exception, e:
if print_errors:
print "Error starting agents service"
print e
try:
ConnectorHelpers.reset_all()
except Exception, e:
if print_errors:
print "Error resetting all jobs"
print e
# Remove saved crawl configuration files, if any
for file in [ "test_crawl_1.conf", "test_crawl_2.conf", "test_crawl_3.conf" ]:
try:
os.unlink( file )
except Exception, e:
if print_errors:
print "Error removing %s" % file
print e
# Remove test documents first
for folder in [ "/root/crawlarea", "/root/crawlarea2" ]:
try:
delete_folder( folder )
except Exception, e:
if print_errors:
print "Error removing %s" % folder
print e
try:
sqatools.LicenseMakerClient.revoke_license()
except Exception, e:
if print_errors:
print "Error revoking license"
print e
try:
ConnectorHelpers.delete_crawler_user( username )
except Exception, e:
if print_errors:
print "Error removing crawler user"
print e
try:
ConnectorHelpers.teardown_connector_environment( )
except Exception, e:
if print_errors:
print "Error cleaning up debs"
print e
try:
# Since one of the tests deregisters the filesystem connector, reregister it here to be sure it exists.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
except Exception, e:
if print_errors:
print "Error reregistering file system connector"
print e
# Main
if __name__ == '__main__':
print "Precleaning!"
preclean( print_errors=False )
print "Clearing metacarta logs"
log_pos = ConnectorHelpers.get_metacarta_log_pos( )
agents_log_pos = ConnectorHelpers.get_metacarta_log_pos( log_name="/var/log/metacarta/java-agents/agents.log" )
print "Setup Connector Environment."
ConnectorHelpers.setup_connector_environment()
print "Setting up file area."
copy_folder("/root/testfiles","/root/crawlarea")
copy_folder("/root/testfiles2","/root/crawlarea")
ConnectorHelpers.create_crawler_user( username, password )
# PHASE 0: Checking whether reset-crawler script seems to work
print "Trying reset-crawler command..."
ConnectorHelpers.invoke_root_script( [ "/usr/lib/metacarta/reset-crawler" ] )
sqautils.wait_for_service("tomcat")
saw_exception = False
try:
ConnectorHelpers.invoke_script( [ "/usr/lib/metacarta/reset-crawler" ] )
except:
saw_exception = True
if saw_exception == False:
raise Exception("Running /usr/lib/metacarta/reset-crawler as non-root should have failed but didn't!")
# PHASE 0.1: See if the security on the database is OK
try:
ConnectorHelpers.invoke_root_script( [ "/usr/bin/psql", "--port", "5432", "-U", "metacarta", "-c", "\"SELECT * FROM jobs;\"" ], input="incorrect\n" )
succeeded = True
except:
succeeded = False
if succeeded:
raise Exception("Was able to talk with psql on port 5432 with incorrect password!")
print "Checking schema checker."
# PHASE 0.2: Try out check_system_health after mucking with the schema
ConnectorHelpers.shutdown_agents( )
schema_errors = gather_schema_errors( )
# Initially there should be no schema errors
if len(schema_errors) != 0:
raise Exception("Unexpected schema errors detected! %s" % schema_errors)
# For all the schema alterations, be sure to do it on a table that is only used by metacarta-agents!
# I've chosen intrinsiclink for this purpose. Its normal schema is:
#Column | Type | Modifiers
#--------------+------------------------+-----------
#isnew | character(1) |
#linktype | character varying(255) |
#childidhash | character varying(40) |
#parentidhash | character varying(40) | not null
#jobid | bigint | not null
#Indexes:
#"i1237996140680" UNIQUE, btree (jobid, linktype, parentidhash, childidhash)
#"i1237996140678" btree (jobid, childidhash, isnew)
#"i1237996140679" btree (jobid, parentidhash)
#Foreign-key constraints:
#"intrinsiclink_jobid_fkey" FOREIGN KEY (jobid) REFERENCES jobs(id) ON DELETE RESTRICT
# It *should* be empty, after a successful preclean
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
# Alter table to have an additional unexpected column
db.query("ALTER TABLE intrinsiclink ADD COLUMN foobar VARCHAR(20) NOT NULL")
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After adding a column, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
db.query("ALTER TABLE intrinsiclink DROP COLUMN foobar")
# Alter table to have a column substitution
db.query("ALTER TABLE intrinsiclink ADD COLUMN wasnew CHAR(1) NULL")
db.query("ALTER TABLE intrinsiclink DROP COLUMN isnew")
# Not only does this cause the loss of a column, but it also causes the loss of an index. So we see 2 errors...
schema_errors = gather_schema_errors( )
if len(schema_errors) != 2:
raise Exception("After substituting a column, expected 2 schema errors, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
db.query("ALTER TABLE intrinsiclink ADD COLUMN isnew CHAR(1) NULL")
db.query("ALTER TABLE intrinsiclink DROP COLUMN wasnew")
# At this point, we will have still lost the index on the isnew column, so we can test an index delete here
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After deleting an index, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Add an incorrect index
db.query("CREATE INDEX temporaryindex ON intrinsiclink (isnew)")
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("After index substitution, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Add in the correct index again
db.query("CREATE INDEX i123 ON intrinsiclink (jobid,childidhash,isnew)")
# Still should get 1 schema error because we have one extra index
schema_errors = gather_schema_errors( )
if len(schema_errors) != 1:
raise Exception("With addition index, expected 1 schema error, instead saw %d: %s" % (len(schema_errors),str(schema_errors)))
# Drop the bad index
db.query("DROP INDEX temporaryindex")
# Now, schema test should be OK
schema_errors = gather_schema_errors( )
if len(schema_errors) != 0:
raise Exception("It looks like the test is screwed up; schema test failed after restoration")
finally:
db.close()
ConnectorHelpers.start_agents( )
# Create a standard GTS output connection
ConnectorHelpers.define_gts_outputconnection( )
print "Dump and restore empty configuration"
# Check what happens when we dump and restore an empty configuration (21045)
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != 0:
raise Exception("Expecting zero jobs, instead found %d" % len(job_list))
ConnectorHelpers.export_configuration( "test_crawl_1.conf" )
# There should be no connector-related configuration to blow away at this point!
#ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_1.conf" )
# Check that there are still zero jobs
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != 0:
raise Exception("Expecting zero jobs, instead found %d" % len(job_list))
print "Dump and restore configuration that has lots of jobs, with funky characters too"
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "OneDocumentTest", "One Document Test" )
job_name_list = {}
job_count = 50
for job_index in range(job_count):
# I never intend to actually crawl this, so it can be utterly screwy and that's OK
job_name = u"One Document Test Job \u00d8 %d" % job_index
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
job_name,
"OneDocumentTest",
[ ( "/root/crawlarea", [ ( "include", "file", "f007.txt" ), ( "include", "directory", "*" ) ] ) ] )
job_name_list[job_name] = job_name
ConnectorHelpers.export_configuration( "test_crawl_3.conf" )
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != job_count:
raise Exception("Expecting %d jobs, instead found %d" % (job_count,len(job_list)))
# Blow away the config
ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_3.conf" )
# Check that there are still 50 jobs
job_list = ConnectorHelpers.list_jobs_api( )
if len(job_list) != job_count:
raise Exception("Expecting %d jobs, instead found %d" % (job_count,len(job_list)))
# Now, check to see that these jobs adhere to specifications
for job_record in job_list:
job_id = job_record["identifier"]
job_name = job_record["description"]
if not job_name_list.has_key(job_name):
raise Exception(u"One of the restored jobs does not have a recognized name! %s" % job_name)
ConnectorHelpers.delete_job(job_id)
# Test performance hack; get all the deletes started and then wait for the deletes to all complete.
for job_record in job_list:
job_id = job_record["identifier"]
ConnectorHelpers.wait_job_deleted(job_id)
ConnectorHelpers.delete_repositoryconnection("OneDocumentTest")
print "Verify that the help link exists and seems correct."
# Set up virtual browser instance
vb = VirtualBrowser.VirtualBrowser( username, password )
# First, go to main page
vb.load_main_window( "http://localhost/crawler/index.jsp" )
# Find the link for help
window = vb.find_window("")
link = window.find_link("Help")
if link == None:
raise Exception("Could not find help link in UI navigation")
if link.url != "/documentation/ConnectorGuide.pdf":
raise Exception("The help link was wrong: Saw '%s'" % link.url)
print "Run two jobs with identical documents at the same time, and make sure we can restart metacarta-agents during this process."
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection",throttles=[("",None,"20000")] )
# Define job
job_id_1 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 1",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
job_id_2 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 2",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id_1 )
ConnectorHelpers.start_job_ui( username, password, job_id_2 )
# Immediately restart metacarta-agents. If this is broken, agents won't ever come back up, and the test will time out.
ConnectorHelpers.restart_agents()
ConnectorHelpers.wait_job_complete( job_id_1 )
ConnectorHelpers.wait_job_complete( job_id_2 )
# Get job status. Sometimes we get this far but the status is messed up (job is done but there's an active document still)
result = ConnectorHelpers.list_job_statuses_api()
if len(result) != 2:
raise Exception("Expected two jobs, found %d" % len(result))
if result[0]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[0]["status"])
if result[1]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[1]["status"])
if result[0]["outstanding"] != str(0):
raise Exception("Expected active documents to be 0, instead found '%s'" % result[0]["outstanding"])
if result[1]["outstanding"] != str(0):
raise Exception("Expected active documents to be 0, instead found '%s'" % result[1]["outstanding"])
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
# Clean up both jobs simultaneously too.
ConnectorHelpers.delete_job_ui( username, password, job_id_1 )
ConnectorHelpers.delete_job_ui( username, password, job_id_2 )
ConnectorHelpers.wait_job_deleted( job_id_1 )
ConnectorHelpers.wait_job_deleted( job_id_2 )
ConnectorHelpers.delete_repository_connection_ui( username, password, "FileSystem" )
print "Force the crawler to run out of memory, and see that it shuts down."
# Shutdown health checker first; otherwise we can get locks stuck, and this would be messy to clean up.
stop_health_checker()
# Wait long enough so we can be sure there are no outstanding connector-related health activities going on.
time.sleep(30)
old_heap_dumps = count_java_heap_dumps()
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# Give it some time to shut itself down
time.sleep(60)
# Verify that metacarta-agents is indeed gone
if ConnectorHelpers.find_daemon_pid( ) != None:
raise Exception("metacarta-agents should have aborted, but it's still running!")
# Clean up locks enough so that we don't die trying to abort the job.
ConnectorHelpers.shutdown_tomcat( )
ConnectorHelpers.invoke_script(["/usr/lib/metacarta/core-lockclean"])
# Now, abort the job. This must happen before reset-crawler, because otherwise we might just run out of memory again.
ConnectorHelpers.abort_job( job_id )
# It stopped. Now, we have to reset locks because the OOM may have messed them up. This will start services back up too.
ConnectorHelpers.invoke_root_script(["/usr/lib/metacarta/reset-crawler"])
# Start health checker
start_health_checker()
# The job should now abort properly
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
new_heap_dumps = count_java_heap_dumps()
if new_heap_dumps != old_heap_dumps + 1:
raise Exception("Expected there to be %d heap dumps, instead found %d" % (old_heap_dumps+1,new_heap_dumps))
print "Check that a delayed seeding phase does not permit job to abort until done."
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test", failure_mode="seedingdelay" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# When the above is clicked, the UI immediately gives feedback that the job is starting. Unfortunately you cannot tell the difference in the UI between the "READYFORSTARTUP" state and the
# "STARTINGUP" state. Aborts function differently in each state.
# To sidestep this issue, we wait for 20 seconds after we see the 'starting up' in the UI, in order to be pretty certain the job has entered the "STARTINGUP" state.
# Once truly in the "STARTINGUP" state, the job is guaranteed to stay in that state for at least 2 minutes.
while True:
start_began_time = time.time()
job_state = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if job_state == "Starting up":
time.sleep(20)
break
if job_state == "Running":
raise Exception("Test problem: saw 'running' state without seeing 'starting up' phase")
time.sleep(10)
# Now, abort the job
ConnectorHelpers.abort_job( job_id )
# The job should NOT abort right away!! Indeed, we should see the job stay in the "aborting" state for roughly two minutes. Since this is approximate, we'll wait
# only 60 seconds before checking the job state; it'd better not stop aborting by then!
time.sleep(60-(time.time()-start_began_time))
job_state = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if job_state != "Aborting":
raise Exception("Expected job to stay in the Aborting state for an extended period of time, when interrupted during startup phase")
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
print "Seeing whether broken pipe ingestion errors get handled correctly"
# We need a file that's at least large enough to cause packet transmission without a flush. The test documents are all too small by themselves - so build a big one....
f_out = open("/root/crawlarea/bigfile.txt","w")
try:
for iteration in range(10000):
f_in = open("/root/crawlarea/testfiles/f001.txt","r")
try:
for line in f_in.readlines():
f_out.write(line)
finally:
f_in.close()
finally:
f_out.close()
define_outofmemory_repository_connection_ui( username, password, "OutOfMemoryTest", "Out of Memory Test", failure_mode="ingestiondelay" )
job_id = define_outofmemory_job_ui( username,
password,
"Out of Memory Test Job",
"OutOfMemoryTest",
[ ( "/root/crawlarea", [ ( "include", "file", "*" ), ( "include", "directory", "*" ) ] ) ] )
ConnectorHelpers.start_job_ui( username, password, job_id )
# If the code is working properly, broken pipe errors will be treated as 400's.
# If not, the job will retry documents indefinitely, and the test will fail for that reason.
ConnectorHelpers.wait_job_complete( job_id )
# Get rid of big file
os.unlink("/root/crawlarea/bigfile.txt")
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_repository_connection_ui( username, password, "OutOfMemoryTest" )
print "Ingest using broken ingestion system."
# Set up an ingestion of exactly one document
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "OneDocumentTest", "One Document Test" )
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"One Document Test Job",
"OneDocumentTest",
[ ( "/root/crawlarea", [ ( "include", "file", "f007.txt" ), ( "include", "directory", "*" ) ] ) ] )
stop_health_checker()
ConnectorHelpers.stop_leafblower()
# Set up dummy ingest listener
response = ""
this_thread = startup_fake_ingestion_service(response)
print "Looking at debug info after nc-based fake ingestion service started"
dump_debug()
# Start the job. The job would normally run for many hours, because we would need to wait for the retries to give up, so I'm going to abort it after some short period of time.
ConnectorHelpers.start_job_ui( username, password, job_id )
time.sleep(60)
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# The connector framework should have logged a -2 error for the first ingest activity!
results = ConnectorHelpers.run_simple_history_report_api( "OneDocumentTest", [ "document ingest (GTS)" ] )
# There should have been one or more ingestion attempts
if len(results) == 0:
raise Exception("No ingestion attempts were reported! Expected at least one.")
# Check to be sure that at least one received a -2 error
saw_proper_error = False
for result in results:
if int(result["result_code"]) == -2:
saw_proper_error = True
break
if not saw_proper_error:
raise Exception("Did not see expected -2 error in ingest history results")
shutdown_fake_ingestion_service(this_thread,response)
print "Looking at debug info after nc-based fake ingestion service stopped"
dump_debug()
# Next, try an ingestion service that just returns 500 errors
response = ""
this_thread = startup_non_timeout_fake_ingestion_service(response)
print "Looking at debug info after homegrown fake ingestion service started"
dump_debug()
# Start the job, wait for a time, then abort it. This should run for a long time, and generate a few warnings in the log, which we'll check for later.
ConnectorHelpers.start_job_ui( username, password, job_id )
time.sleep(60)
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
shutdown_non_timeout_fake_ingestion_service(this_thread,response)
print "Looking at debug info after homegrown fake ingestion service stopped"
dump_debug()
# Check for at least one error in the log of form:
# "Error 500 from ingestion request; ingestion will be retried again later"
lines = ConnectorHelpers.read_metacarta_log( "Error 500 from ingestion request; ingestion will be retried again later", agents_log_pos, log_name="/var/log/metacarta/java-agents/agents.log" )
if len(lines) == 0:
raise Exception("Did not see expected ingestion request retry message in log!")
# Delete the job, without anything listening on 7031. This will cause a number of deleted documents to be queued.
ConnectorHelpers.delete_job( job_id )
for iteration in range(5):
# Wait a little while so they *do* get queued,
time.sleep(10)
# Now, stop metacarta-agents. This should replicate bug 29943.
ConnectorHelpers.shutdown_agents()
# Look for inconsistencies in the database
# We should never see rows in intrinsiclink that refer to non-existent jobs
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
bad_results = db.query("select count(*) as mycount from intrinsiclink t0 where not exists(select 'x' from jobs t1 where t0.jobid=t1.id)").dictresult()
found_count = None
for result in bad_results:
found_count = int(result["mycount"])
if found_count != 0:
raise Exception("Detected schema inconsistency! Job is gone, but %d rows in intrinsiclink table refer to it." % found_count)
finally:
db.close()
# No schema inconsistency: restart agents
ConnectorHelpers.start_agents()
# Startup time is at least 45 seconds, because the secure random number generator is invoked during startup if httpposter is invoked.
time.sleep(45)
# No problems detected: restart leafblower, and let job finish deleting
ConnectorHelpers.start_leafblower()
start_health_checker()
ConnectorHelpers.wait_job_deleted( job_id )
# Check once again that there are no dangling intrinsiclink rows!!
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
bad_results = db.query("select count(*) as mycount from intrinsiclink t0 where not exists(select 'x' from jobs t1 where t0.jobid=t1.id)").dictresult()
found_count = None
for result in bad_results:
found_count = int(result["mycount"])
if found_count != 0:
raise Exception("Detected schema inconsistency! Job is gone, but %d rows in intrinsiclink table refer to it." % found_count)
finally:
db.close()
ConnectorHelpers.delete_repository_connection_ui( username, password, "OneDocumentTest" )
print "Non-typical connection name test."
# PHASE 0.9: Try creating and removing some odd connection names. This is not definitive, as we must rely on the correctness of the virtual browser to get
# the correct results...
# ConnectorHelpers.define_repositoryconnection( "FileSystem",
# "FileSystem Connection",
# "com.metacarta.crawler.connectors.filesystem.FileConnector" )
# Do this via the UI
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "Test%2BTest", "Odd Connection" )
ConnectorHelpers.delete_repository_connection_ui( username, password, "Test%2BTest" )
# PHASE 1: Ingestion
print "Ingestion Test."
# Define repository connection
# ConnectorHelpers.define_repositoryconnection( "FileSystem",
# "FileSystem Connection",
# "com.metacarta.crawler.connectors.filesystem.FileConnector" )
# Do via the UI, with one stupid throttle (to test that part of the UI)
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection",throttles=[("",None,"20000")] )
# Spinner test to make sure we aren't leaking file descriptors from tomcat for the BPA callout.
# If the leak is present, 3 handles are leaked per iteration out of a maximum of 1024, so the loop below would start failing partway through.
for counter in range(1,1024):
resave_filesystem_repository_connection_ui( username, password, "FileSystem" )
# Define job
# doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
# job_id = ConnectorHelpers.define_job( "Test job",
# "FileSystem",
# doc_spec_xml )
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
# ConnectorHelpers.start_job( job_id )
ConnectorHelpers.start_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
# Check job definition and job status via API
result = ConnectorHelpers.list_jobs_api()
if len(result) != 1:
raise Exception("Expected one job, found %d" % len(result))
if result[0]["identifier"] != job_id:
raise Exception("Expected job identifier to be %s, instead found %s" % (job_id,result[0]["identifier"]))
if result[0]["description"] != "Test job":
raise Exception("Expected job description to be 'Test job', instead found '%s'" % result[0]["description"])
if result[0]["connection"] != "FileSystem":
raise Exception("Expected job connection to be 'FileSystem', instead found '%s'" % result[0]["connection"])
result = ConnectorHelpers.list_job_statuses_api()
if len(result) != 1:
raise Exception("Expected one job, found %d" % len(result))
if result[0]["identifier"] != job_id:
raise Exception("Expected job identifier to be %s, instead found %s" % (job_id,result[0]["identifier"]))
if result[0]["description"] != "Test job":
raise Exception("Expected job description to be 'Test job', instead found '%s'" % result[0]["description"])
if result[0]["status"] != "done":
raise Exception("Expected job status to be 'done', instead found '%s'" % result[0]["status"])
# Success: done
print "Done ingestion test."
# PHASE 2: Document Change Detection
print "Document Change Test."
o = open( "/root/crawlarea/testfiles/f002.txt", "w" )
o.write("Now this document is at 50N 75E, and the keyword is platypus")
o.close()
o = open( "/root/crawlarea/testfiles/f004.txt", "w" )
o.write("No longer about drinking establishments at 23N 15W")
o.close()
# Added 7/21/2008: Set clock forward 18 months, and wait long enough so that all current Thread.sleep()'s (if present)
# will wake up, and go back to sleep.
ConnectorHelpers.set_clock_forward()
time.sleep(60)
# Restore the clock, because we should not be ACTIVELY doing anything
# with the daemon while the clock is wrong.
ConnectorHelpers.restore_clock()
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Look for state of index being right
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "good" ], None, [ ] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ "root/crawlarea/testfiles/f003.txt" ] )
ConnectorHelpers.search_check( [ "pub" ], None, [ ] )
ConnectorHelpers.search_check( [ "city" ], None, [ "root/crawlarea/testfiles/f005.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
print "Done Document Change Test."
# PHASE 3: Document Delete Detection
print "Document Delete Test."
os.remove( "/root/crawlarea/testfiles/f003.txt" )
os.remove( "/root/crawlarea/testfiles/f005.txt" )
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.search_check( [ "kidneys" ], None, [ ] )
ConnectorHelpers.search_check( [ "city" ], None, [ ] )
print "Done Document Delete Test."
# PHASE 4: Document Addition Detection
print "Document Add Test."
o = open( "/root/crawlarea/testfiles/f009.txt", "w" )
o.write("Now this document is at 50N 75E, and the keyword is albemarle")
o.close()
o = open( "/root/crawlarea/testfiles/f010.txt", "w" )
o.write("No longer about golfcarts at 23N 15W")
o.close()
o = open( "/root/crawlarea/testfiles/f011.txt", "w" )
o.write("------------\n")
o.write("No sodapop should show up for 12N 72W")
o.close()
# Restart job, which should pick up the changes
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], None, [ "root/crawlarea/testfiles/f011.txt" ] )
print "Done Document Add Test."
# PHASE 4.5: Run all the reports via the API and check the results for being sensible
simple_result = ConnectorHelpers.run_simple_history_report_api( "FileSystem",
[ "job start", "job end" ] )
if len(simple_result) != 8:
raise Exception("Expected 8 job start/job end events, found %d" % len(simple_result))
max_bandwidth_result = ConnectorHelpers.run_max_bandwidth_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()" )
if len(max_bandwidth_result) != 1:
raise Exception("Expected 1 result row from bandwidth report, found %d" % len(max_bandwidth_result))
max_activity_result = ConnectorHelpers.run_max_activity_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()" )
if len(max_activity_result) != 1:
raise Exception("Expected 1 result row from activity report, found %d" % len(max_activity_result))
result_report = ConnectorHelpers.run_result_histogram_history_report_api( "FileSystem",
[ "document ingest (GTS)" ], entity_bin_regexp="()", result_bin_regexp="()" )
if len(result_report) != 1:
raise Exception("Expected 1 result row from result histogram report, found %d" % len(result_report))
document_status = ConnectorHelpers.run_document_status_api( "FileSystem",
[ job_id ] )
expected_queue_length = 15
if len(document_status) != expected_queue_length:
raise Exception("Expected %d documents in queue, found %d" % (expected_queue_length,len(document_status)))
queue_status = ConnectorHelpers.run_queue_status_api( "FileSystem",
[ job_id ], bucket_regexp="()" )
if len(queue_status) != 1:
raise Exception("Expected 1 result row from queue status report, found %d" % len(queue_status))
if int(queue_status[0]["inactive_count"]) != expected_queue_length:
raise Exception("Expected %d inactive queued documents, found %d" % (expected_queue_length,int(queue_status[0]["inactive_count"])))
# PHASE 5: Delete Job
print "Job Delete Test."
# ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.delete_job_ui( username, password, job_id )
print "...job delete request sent"
ConnectorHelpers.wait_job_deleted( job_id )
print "...job has vanished"
# Make sure the documents all went away
ConnectorHelpers.search_check( [ "reference" ], None, [] )
ConnectorHelpers.search_check( [ "good" ], None, [] )
ConnectorHelpers.search_check( [ "kidneys" ], None, [] )
ConnectorHelpers.search_check( [ "pub" ], None, [] )
ConnectorHelpers.search_check( [ "city" ], None, [] )
ConnectorHelpers.search_check( [ "interesting" ], None, [] )
ConnectorHelpers.search_check( [ "smelly" ], None, [] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [] )
ConnectorHelpers.search_check( [ "sodapop" ], None, [] )
print "Done Job Delete Test."
# PHASE 6: Scheduled Ingestion
print "Scheduled Ingestion Test."
# Define job again
# doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Set the schedule. One minute is too short; might miss the window.
# We need to be sure we hit the window. We can sanity-check the timing, without having the test fail in obscure ways, by calculating the time interval ourselves.
min_scheduled_time_begin = time.time() + 120.0
# For the max time, give an additional minute's slop, because various CF threads don't run all the time
max_scheduled_time_begin = min_scheduled_time_begin + 60.0 + 60.0
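# In other words, the scheduled run is expected to begin somewhere in the window [now + 120s, now + 240s].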
# Sets the time for the test to run to the current time, plus 2 minutes. Since the current time might be (say) 3 m 59 s, and the execute time would then be
# 6 m, the actual interval may well be as little as 2 minutes.
ConnectorHelpers.set_scheduled_time( job_id, 3 )
# Dump the configuration
ConnectorHelpers.export_configuration( "test_crawl_2.conf" )
# Blow away all connector-related stuff
ConnectorHelpers.reset_all( )
# Restore the configuration
ConnectorHelpers.import_configuration( "test_crawl_2.conf" )
# Check to be sure we didn't miss the window!
if time.time() >= min_scheduled_time_begin:
raise Exception("Test invalid: Test setup exceeded limits, so scheduling won't fire")
# Everything should be back and working, as if we hadn't blown everything away and restored it. The only thing we must do is find the job_id, since it has changed.
job_id = ConnectorHelpers.find_job_by_name_ui( username, password, "Test job", "FileSystem" )
# Sleep until we are sure it should have fired
sleep_amt = max_scheduled_time_begin - time.time()
if sleep_amt > 0:
time.sleep(sleep_amt)
# Wait for job inactive
ConnectorHelpers.wait_job_complete( job_id )
# Make sure we can find our stuff
ConnectorHelpers.wait_for_ingest( )
# Look for state of index being right
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
# Try out the API call for getting the schedule
result = ConnectorHelpers.get_job_schedule_api( job_id )
if len(result) != 1:
raise Exception("Expected one schedule record, instead found %d" % len(result))
result[0]["daysofweek"]
result[0]["years"]
result[0]["months"]
result[0]["days"]
result[0]["hours"]
result[0]["minutes"]
result[0]["timezone"]
result[0]["duration"]
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 7: Time Window Ingestion
print "Time Window Ingestion Test."
# This test requires enough documents to keep crawler busy for >1 minute,
# which I don't have yet - so skip for now. MHL
print "Done Time Window Ingestion Test."
# PHASE 7.1: Check that pathological situations in scheduler don't mess us up.
# Stop metacarta-agents
ConnectorHelpers.shutdown_agents()
# Create the job
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Set up the job schedule: run in Jan,Feb,Mar,April,May,June,July,Aug,Sept,Oct,Nov,Dec, but don't set any other info
ConnectorHelpers.invoke_crawler_command( "com.metacarta.crawler.AddScheduledTime", argument_list=[ job_id,
"",
"",
"",
"january,february,march,april,may,june,july,august,september,october,november,december",
"",
"",
"" ] )
# Screw with the 'last job run' timestamp in the database, to set it to a
# known magic value, e.g. 12:00AM June 29, 2009 GMT: 1246233600000
# This time is carefully picked because it must be greater than the 28th of the month, and yet after we've advanced to midnight
# we must still be in the same month; the next advance will thus be to go by days towards the first of the next month, which is
# what would fail.
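# For reference, the magic value can be reproduced (a sketch, assuming a stock Python 2
# interpreter with the standard calendar module):
#   import calendar
#   calendar.timegm((2009, 6, 29, 0, 0, 0)) * 1000   # -> 1246233600000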
db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
try:
db.query("update jobs set lastchecktime=1246233600000 where id=%s" % job_id)
finally:
db.close()
# Start metacarta-agents
ConnectorHelpers.start_agents()
# Wait until we're sure the scheduler has had a chance to look at the record in question
time.sleep(30)
# Try to shut down metacarta-agents; it should succeed if fixed; otherwise it will time out.
ConnectorHelpers.shutdown_agents()
# Now, clean up job
ConnectorHelpers.start_agents()
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 8: Crawl from seeds
print "Crawl From Seeds Test."
# define sample job with two sets of seeds
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint><startpoint path="/root/crawlarea2"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username, password, "Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ), ( "/root/crawlarea2", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Now, crawl
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Modify document specification to remove testfiles2 area
doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea/testfiles"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
ConnectorHelpers.change_job_doc_spec( job_id, doc_spec_xml )
# Rerun
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
print "Done Crawl From Seeds Test."
# PHASE 9: Used to be "crawl everything crawled before", but that's no longer meaningful, since
# it's now the connector that determines how the crawler behaves.
# PHASE 10: Adaptive crawling test
print "Adaptive Crawl Test."
# define sample job with two sets of seeds
#doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/root/crawlarea"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint><startpoint path="/root/crawlarea2"><include match="*.txt" type="file"/><include match="*" type="directory"/></startpoint></specification>'
job_id = ConnectorHelpers.define_filesystem_job_ui( username, password, "Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ), ( "/root/crawlarea2", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
type="continuous",
recrawlinterval=2 )
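# Note: type="continuous" keeps the job running indefinitely; recrawlinterval (2 here, presumably in minutes) controls how soon
# documents become eligible for re-fetch, so the file modified below should be re-ingested within a couple of minutes.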
# Now, crawl
ConnectorHelpers.start_job( job_id )
# Job will not end, so we simply need to wait one minute.
time.sleep(1 * 60)
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Now, change a document and see if the recrawl happens
o = open( "/root/crawlarea/testfiles/f002.txt", "w" )
o.write("Now this document is at 49N 75E, and the keyword is castle")
o.close()
# Simply wait to see if the reingest occurs (it should after about 1 min)
time.sleep(2 * 60)
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Verify correctness of ingestion
ConnectorHelpers.search_check( [ "reference" ], None, [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], None, [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], None, [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], None, [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "platypus" ], None, [] )
ConnectorHelpers.search_check( [ "castle" ], None, [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], None, [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], None, [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], None, [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "humid" ], None, [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], None, [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.abort_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# PHASE 11: Ingest into collection test
print "Collection Ingestion Test (also with unregistered connector)."
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
collection_name="Zena",
document_template=document_template_text )
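# document_template_text (defined earlier in this script) is expected to eliminate at least one document during ingestion;
# the empty "sodapop" search below is the check for that.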
# Unregister the file system connector, and restart the services. This should not prevent us from starting the job! But when the job starts it should do
# nothing until we reregister the connector.
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# Recycle the services to be sure there are no already-created handles lying around
ConnectorHelpers.restart_tomcat()
ConnectorHelpers.restart_agents()
time.sleep(60)
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id )
# Wait a while. The job should start, but go nowhere.
time.sleep(60)
# Check the status in the UI, by fetching the job status page
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Starting up":
raise Exception("Expected to see 'Starting up' status, but saw '%s'" % jobstatus)
# OK, reregister the connector now. This should cause the job to wake up and actually start. We have to be quick, though, so that we can deregister it again at exactly the right moment.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
# Wait for the job to leave the "Starting up" state; deregister the connector again the moment that happens
it_started = False
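# Poll the UI job status once per second, for up to 30 seconds, until the job reports "Running".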
for retry in range(30):
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus == "Running":
it_started = True
break
if jobstatus != "Starting up":
raise Exception( "Expecting job to start, but wound up with status '%s' instead" % jobstatus )
time.sleep(1)
if not it_started:
raise Exception( "Job did not start as expected when connector was reregistered" )
# Deregister the connector again
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# We should have immediately entered the "Running, no connector" state
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Running, no connector":
raise Exception("Expected to see 'Running, no connector' status, but saw '%s'" % jobstatus)
# Pause the job
ConnectorHelpers.pause_job_ui( username, password, job_id )
# Check the status to see if we indeed paused.
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Paused":
raise Exception("Expected to see 'Paused' status, but saw '%s'" % jobstatus)
# Resume the job
ConnectorHelpers.resume_job_ui( username, password, job_id )
# Check the status to see if we indeed resumed.
jobstatus = ConnectorHelpers.get_job_status_ui( username, password, job_id )
if jobstatus != "Running, no connector":
raise Exception("Expected to see 'Running, no connector' status, but saw '%s'" % jobstatus)
# OK, reregister the connector once again now. This should cause the job to wake up and finish.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# See if we can find the documents we just ingested, except for the one that should have been eliminated by the document template
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Create a second job that ingests the same documents using a different collection (part of the test for 24171)
job_id_2 = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job 2",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ],
collection_name="Boomer",
document_template=document_template_text )
# Run this job to completion. It should replace all the collections with the new one...
ConnectorHelpers.start_job( job_id_2 )
ConnectorHelpers.wait_job_complete( job_id_2 )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# Confirm that all the documents have been reingested, but into the new collection.
ConnectorHelpers.search_check( [ "reference" ], "Boomer", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Boomer", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Boomer", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Boomer", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Boomer", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Boomer", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Boomer", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Boomer", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Boomer", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Boomer", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Boomer", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Now, delete the second job. This should *not* delete the documents, because they are shared with the first job; the "Boomer"
# collection should still have documents in it.
ConnectorHelpers.delete_job( job_id_2 )
ConnectorHelpers.wait_job_deleted( job_id_2 )
ConnectorHelpers.search_check( [ "reference" ], "Boomer", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Boomer", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Boomer", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Boomer", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Boomer", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Boomer", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Boomer", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Boomer", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Boomer", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Boomer", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Boomer", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# Rerun the *first* job. This should detect the fact that the collection needs to change, and the documents should thus be updated.
ConnectorHelpers.start_job( job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Wait until ingest has caught up
ConnectorHelpers.wait_for_ingest( )
# All should have reverted to how it was prior to job_2 being introduced!
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ "root/crawlarea/testfiles/f001.txt" ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ "root/crawlarea/testfiles/f006.txt" ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ "root/crawlarea/testfiles/f007.txt" ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ "root/crawlarea/testfiles/newfolder/f008.txt" ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ "root/crawlarea/testfiles/f002.txt" ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ "root/crawlarea/testfiles/f004.txt" ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ "root/crawlarea/testfiles/f009.txt" ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ "root/crawlarea/testfiles/f010.txt" ] )
ConnectorHelpers.search_check( [ "sodapop" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ "root/crawlarea/testfiles2/f002.txt" ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ "root/crawlarea/testfiles2/f001.txt", "root/crawlarea/testfiles/f002.txt", "root/crawlarea/testfiles/f009.txt" ] )
# For added fun, make sure we can delete a job when the connector has been removed!
ConnectorHelpers.deregister_connector("com.metacarta.crawler.connectors.filesystem.FileConnector")
# Recycle the services to be sure there are no already-created handles lying around
ConnectorHelpers.restart_tomcat()
ConnectorHelpers.restart_agents()
time.sleep(60)
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# Job cleanup should leave nothing around
ConnectorHelpers.search_check( [ "reference" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "interesting" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "smelly" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "restaurants" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "castle" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "establishments" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "albemarle" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "golfcarts" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "humid" ], "Zena", [ ] )
ConnectorHelpers.search_check( [ "document" ], "Zena", [ ] )
print "Done Collection Ingestion Test."
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
# OK, reregister the connector now. This should have no other effect than getting the system back to normal.
ConnectorHelpers.register_connector("com.metacarta.crawler.connectors.filesystem.FileConnector", "FilesystemConnector")
# PHASE 12: Throttling and report generation test
# With this test, we will establish a fetch-rate throttle, and then crawl with it to verify that the crawler obeys the average fetch rate restrictions.
# For this to work, the file system connector must also provide a "fetch" activity that we can run reports against.
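# Each throttle tuple below is (bin regexp, description, rate); the empty regexp presumably matches every bin, limiting all
# fetches to an average of two per minute.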
ConnectorHelpers.define_filesystem_repository_connection_ui( username, password, "FileSystem", "FileSystem Connection", throttles=[ ("", "Limit fetch rate to two per minute", "2") ] )
# Create and run the job. The job should take about 5 minutes to run, given the throttle settings.
job_id = ConnectorHelpers.define_filesystem_job_ui( username,
password,
"Test job",
"FileSystem",
[ ( "/root/crawlarea", [ ( "include", "file", "*.txt" ), ( "include", "directory", "*" ) ] ) ] )
# Run the job to completion
ConnectorHelpers.start_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_complete( job_id )
# Now, delete the job
ConnectorHelpers.delete_job_ui( username, password, job_id )
ConnectorHelpers.wait_job_deleted( job_id )
# Run some history reports from the UI. These reports should confirm an average read rate of 2 documents per minute, and should contain all expected events.
simple_results = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem", [ "read document" ] )
if len(simple_results) != 15:
raise Exception("Expecting 15 simple report result rows; got %d" % len(simple_results))
max_activity_results = ConnectorHelpers.run_max_activity_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()" )
if len(max_activity_results) != 1:
raise Exception("Expecting 1 row in max activity report; got %d" % len(max_activity_results))
rate_column = float(max_activity_results[0]["Highest Activity Rate [per min]"])
if rate_column > 3.0:
raise Exception("Maximum fetch rate exceeded the 1-sigma limit of 3.0 documents per minute; got %f" % rate_column)
max_bandwidth_results = ConnectorHelpers.run_max_bandwidth_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()" )
if len(max_bandwidth_results) != 1:
raise Exception("Expecting 1 row in max bandwidth report; got %d" % len(max_bandwidth_results))
result_histogram = ConnectorHelpers.run_result_histogram_history_report_ui( username, password, "FileSystem", [ "read document" ], entity_bin_regexp="()", result_bin_regexp="(.*)" )
if len(result_histogram) != 1:
raise Exception("Expecting 1 row from result histogram; got %d" % len(result_histogram))
if result_histogram[0]["Result Class"] != "ok":
raise Exception("Expected only 'ok' results, got '%s'" % result_histogram[0]["Result Class"])
if result_histogram[0]["Event Count"] != "15":
raise Exception("Expected EventCount to be 15, was %s" % result_histogram[0]["Event Count"])
# We need to make sure that a report screen where a connection has already been chosen still behaves sensibly after that connection goes away.
# Open up a virtual browser window accordingly, and set up the virtual browser instance.
vb = VirtualBrowser.VirtualBrowser( username=username, password=password )
vb.load_main_window( "http://localhost/crawler/index.jsp" )
window = vb.find_window("")
link = window.find_link("Simple history")
link.click( )
window = vb.find_window("")
form = window.find_form("report")
form.find_selectbox("reportconnection").select_value( "FileSystem" )
window.find_button("Continue").click( )
window = vb.find_window("")
form = window.find_form("report")
activities_select = form.find_selectbox("reportactivities")
activities_select.multi_select_value( "read document" )
# Fire off the query
window.find_button("Execute this query").click( )
# Make sure we could fire off the query again if we wanted
window = vb.find_window("")
form = window.find_form("report")
go_button = window.find_button("Execute this query")
# Remove the connection.
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
# Make sure when we press "Go", something reasonable happens
go_button.click()
window = vb.find_window("")
# Report form should be present
form = window.find_form("report")
# Go button should be gone, but we should have a "continue" button back...
window.find_button("Continue")
print "Done Report Tests."
print "Spin to detect deadlock condition"
# This uses the crawl configuration saved during the scheduling test. We really don't care much about it beyond the queries that will be fired
# off, so it's fine to rerun it as long as we use ResetAll to clean up whatever configuration garbage is left around at the end.
run_lock_spinner( "test_crawl_2.conf" )
ConnectorHelpers.reset_all()
ConnectorHelpers.define_gts_outputconnection( )
# Next phase: Run postgresql maintenance script
print "Running maintenance script"
ConnectorHelpers.run_maintenance()
print "Done with maintenance script test"
print "Testing for error page script injection"
response = ConnectorHelpers.invoke_curl( "http://localhost/crawler/error.jsp?text=%3Cscript%3Ealert(%27test%27)%3C/script%3E&target=%27%3E%3Cscript%3Ealert(%27test%27)%3C/script%3E", user_name=username, password=password )
if response.find("<script>alert('test')</script>") != -1:
raise Exception("Script injection seems to have taken place into error.jsp! Response = %s" % response)
print "Done with error page injection test"
# Delete standard GTS output
ConnectorHelpers.delete_gts_outputconnection( )
ConnectorHelpers.delete_crawler_user( username )
delete_folder("/root/crawlarea")
sqatools.LicenseMakerClient.revoke_license()
ConnectorHelpers.teardown_connector_environment( )
# Last check: Be sure that there are no errors in the metacarta log due to postgresql connections being dropped
print "Checking log for postgresql EOF errors"
lines = ConnectorHelpers.read_metacarta_log( "unexpected EOF on client connection", log_pos )
if len(lines) > 0:
raise Exception("Found %d EOF errors in postgresql log output!" % len(lines) )
print "Testing check-system-health when database is down"
stop_database()
# Run system_health_check repeatedly until we no longer see the 'skipping' message about authorities
while True:
if system_health_check( ):
break
start_database()
ConnectorHelpers.start_agents( )
print "Basic ConnectorFramework tests PASSED"