#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import ConnectorHelpers
from sqatools import LicenseMakerClient
from sqatools import appliance
from threading import Thread
sys.path.append("/usr/lib/metacarta")
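# MetaCartaVersion presumably lives under /usr/lib/metacarta, hence the path tweak above.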
import MetaCartaVersion
# Copy a folder to a (new) area
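# Note: "cp -r source target" copies source *inside* target, so each call below
# produces target/largefiles/... rather than replacing target itself.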
def copy_folder( source, target ):
    appliance.spcall( [ "mkdir", "-p", target ] )
    appliance.spcall( [ "cp", "-r", source, target ] )
# Remove a folder
def delete_folder( target ):
    appliance.spcall( [ "rm", "-rf", target ] )
# Crawl user credentials
username = "testingest"
password = "testingest"
def preclean( print_errors=True ):
    ''' Clean up everything we might have done during the execution of this test.
        This will include all jobs and ingested documents. '''
    try:
        ConnectorHelpers.reset_all()
    except Exception, e:
        if print_errors:
            print "Error resetting all jobs"
            print e
    # Remove test documents first
    for folder in [ "/common/crawlarea" ]:
        try:
            delete_folder( folder )
        except Exception, e:
            if print_errors:
                print "Error removing %s" % folder
                print e
    try:
        LicenseMakerClient.revoke_license()
    except Exception, e:
        if print_errors:
            print "Error revoking license"
            print e
    try:
        ConnectorHelpers.delete_crawler_user( username )
    except Exception, e:
        if print_errors:
            print "Error removing crawler user"
            print e
    try:
        ConnectorHelpers.teardown_connector_environment( )
    except Exception, e:
        if print_errors:
            print "Error cleaning up debs"
            print e
# Main
if __name__ == '__main__':
print "Precleaning!"
preclean( print_errors=False )
print "Setting up environment."
ConnectorHelpers.setup_connector_environment()
print "Adding crawl user."
ConnectorHelpers.create_crawler_user( username, password )
ConnectorHelpers.define_gts_outputconnection( )
print "Setting up file area."
# We need enough documents to allow us to estimate fetch rate accurately; I
# think 3x1,000 documents will do nicely, since it also fits within the "max fetch rate report" max window size of 5,000.
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/BOTH/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/SLOW/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/FAST/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
    # Set up the connection with two different throttles, for bins "FAST" and "SLOW".
    print "Dissimilar throttle test."

    # Define the repository connection. The throttle rates are in documents per
    # minute. We have to stay clear of the probable upper fetch rate of about
    # 100 docs/sec (6,000 docs per minute), but we can certainly aspire to make
    # the test run as quickly as possible otherwise.
    ConnectorHelpers.define_filesystem_repository_connection_ui( username, password,
        "FileSystem", "FileSystem Connection",
        max_connections=100,
        throttles=[ ("FAST","All documents in the FAST bin",1000), ("SLOW","All documents in the SLOW bin",100) ] )
    # Define job
    doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/common/crawlarea"><include match="*.htm" type="file"/><include match="*" type="directory"/></startpoint></specification>'
    job_id = ConnectorHelpers.define_job( "Test job",
        "FileSystem",
        doc_spec_xml )

    # Run the job, and let it complete
    start_time = time.time()
    ConnectorHelpers.start_job( job_id )
    ConnectorHelpers.wait_job_complete( job_id )
    # Now, check whether we exceeded any of the throttle rates for the different
    # classes. If we did, it may mean that throttling is not working correctly for
    # documents with multiple bins. I'm leaving the window size at 5 minutes, even
    # though the overall crawl is expected to take only 10 minutes, because we do
    # want to assess whether a burst might take place.
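    # A document under BOTH matches both the FAST and SLOW bins, so its effective
    # ceiling is the slower throttle (100 docs/min); that is why BOTH is held to
    # the same 110 docs/min limit as SLOW below.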
    for bin_name, rate_limit in [ ("SLOW",110.0), ("BOTH",110.0), ("FAST",1100.0) ]:
        rate_report = ConnectorHelpers.run_max_activity_history_report_ui( username, password, "FileSystem",
            [ "read document" ],
            entity_regexp="/%s/" % bin_name,
            entity_bin_regexp="()" )
        if len(rate_report) != 1:
            raise Exception("Expected max activity report to have exactly one row, instead saw %d" % len(rate_report))
        max_rate = float(rate_report[0]["Highest Activity Rate [per min]"])
        if max_rate > rate_limit:
            raise Exception("Max activity rate for %s docs should not have exceeded one-sigma value of %f docs per minute; instead saw %f" % (bin_name,rate_limit,max_rate))
    # Next, use the simple report generator to get an idea of the overall fetch
    # rate for each document class. The BOTH and SLOW rates should be similar.
    slow_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/SLOW/" )
    fast_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/FAST/" )
    both_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/BOTH/" )
    if len(slow_history) == 0:
        raise Exception("There should be documents in the history with /SLOW/ in the file path!")
    if len(fast_history) == 0:
        raise Exception("There should be documents in the history with /FAST/ in the file path!")
    if len(both_history) == 0:
        raise Exception("There should be documents in the history with /BOTH/ in the file path!")

    # Use the last fetch time for each class (which is the first row of the result)
    # to establish a crawl interval.
    slow_end_time = ConnectorHelpers.parse_date_time(slow_history[0]["Start Time"])
    fast_end_time = ConnectorHelpers.parse_date_time(fast_history[0]["Start Time"])
    both_end_time = ConnectorHelpers.parse_date_time(both_history[0]["Start Time"])
    # Calculate a read rate (in docs per second). Even though we put 1,000
    # documents into each category, there are really 1,211 because we have to
    # count the directories too.
    slow_rate = 1211.0 / (slow_end_time - start_time)
    fast_rate = 1211.0 / (fast_end_time - start_time)
    both_rate = 1211.0 / (both_end_time - start_time)
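    # (Directory arithmetic: each class has its top-level dir, 10 level0 dirs,
    # 100 level1 dirs, and 100 copied "largefiles" dirs -- 211 directories -- plus,
    # assuming /root/largefiles holds 10 documents, 100 x 10 = 1,000 files, giving
    # 1,211 fetches in all.)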
print "The approximate rates are: %f (SLOW), %f (FAST), %f (BOTH)" % ((slow_rate * 60.0),(fast_rate * 60.0),(both_rate * 60.0))
    # The document priorities should be calculated so as to avoid overweighting
    # the "FAST" pool at the expense of the "SLOW" or "BOTH" pools. It is therefore
    # part of this test that the rates we've determined hold approximately the
    # correct ratios to one another, even if the whole system "runs behind": FAST
    # should be approximately 10x SLOW and 10x BOTH.
    if slow_rate * (10.0 * 0.65) > fast_rate or slow_rate * (10.0 * 1.35) < fast_rate:
        raise Exception("SLOW documents : FAST documents ratio was out of range: %f" % (fast_rate/slow_rate))
    if both_rate * (10.0 * 0.65) > fast_rate or both_rate * (10.0 * 1.35) < fast_rate:
        raise Exception("BOTH documents : FAST documents ratio was out of range: %f" % (fast_rate/both_rate))
    if slow_rate * 0.65 > both_rate or slow_rate * 1.35 < both_rate:
        raise Exception("SLOW documents : BOTH documents ratio was out of range: %f" % (both_rate/slow_rate))
print "Cleanup Dissimilar Throttle Test."
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
ConnectorHelpers.delete_gts_outputconnection( )
ConnectorHelpers.delete_crawler_user( username )
delete_folder("/common/crawlarea")
LicenseMakerClient.revoke_license
ConnectorHelpers.teardown_connector_environment( )
print "Scheduling/Throttling ConnectorFramework tests PASSED"