#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import ConnectorHelpers
from sqatools import LicenseMakerClient
from sqatools import appliance
from threading import Thread
sys.path.append("/usr/lib/metacarta")
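# MetaCartaVersion presumably lives under /usr/lib/metacarta, hence the path tweak above.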
import MetaCartaVersion
# Copy a folder to a (new) area
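# Note: "cp -r source target" copies source *inside* target, so each call below
# produces target/largefiles/... rather than replacing target itself.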
def copy_folder( source, target ):
    appliance.spcall( [ "mkdir", "-p", target ] )
    appliance.spcall( [ "cp", "-r", source, target ] )
# Remove a folder
def delete_folder( target ):
    appliance.spcall( [ "rm", "-rf", target ] )
# Crawl user credentials
username = "testingest"
password = "testingest"
def preclean( print_errors=True ):
    ''' Clean up everything we might have done during the execution of this test.
        This will include all jobs and ingested documents. '''
    try:
        ConnectorHelpers.reset_all()
    except Exception, e:
        if print_errors:
            print "Error resetting all jobs"
            print e
    # Remove test documents first
    for folder in [ "/common/crawlarea" ]:
        try:
            delete_folder( folder )
        except Exception, e:
            if print_errors:
                print "Error removing %s" % folder
                print e
    try:
        LicenseMakerClient.revoke_license()
    except Exception, e:
        if print_errors:
            print "Error revoking license"
            print e
    try:
        ConnectorHelpers.delete_crawler_user( username )
    except Exception, e:
        if print_errors:
            print "Error removing crawler user"
            print e
    try:
        ConnectorHelpers.teardown_connector_environment( )
    except Exception, e:
        if print_errors:
            print "Error cleaning up debs"
            print e
# Main
if __name__ == '__main__':
print "Precleaning!"
preclean( print_errors=False )
print "Setting up environment."
ConnectorHelpers.setup_connector_environment()
print "Adding crawl user."
ConnectorHelpers.create_crawler_user( username, password )
ConnectorHelpers.define_gts_outputconnection( )
print "Setting up file area."
# We need enough documents to allow us to estimate fetch rate accurately; I
# think 3x1,000 documents will do nicely, since it also fits within the "max fetch rate report" max window size of 5,000.
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/BOTH/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/SLOW/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
level0 = 0
while level0 < 10:
level1 = 0
while level1 < 10:
pathname = "/common/crawlarea/FAST/%d/%d" % (level0,level1)
copy_folder("/root/largefiles",pathname)
level1 += 1
level0 += 1
    # Set up the connection with two different throttles, for bins "FAST" and "SLOW".
    print "Dissimilar throttle test."

    # Define the repository connection. The throttle rates are in documents per
    # minute. We have to stay clear of the probable upper fetch rate of about
    # 100 docs/sec (6,000 docs per minute), but we can certainly aspire to make
    # the test run as quickly as possible otherwise.
    ConnectorHelpers.define_filesystem_repository_connection_ui( username, password,
        "FileSystem", "FileSystem Connection",
        max_connections=100,
        throttles=[ ("FAST","All documents in the FAST bin",1000), ("SLOW","All documents in the SLOW bin",100) ] )
    # Define job
    doc_spec_xml = '<?xml version="1.0" encoding="UTF-8"?><specification><startpoint path="/common/crawlarea"><include match="*.htm" type="file"/><include match="*" type="directory"/></startpoint></specification>'
    job_id = ConnectorHelpers.define_job( "Test job",
        "FileSystem",
        doc_spec_xml )

    # Run the job, and let it complete
    start_time = time.time()
    ConnectorHelpers.start_job( job_id )
    ConnectorHelpers.wait_job_complete( job_id )
    # Now, check whether we exceeded any of the throttle rates for the different
    # classes. If we did, it may mean that throttling is not working correctly for
    # documents with multiple bins. I'm leaving the window size at 5 minutes, even
    # though the overall crawl is expected to take only 10 minutes, because we do
    # want to assess whether a burst might take place.
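    # A document under BOTH matches both the FAST and SLOW bins, so its effective
    # ceiling is the slower throttle (100 docs/min); that is why BOTH is held to
    # the same 110 docs/min limit as SLOW below.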
    for bin_name, rate_limit in [ ("SLOW",110.0), ("BOTH",110.0), ("FAST",1100.0) ]:
        rate_report = ConnectorHelpers.run_max_activity_history_report_ui( username, password, "FileSystem",
            [ "read document" ],
            entity_regexp="/%s/" % bin_name,
            entity_bin_regexp="()" )
        if len(rate_report) != 1:
            raise Exception("Expected max activity report to have exactly one row, instead saw %d" % len(rate_report))
        max_rate = float(rate_report[0]["Highest Activity Rate [per min]"])
        if max_rate > rate_limit:
            raise Exception("Max activity rate for %s docs should not have exceeded one-sigma value of %f docs per minute; instead saw %f" % (bin_name,rate_limit,max_rate))
    # Next, use the simple report generator to get an idea of the overall fetch
    # rate for each document class. The BOTH and SLOW rates should be similar.
    slow_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/SLOW/" )
    fast_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/FAST/" )
    both_history = ConnectorHelpers.run_simple_history_report_ui( username, password, "FileSystem",
        [ "read document" ],
        entity_regexp="/BOTH/" )
    if len(slow_history) == 0:
        raise Exception("There should be documents in the history with /SLOW/ in the file path!")
    if len(fast_history) == 0:
        raise Exception("There should be documents in the history with /FAST/ in the file path!")
    if len(both_history) == 0:
        raise Exception("There should be documents in the history with /BOTH/ in the file path!")

    # Use the last fetch time for each class (which is the first row of the result)
    # to establish a crawl interval.
    slow_end_time = ConnectorHelpers.parse_date_time(slow_history[0]["Start Time"])
    fast_end_time = ConnectorHelpers.parse_date_time(fast_history[0]["Start Time"])
    both_end_time = ConnectorHelpers.parse_date_time(both_history[0]["Start Time"])
    # Calculate a read rate (in docs per second). Even though we put 1,000
    # documents into each category, there are really 1,211 because we have to
    # count the directories too.
    slow_rate = 1211.0 / (slow_end_time - start_time)
    fast_rate = 1211.0 / (fast_end_time - start_time)
    both_rate = 1211.0 / (both_end_time - start_time)
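    # (Directory arithmetic: each class has its top-level dir, 10 level0 dirs,
    # 100 level1 dirs, and 100 copied "largefiles" dirs -- 211 directories -- plus,
    # assuming /root/largefiles holds 10 documents, 100 x 10 = 1,000 files, giving
    # 1,211 fetches in all.)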
print "The approximate rates are: %f (SLOW), %f (FAST), %f (BOTH)" % ((slow_rate * 60.0),(fast_rate * 60.0),(both_rate * 60.0))
    # The document priorities should be calculated so as to avoid overweighting
    # the "FAST" pool at the expense of the "SLOW" or "BOTH" pools. It is therefore
    # part of this test that the rates we've determined hold approximately the
    # correct ratios to one another, even if the whole system "runs behind": FAST
    # should be approximately 10x SLOW and 10x BOTH.
    if slow_rate * (10.0 * 0.65) > fast_rate or slow_rate * (10.0 * 1.35) < fast_rate:
        raise Exception("SLOW documents : FAST documents ratio was out of range: %f" % (fast_rate/slow_rate))
    if both_rate * (10.0 * 0.65) > fast_rate or both_rate * (10.0 * 1.35) < fast_rate:
        raise Exception("BOTH documents : FAST documents ratio was out of range: %f" % (fast_rate/both_rate))
    if slow_rate * 0.65 > both_rate or slow_rate * 1.35 < both_rate:
        raise Exception("SLOW documents : BOTH documents ratio was out of range: %f" % (both_rate/slow_rate))
print "Cleanup Dissimilar Throttle Test."
ConnectorHelpers.delete_job( job_id )
ConnectorHelpers.wait_job_deleted( job_id )
ConnectorHelpers.delete_filesystem_repository_connection_ui( username, password, "FileSystem" )
ConnectorHelpers.delete_gts_outputconnection( )
ConnectorHelpers.delete_crawler_user( username )
delete_folder("/common/crawlarea")
LicenseMakerClient.revoke_license
ConnectorHelpers.teardown_connector_environment( )
print "Scheduling/Throttling ConnectorFramework tests PASSED"