legacy-tests/webconnector_basic.py - manifoldcf - Git at Google

 #!/usr/bin/python

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements. See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import pg
 import sys
 import time
 import ConnectorHelpers
 import WebConnectorHelpers
 import sqatools.appliance
 from wintools import sqa_domain_info
 from wintools import filetools
 from wintools import ambassador_client
 from sqatools import LicenseMakerClient
 import TestDocs
 import VirtualBrowser

 sys.path.append("/usr/lib/metacarta")

 import MetaCartaVersion

 # Server name to talk to
 webServerName = None
 # Domain
 webDomain = None
 # User
 webUser = None
 # Password
 webPassword = None

 # Server port to talk to
 webServerPort = "81"
 # Secure server port to talk to
 webServerSecurePort = "444"
 # Session authentication site web server
 auth_web_server = "crystal.metacarta.com"

 # Names of feeds
 feed_names = [ "feed1.xml", "feed2.xml", "feed3.xml", "feed4.xml", "feed5.xml", "feed6.xml", "feed7.xml", "feed8.xml", "feed10.xml" ]

 # Names of documents that match 'redirected'
 redirected_documents = [ "redirected_2635.htm" ]

 # Names of documents that match 'atom'
 atom_documents = [ "atom_article_1.htm",
                    "atom_article_2.htm",
                    "atom_article_3.htm",
                    "atom_article_4.htm",
                    "atom_article_5.htm",
                    "atom_article_6.htm",
                    "atom_article_7.htm",
                    "atom_article_8.htm",
                    "atom_article_9.htm",
                    "atom_article_10.htm",
                    "atom_article_11.htm",
                    "atom_article_12.htm",
                    "atom_article_13.htm",
                    "atom_article_14.htm",
                    "atom_article_15.htm" ]

 # Names of documents that match 'latah'
 latah_documents = [ "latah_2635.htm",
                         "latah_2637.htm",
                         "latah_2638.htm",
                         "latah_2639.htm",
                         "latah_2640.htm",
                         "latah_2641.htm",
                         "latah_2642.htm",
                         "latah_2643.htm",
                         "latah_2644.htm",
                         "latah_2645.htm",
                         "latah_2646.htm",
                         "latah_2647.htm",
                         "latah_2649.htm",
                         "latah_2650.htm",
                         "latah_2651.htm",
                         "latah_2652.htm",
                         "latah_2653.htm",
                         "latah_2654.htm",
                         "latah_2655.htm",
                         "latah_2656.htm",
                         "latah_2657.htm",
                         "latah_2658.htm",
                         "latah_2659.htm",
                         "latah_2660.htm",
                         "latah_2661.htm",
                         "latah_2662.htm",
                         "latah_2663.htm",
                         "latah_2664.htm",
                         "latah_2665.htm",
                         "latah_2666.htm",
                         "latah_2667.htm" ]
 # Names of documents that match 'Jamestown'
 jamestown_specialdocument = "jamestown_58996.htm"
 jamestown_basedocuments = [ "jamestown_58978.htm",
                         "jamestown_58980.htm",
                         "jamestown_58981.htm",
                         "jamestown_58982.htm",
                         "jamestown_58987.htm",
                         "jamestown_58988.htm",
                         "jamestown_58991.htm",
                         "jamestown_58993.htm",
                         "jamestown_58994.htm",
                         "jamestown_58995.htm",
                         "jamestown_58997.htm",
                         "jamestown_59011.htm" ]
 jamestown_documents = jamestown_basedocuments + [ jamestown_specialdocument ]

 # Names of documents that match 'Allafrica'
 allafrica_specialdocument = "200712310708.html"
 allafrica_basedocuments = [ "200712310370.html",
                         "200712310447.html",
                         "200712310449.html",
                         "200712310454.html",
                         "200712310587.html",
                         "200712310588.html",
                         "200712310590.html",
                         "200712310632.html",
                         "200712310684.html",
                         "200712310698.html",
                         "200712310711.html",
                         "200712310712.html",
                         "200712310714.html",
                         "200712310715.html",
                         "200712310718.html",
                         "200712310744.html",
                         "200712310746.html",
                         "200712310748.html",
                         "200712310760.html",
                         "200712310795.html",
                         "200712310799.html",
                         "200712310808.html",
                         "200712310811.html",
                         "200712310816.html",
                         "200712310817.html",
                         "200712310820.html",
                         "200712310825.html",
                         "200712310829.html",
                         "200712310876.html" ]
 allafrica_documents = allafrica_basedocuments + [ allafrica_specialdocument ]

 # Names of documents that match "Chinanews"
 chinanews_specialdocument = "237544.htm"
 chinanews_basedocuments = [ "237532.htm",
                         "237543.htm",
                         "237545.htm",
                         "237546.htm",
                         "237548.htm",
                         "237552.htm",
                         "237554.htm",
                         "237560.htm",
                         "237566.htm",
                         "237568.htm",
                         "237575.htm",
                         "237611.htm",
                         "237613.htm",
                         "237614.htm",
                         "237617.htm",
                         "237619.htm",
                         "237636.htm",
                         "237645.htm",
                         "237664.htm" ]
 chinanews_documents = chinanews_basedocuments + [ chinanews_specialdocument ]

 # Names of documents that match "Baghdadfeed"

 baghdadfeed_specialdocument = "baghdad_sacrifice.html"
 baghdadfeed_basedocuments = [ "baghdad_green-zone-blue.html",
                         "baghdad_hot-water.html",
                         "baghdad_i-just-want-one.html",
                         "baghdad_in-the-glass-ca.html",
                         "baghdad_leaving.html",
                         "baghdad_return.html",
                         "baghdad_sage.html",
                         "baghdad_stray-bullets.html",
                         "baghdad_truth.html" ]
 baghdadfeed_documents = baghdadfeed_basedocuments + [ baghdadfeed_specialdocument ]

 # Names of documents that match "sportchannel"
 sportschannel_documents = [ "feed3.htm?Itemid=42&id=3018&option=com_content&task=view",
                         "feed3.htm?Itemid=148&id=3020&option=com_content&task=view",
                         "feed3.htm?Itemid=148&id=3016&option=com_content&task=view",
                         "feed3.htm?Itemid=40&id=3014&option=com_content&task=view",
                         "feed3.htm?Itemid=148&id=3017&option=com_content&task=view",
                         "feed3.htm?Itemid=40&id=3015&option=com_content&task=view",
                         "feed3.htm?Itemid=43&id=3019&option=com_content&task=view",
                         "feed3.htm?Itemid=148&id=3022&option=com_content&task=view",
                         "feed3.htm?Itemid=43&id=3013&option=com_content&task=view",
                         "feed3.htm?Itemid=43&id=3021&option=com_content&task=view" ]

 # Names of documents that match "Bostoncom"
 bostoncomfeed_basedocuments = [ "cannon_mountain_offering_new_ski_pass_next_year.htm",
                         "harvard_will_not_accept_transfers_for_2_years.htm",
                         "lufthansa_emergency_landing_in_poland.htm",
                         "winning_firm_gave_to_dimasi_charity.htm",
                         "gas_prices_down_a_penny_in_massachusetts.htm",
                         "detroits_democrat_mayor_indicted_in_sex_scandal.htm",
                         "bush_mourns_all_4000_dead_in_iraq_white_house.htm",
                         "kevorkian_kicks_off_congressional_run.htm",
                         "mogadishu_port_slowly_changing_lives_in_somalia.htm",
                         "bill_would_exonerate_witches.htm",
                         "courthouses_bold_art_draws_a_mixed_verdict.htm",
                         "many_arabs_fear_mccain_would_continue_bush_policy.htm",
                         "rice_urges_china_talks_with_dalai_lama.htm",
                         "bush_committed_to_iraq_success.htm",
                         "way_cleared_for_removal_of_portion_of_fort_halifax_dam.htm",
                         "worlds_tallest_man_struggles_to_fit_in.htm",
                         "5_killed_in_iowa_city_shooting.htm",
                         "gas_prices_drop_a_penny_in_ocean_state.htm",
                         "detroit_mayor_charged_with_perjury.htm",
                         "white_house_hosts_annual_egg_roll.htm",
                         "committee_passes_that_makes_noose_hanging_a_crime.htm",
                         "muslims_question_vatican_baptism_of_islamic_critic.htm",
                         "fairpoint_announces_tentative_accord_with_unions.htm",
                         "logging_truck_operator_stung_by_high_cost_of_diesel_fuel.htm",
                         "hezbollah_says_israel_prisoner_swap_talks_go_on.htm",
                         "comoros_warns_rebel_island_of_attack.htm",
                         "kwame_kilpatrick_at_a_glance.htm",
                         "skycaps_sue_airline_over_tips_lost_to_bag_fee.htm",
                         "college_to_buy_renewable_energy_offsets_from_wind_farm.htm",
                         "state_health_plan_underfunded.htm",
                         "entry_point.htm",
                         "rice_urges_chinese_to_listen_to_dalai_lama_on_tibet.htm",
                         "holiday_or_not_schools_gird_for_absences.htm",
                         "the_battle_scarred_caretakers.htm",
                         "coming_up_dry_on_bottles.htm" ]
 bostoncomfeed_documents = bostoncomfeed_basedocuments

 special_html_documents = [ "big-apple-barbe.html", "flying-disc-ranch.html" ]

 # Transferable documents
 unmappable_transferable_documents = latah_documents + jamestown_documents + allafrica_documents + \
         chinanews_documents + baghdadfeed_documents + bostoncomfeed_documents + atom_documents + [ "feed20.xml" ] + \
         special_html_documents
 mappable_transferable_documents = feed_names + \
         [ "feed3.htm", "modified1.htm", "modified2.htm", "feed16.xml", "feed17.xml" ]
 transferable_documents = unmappable_transferable_documents + mappable_transferable_documents

 def make_web_url(folder_path, location=""):
     if int(webServerPort) == 80:
         return "%s%s/%s" % (webServerName,location,folder_path)
     else:
         return "%s:%s%s/%s" % (webServerName,webServerPort,location,folder_path)

 def make_secure_web_url(folder_path, location=""):
     if int(webServerSecurePort) == 443:
         return "%s%s/%s" % (webServerName,location,folder_path)
     else:
         return "%s:%s%s/%s" % (webServerName,webServerSecurePort,location,folder_path)

 def clear_robots_cache():
     """ Clean out robots cache. """
     ConnectorHelpers.shutdown_agents()

     # Clear out robots database table
     db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
     try:
         db.query( "DELETE FROM robotsdata" )
     finally:
         db.close()

     ConnectorHelpers.start_agents()

 def clear_session_cache():
     """ Clean out robots cache. """
     ConnectorHelpers.shutdown_agents()

     # Clear out robots database table
     db = pg.DB( "metacarta", "localhost", 5432, None, None, "metacarta", "atracatem" )
     try:
         db.query( "DELETE FROM cookiedata" )
     finally:
         db.close()

     ConnectorHelpers.start_agents()

 # Crawl user credentials
 username = "testingest"
 password = "testingest"

 def preclean( ad_domain_info, print_errors=True ):
     ''' Clean up everything we might have done during the execution of this test.
         This will include all jobs and ingested documents. '''

     try:
         ConnectorHelpers.reset_all()
     except Exception, e:
         if print_errors:
             print "Error resetting all jobs"
             print e

     # Clean up the documents we dumped into the folders on the server
     for document in transferable_documents:
         try:
             WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+document )
         except Exception, e:
             if print_errors:
                 print "Error deleting test document %s" % document
                 print e

     # Clean up redirected documents
     for document in redirected_documents:
         try:
             WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "redirected_target_content\\"+document)
         except Exception, e:
             if print_errors:
                 print "Error deleting test document %s" % document
                 print e

     # Clean up robots.txt
     try:
         WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\robots.txt" )
     except Exception, e:
         if print_errors:
             print "Error removing robots.txt"
             print e

     try:
         ConnectorHelpers.delete_crawler_user( username )
     except Exception, e:
         if print_errors:
             print "Error deleting crawl user"
             print e

     try:
         LicenseMakerClient.revoke_license()
     except Exception, e:
         if print_errors:
             print "Error cleaning up old license"
             print e

     try:
         ConnectorHelpers.teardown_connector_environment( )
     except Exception, e:
         if print_errors:
             print "Error cleaning up debs"
             print e

 # Main
 if __name__ == '__main__':

     # AD parameters
     ad_group = "76"
     srvrname = "w2k3-shp-76-1"

     if len(sys.argv) > 1:
         ad_group = sys.argv[1]

     ad_domain_info = sqa_domain_info.SQADomainInfo( ad_group )

     webDomain = ad_domain_info.dns_domain.upper()
     webServerName = getattr(ad_domain_info,"web_server_fqdn");

     # User
     webUser = ad_domain_info.realm_admin.split("@")[0]
     # Password
     webPassword = ad_domain_info.realm_admin_password

     print "Precleaning!"

     preclean( ad_domain_info, print_errors=False )
     clear_robots_cache()

     print "Setup Connector Environment."
     ConnectorHelpers.setup_connector_environment()

     print "Setting up ingestion user."
     ConnectorHelpers.create_crawler_user( username, password )

     print "Testing how UI handles bad license."
     sqatools.appliance.install_license(extra_services=[], detect_gdms=True)

     # Restart, since otherwise we may have already passed the license check
     ConnectorHelpers.restart_tomcat()
     time.sleep(10)

     WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com" )
     # Viewing the connection should NOT give 'Connection working'!
     saw_error = True
     try:
         ConnectorHelpers.view_repository_connection_ui( username, password, "WEBConnection" )
         saw_error = False
     except:
         pass

     if saw_error == False:
         raise Exception("Licensing off but did not see license error!")

     ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     LicenseMakerClient.revoke_license()

     print "Setting up license."
     sqatools.appliance.install_license(extra_services=["webConnector"], detect_gdms=True)

     ConnectorHelpers.define_gts_outputconnection( )

     # PHASE 0: Canonicalization rules test

     print "Canonicalization rules test."


     # The seed urls we will use for ALL tests are the following:
     canonicalization_seed_urls = []
     for url_args in [ "/s(qqq)/random/path/stuff;jsessionid=zzz?arg1=A&arg2=B&PHPSESSID=xxx&arg3=C&arg1=D&BVSession@@@@=yyy" ]:
         if int(webServerPort) == 80:
             new_url = "http://%s%s" % (webServerName,url_args)
         else:
             new_url = "http://%s:%s%s" % (webServerName,webServerPort,url_args)

         canonicalization_seed_urls += [ new_url ]

     # Loop over a set of combination tuples
     for canon_parameters, expected_results in [ (("",None,"yes","no","no","no","no"),["/s(qqq)/random/path/stuff;jsessionid=zzz?BVSession@@@@=yyy&PHPSESSID=xxx&arg1=A&arg1=D&arg2=B&arg3=C"]),
                                         (("",None,"no","yes","yes","yes","yes"),["/random/path/stuff?arg1=A&arg2=B&arg3=C&arg1=D"]),
                                         (("",None,"yes","no","yes","no","yes"),["/random/path/stuff;jsessionid=zzz?PHPSESSID=xxx&arg1=A&arg1=D&arg2=B&arg3=C"]) ]:

         # Since we're evaluating canonicalization using the report feature, set up and tear down connection for each round, so the history is clean
         # Set up a repository connection
         WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com" )

         # Build and run a job
         job_id = WebConnectorHelpers.define_web_job_ui( username,
                         password,
                         "WEBJob",
                         "WEBConnection",
                         canonicalization_seed_urls,
                         canonicalization_rules=[ canon_parameters ],
                         inclusions = [ "^http://%s:%s" % (webServerName,webServerPort) ] )

         # Run the job to completion
         ConnectorHelpers.start_job( job_id )
         ConnectorHelpers.wait_job_complete( job_id )

         # See what the report says about the fetches we did
         results = ConnectorHelpers.run_simple_history_report_api( "WEBConnection", [ "fetch" ] )
         # Throw the results into a map based on document identifier, so we can look uris up
         result_map = {}
         for result in results:
             document_uri = result["identifier"]
             result_map[document_uri] = result

         for expected_result in expected_results:
             if int(webServerPort) == 80:
                 full_url = "http://%s%s" % (webServerName,expected_result)
             else:
                 full_url = "http://%s:%s%s" % (webServerName,webServerPort,expected_result)

             if not result_map.has_key(full_url):
                 raise Exception("Expected to find canonicalized URL fetch for %s, but didn't; actual: %s" % (full_url,str(results)))

         # Delete the job
         ConnectorHelpers.delete_job( job_id )
         ConnectorHelpers.wait_job_deleted( job_id )

         # Tear down the initial repository connection
         ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     # PHASE 1: Ingestion
     print "Ingestion Test."

     # Add all docs to the repository
     map = { "%server%" : webServerName }
     for document in mappable_transferable_documents:
         WebConnectorHelpers.add_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+document, "/root/rssfeeds/"+document, map=map)
     for document in unmappable_transferable_documents:
         WebConnectorHelpers.add_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+document, "/root/rssfeeds/"+document)
     # Add redirected documents
     for document in redirected_documents:
         WebConnectorHelpers.add_document(webServerName, webUser+"@"+webDomain, webPassword, "redirected_target_content\\"+document, "/root/rssfeeds/"+document)

     url_list = []
     for feed_name in feed_names:
         url_list.append( "https://"+make_secure_web_url(feed_name) )

     # Do one url that isn't in fact on the server, so we can see what the crawler does under those conditions, and two feeds that share the same dead link,
     # Also give it a feed that has illegal utf-8 characters and points to documents that will not be found.
     for feed_name in [ "does_not_exist.xml", "feed16.xml", "feed17.xml", "feed20.xml" ]:
         url_list.append( "https://"+make_secure_web_url(feed_name) )

     # Add in html documents which did not parse correctly before.  These should be fetched and parsed, without
     # blocking the job from completing.
     for doc_name in special_html_documents:
         url_list.append( "https://"+make_secure_web_url(doc_name) )

     # Define repository connection
     WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com",
                                         page_access_credentials=[{"type":"ntlm","domain":webDomain,"username":webUser,"password":webPassword}],
                                         certificates=[{"certificate":"livelinksrvr/bigiiscax509.cer"}])

     # Define job, including special documents that we want processed and extracted
     job_id = WebConnectorHelpers.define_web_job_ui( username,
                         password,
                         "WEBJob",
                         "WEBConnection",
                         url_list ,
                         user_metadata=[ ("test_metadata_1", "hello"), ("test_metadata_2", "there"), ("test_metadata_1", "charlie") ],
                         inclusions = [ "http://www\\.theflatulent\\.com", "^https://%s:%s" % (webServerName,webServerSecurePort), "^http://%s:%s" % (webServerName,webServerPort) ] )


     # Run the job to completion
     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )

     # Wait until ingest has caught up
     ConnectorHelpers.wait_for_ingest( )

     # See if we can find the documents we just ingested

     # Special documents first - just want to be sure we saw them ingested properly...
     ConnectorHelpers.search_check( [ "velvety" ], None, [ make_secure_web_url( "big-apple-barbe.html" ) ] )
     ConnectorHelpers.search_check( [ "greenpoint" ], None, [ make_secure_web_url( "big-apple-barbe.html" ), make_secure_web_url( "flying-disc-ranch.html" ) ] )

     url_list = []
     for document in redirected_documents:
         url_list.append( make_web_url(document,location="/redirect_target") )
     url_list.append( make_secure_web_url( "feed8.xml" ) )
     ConnectorHelpers.search_check( [ "redirected" ], None, url_list )
     url_list = []
     for document in latah_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url( "feed1.xml" ) )
     url_list.append( make_secure_web_url( "feed8.xml" ) )
     ConnectorHelpers.search_check( [ "latah" ], None, url_list )
     url_list = []
     for document in atom_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url( "feed20.xml" ) )
     ConnectorHelpers.search_check( [ "atom" ], None, url_list )
     url_list = []
     for document in allafrica_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url( "feed4.xml" ) )
     url_list.append( make_web_url(allafrica_specialdocument) )
     ConnectorHelpers.search_check( [ "Allafrica" ], None, url_list )
     url_list = []
     for document in chinanews_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_web_url(chinanews_specialdocument) )
     ConnectorHelpers.search_check( [ "Chinanews" ], None, url_list )
     url_list = []
     for document in jamestown_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url( "feed5.xml" ) )
     url_list.append( make_web_url(jamestown_specialdocument) )
     ConnectorHelpers.search_check( [ "Jamestown" ], None, url_list )

     # Look for user metadata too
     ConnectorHelpers.search_check( [ "Jamestown", "metadata:test_metadata_1=hello" ], None, url_list )
     ConnectorHelpers.search_check( [ "Jamestown", "metadata:test_metadata_1=charlie" ], None, url_list )
     ConnectorHelpers.search_check( [ "Jamestown", "metadata:test_metadata_2=there" ], None, url_list )

     url_list = []
     for document in baghdadfeed_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_web_url(baghdadfeed_specialdocument) )
     ConnectorHelpers.search_check( [ "Baghdadfeed" ], None, url_list )
     url_list = []
     for document in sportschannel_documents:
         url_list.append( make_web_url(document) )
     ConnectorHelpers.search_check( [ "sportschannel" ], None, url_list )
     url_list = []
     for document in bostoncomfeed_documents:
         url_list.append( make_web_url(document) )
     ConnectorHelpers.search_check( [ "bostoncom" ], None, url_list )

     # For the feed that didn't exist, check to make sure exactly one fetch attempt was made.
     results = ConnectorHelpers.run_simple_history_report_api( "WEBConnection", [ "fetch" ], entity_regexp="does_not_exist\\.xml" )
     if len(results) != 1:
         raise Exception("Expected exactly one fetch attempt of non-existent feed, instead saw %d" % len(results))
     if results[0]["result_code"] != "404":
         raise Exception("Expected fetch result code of 404, instead saw %s" % results[0]["result_code"])

     # For the document that didn't exist, check that exactly one fetch attempt was made.
     results = ConnectorHelpers.run_simple_history_report_api( "WEBConnection", [ "fetch" ], entity_regexp="does_not_exist_on_server\\.htm" )
     if len(results) != 1:
         raise Exception("Expected exactly one fetch attempt of non-existent shared document, instead saw %d" % len(results))
     if results[0]["result_code"] != "404":
         raise Exception("Expected fetch result code of 404, instead saw %s" % results[0]["result_code"])

     # For the documents in the feed with bad utf-8 characters, make sure the appropriate fetches were all attempted.
     for url in [ "www\\.theflatulent\\.com/seven/04012010/entertainment/travel/_42_staunton__virginia_167773\\.htm",
             "www\\.theflatulent\\.com/seven/05152010/sports/boxing/opportunity_knocks_for_vargas_169473\\.htm",
             "www\\.theflatulent\\.com/seven/06072009/news/nationalnews/bam_threat_nut_busted_172944\\.htm" ]:
         full_url = "http://" + url
         results = ConnectorHelpers.run_simple_history_report_api( "WEBConnection", [ "fetch" ], entity_regexp=full_url )
         if len(results) != 1:
             raise Exception("Expected exactly one fetch attempt of %s, instead saw %d" % (full_url,len(results)))
         if results[0]["result_code"] != "-10":
             raise Exception("For %s, expected fetch result code of -10, instead saw %s" % (full_url,results[0]["result_code"]))

     # Success: done
     print "Done ingestion test."


     # PHASE 2: Document Change Detection

     print "Document Change Test."
     # Create two modified documents
     WebConnectorHelpers.version_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+chinanews_specialdocument, "/root/rssfeeds/modified1.htm" )
     WebConnectorHelpers.version_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+jamestown_specialdocument, "/root/rssfeeds/modified2.htm" )

     # Restart job, which should pick up the changes
     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )

     # Wait until ingest has caught up
     ConnectorHelpers.wait_for_ingest( )

     # Look for state of index being right
     ConnectorHelpers.search_check( [ "modified" ], None, [ make_web_url(chinanews_specialdocument), make_web_url(jamestown_specialdocument) ] )
     url_list = []
     for document in jamestown_basedocuments:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url("feed5.xml") )
     ConnectorHelpers.search_check( [ "Jamestown" ], None, url_list )
     url_list = []
     for document in chinanews_basedocuments:
         url_list.append( make_web_url(document) )
     ConnectorHelpers.search_check( [ "Chinanews" ], None, url_list )

     print "Done Document Change Test."

     # PHASE 3: Document Delete Detection

     print "Document Delete Test."
     WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+baghdadfeed_specialdocument )
     WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+allafrica_specialdocument )

     # Restart job, which should pick up the changes
     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )

     url_list = []
     for document in baghdadfeed_basedocuments:
         url_list.append( make_web_url(document) )
     ConnectorHelpers.search_check( [ "Baghdadfeed" ], None, url_list )
     url_list = []
     for document in allafrica_basedocuments:
         url_list.append( make_web_url(document) )
     url_list.append( make_secure_web_url("feed4.xml") )
     ConnectorHelpers.search_check( [ "Allafrica" ], None, url_list )

     print "Done Document Delete Test."

     # PHASE 5: Delete Job

     print "Job Delete Test."
     ConnectorHelpers.delete_job( job_id )
     ConnectorHelpers.wait_job_deleted( job_id )

     # Make sure the documents all went away
     ConnectorHelpers.search_check( [ "latah" ], None, [] )
     ConnectorHelpers.search_check( [ "atom" ], None, [] )
     ConnectorHelpers.search_check( [ "Allafrica" ], None, [] )
     ConnectorHelpers.search_check( [ "Chinanews" ], None, [] )
     ConnectorHelpers.search_check( [ "Jamestown" ], None, [] )
     ConnectorHelpers.search_check( [ "Baghdadfeed" ], None, [] )
     ConnectorHelpers.search_check( [ "sportschannel" ], None, [] )
     ConnectorHelpers.search_check( [ "bostoncom" ], None, [] )
     ConnectorHelpers.search_check( [ "modified" ], None, [] )


     print "Done Job Delete Test."

     print "Assessing robots.txt combinations"

     url_list = []
     for document in latah_documents:
         url_list.append( make_web_url(document) )
     url_list.append( make_web_url("feed1.xml"))
     url_list.append( make_web_url("feed8.xml"))

     redirected_url_list = []
     for document in redirected_documents:
         redirected_url_list.append( make_web_url(document,location="/redirect_target") )
     redirected_url_list.append( make_web_url("feed8.xml"))

     feed_url_list = [ make_web_url("feed1.xml"), make_web_url("feed8.xml") ]
     feed_redirected_url_list = [ make_web_url("feed8.xml") ]

     # Cycle through a bunch more robots.txt variants.  Each variant should either show documents, or not.
     #robots.txt - Verbatim NYPost robots test, tests inclusion where agents matches, no disallow or allow for any interesting items, blank disallow lines, LF-style newlines,
     #                   check that comment at start of line doesn't disrupt parse.
     #robots_1.txt - Tests exclusion due to no matching agent, with subsequent "*" agent causing the disallow, also bad line: "alsdfkjasdjfhaklsdfh", LF-style newlines,
     #                   check that commented-out rule is actually ignored.
     #robots_2.txt - Tests agent match case insensitivity, and match in the middle of the agents value, LF-style newlines
     #robots_3.txt - Tests ordering, where "*" agent appears earlier than substring agent match, LF-style newlines
     #robots_4.txt - Tests precedence of allow operation over disallow operation for an agent, also specific path disallows, and CRLF-style newlines,
     #                   check that trailing comment is not disruptive.
     #robots_5.txt - Tests the other half of the allow/disallow precedence requirement, LF-style newlines
     #robots_6.txt - Test what happens when there is no matching agent at all (including * embedded in a string); LF-style newlines
     #robots_7.txt - A standard HTML page returned by www.11alive.com/robots.txt
     for robots_file,show_documents,show_redirected,show_feeds in [ ("robots.txt", True, True, False),
                                                                             ("robots_1.txt", False, False, False),
                                                                             ("robots_2.txt", True, True, False),
                                                                             ("robots_3.txt", True, True, False),
                                                                             ("robots_4.txt", False, True, True),
                                                                             ("robots_5.txt", True, True, False),
                                                                             ("robots_6.txt", True, True, False),
                                                                             ("robots_7.txt", True, True, False),
                                                                             ("robots_8.txt", True, False, True),
                                                                             ("robots_9.txt", True, True, True) ]:
         # Copy the specified robots file to the server
         WebConnectorHelpers.add_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\robots.txt", "/root/rssfeeds/"+robots_file)

         # Restart metacarta-agents, so that robots cache is flushed.
         clear_robots_cache()

         print "Assessing robots file %s..." % robots_file


         # Create a simple job and crawl it
         job_id = WebConnectorHelpers.define_web_job_ui( username,
                                 password,
                                 "WEBJob",
                                 "WEBConnection",
                                 [ "http://"+make_web_url("feed1.xml"), "http://"+make_web_url("feed8.xml") ],
                                 inclusions = [ "^http://%s:%s" % (webServerName,webServerPort) ] )

         # Run the job to completion
         ConnectorHelpers.start_job( job_id )
         ConnectorHelpers.wait_job_complete( job_id )

         # Wait until ingest has caught up
         ConnectorHelpers.wait_for_ingest( )

         # EITHER check for existence of documents, OR check for non-existence
         if show_documents:
             ConnectorHelpers.search_check( [ "latah" ], None, url_list )
         else:
             if show_feeds:
                 ConnectorHelpers.search_check( [ "latah" ], None, feed_url_list )
             else:
                 ConnectorHelpers.search_check( [ "latah" ], None, [] )

         if show_redirected:
             ConnectorHelpers.search_check( [ "redirected" ], None, redirected_url_list )
         else:
             if show_feeds:
                 ConnectorHelpers.search_check( ["redirected"], None, feed_redirected_url_list )
             else:
                 ConnectorHelpers.search_check( [ "redirected" ], None, [] )

         # Delete the job
         ConnectorHelpers.delete_job( job_id )
         ConnectorHelpers.wait_job_deleted( job_id )

         # Remove the robots.txt that's on the server
         WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\robots.txt" )

     # Finally, do a robots parsing report, and compare the stats against what we should expect.
     report_result = ConnectorHelpers.run_result_histogram_history_report_ui(username, password, "WEBConnection",
         ["robots parse"],
         entity_bin_regexp="()",
         result_bin_regexp="(.*)")

     # We expect SUCCESS, ERRORS, and HTML back - so that's three rows
     if len(report_result) != 3:
         raise Exception("Expecting exactly three rows in robots parsing report; got %d" % len(report_result))
     result_class_dict = {}
     for result_row in report_result:
         event_count = int(result_row["Event Count"])
         result_class = result_row["Result Class"]
         result_class_dict[result_class] = event_count

     for expected_event_count,expected_result_class in [(7,"success"),(2,"errors"),(1,"html")]:
         actual_event_count = result_class_dict[expected_result_class]
         if actual_event_count != expected_event_count:
             raise Exception("Actual event count %d and expected event count %d differ for robots parsing return class %s" % (actual_event_count,expected_event_count,expected_result_class))

     print "Done assessing robots.txt logic"

     ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     # Clear robot cache
     clear_robots_cache()

     # PHASE 5.9: Session authentication test
     print "Assessing session authentication..."

     # For this test, we set up crawls against several different sites.  These crawls are meant to force the crawl to go through one or more login sequences to obtain the content in question.
     clear_session_cache()

     # Define repository connection
     WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com",
                                         page_access_credentials=[ { "regexp" : ConnectorHelpers.regexp_encode("://%s:8082/" % auth_web_server),
                                                                                 "type" : "basic",
                                                                                 "username" : "geoff",
                                                                                 "password" : "geoff" } ],
                                         session_access_credentials=[ { "regexp" : ConnectorHelpers.regexp_encode("://%s:8081/" % auth_web_server),
                                                                                     "loginpages" : [ { "regexp" : ConnectorHelpers.regexp_encode("://%s:8081/index.php" % auth_web_server),
                                                                                                                 "pagetype" : "form",
                                                                                                                 "matchexpr" : "^$",
                                                                                                                 "parameters" : [ { "nameregexp" : "^username$",
                                                                                                                                             "value" : "geoff" },
                                                                                                                                         { "nameregexp" : "^password$",
                                                                                                                                             "password" : "geoff" } ] },
                                                                                                             { "regexp" : ConnectorHelpers.regexp_encode("://%s:8081/content_" % auth_web_server),
                                                                                                                 "pagetype" : "link",
                                                                                                                 "matchexpr" : ConnectorHelpers.regexp_encode("://%s:8081/index.php" % auth_web_server) } ] },
                                                                                     { "regexp" : ConnectorHelpers.regexp_encode("://%s:8083/" % auth_web_server),
                                                                                         "loginpages" : [ { "regexp" : ConnectorHelpers.regexp_encode("://%s:8083/dologin.php" % auth_web_server),
                                                                                                                     "pagetype" : "form",
                                                                                                                     "matchexpr" : "^$",
                                                                                                                     "parameters" : [ { "nameregexp" : "^username$",
                                                                                                                                                 "value" : "geoff" },
                                                                                                                                             { "nameregexp" : "^password$",
                                                                                                                                                 "password" : "geoff" } ] },
                                                                                                                 { "regexp" : ConnectorHelpers.regexp_encode("://%s:8083/" % auth_web_server),
                                                                                                                     "pagetype" : "redirection",
                                                                                                                     "matchexpr" : ConnectorHelpers.regexp_encode("://%s:8083/" % auth_web_server) } ] } ] )

     job_id = WebConnectorHelpers.define_web_job_ui( username,
                         password,
                         "WEBJob",
                         "WEBConnection",
                         [ "http://%s:8081/index.php" % auth_web_server,
                           "http://%s:8082/protected.php" % auth_web_server,
                           "http://%s:8083/index.php" % auth_web_server ],
                         inclusions = [ ConnectorHelpers.regexp_encode("://%s" % auth_web_server) ] )

     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )

     ConnectorHelpers.wait_for_ingest()

     # Look for the two content documents we expect to have found
     ConnectorHelpers.search_check( [ "redirection" ], None, [ "%s:8083/content_1.php" % auth_web_server ] )
     ConnectorHelpers.search_check( [ "content" ], None, [ "%s:8081/content_1.php" % auth_web_server, "%s:8083/content_1.php" % auth_web_server, "%s:8082/protected.php" % auth_web_server ] )
     ConnectorHelpers.search_check( [ "basic" ], None, [ "%s:8082/protected.php" % auth_web_server ] )

     # Delete job and connection
     ConnectorHelpers.delete_job( job_id )
     ConnectorHelpers.wait_job_deleted( job_id )

     ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     clear_session_cache()

     # PHASE 6: Max fetch rate throttle test
     print "Max fetch rate throttle test..."

     url_list = []
     for feed_name in feed_names:
         url_list.append( "http://"+make_web_url(feed_name) )

     # Define repository connection
     WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com",
                                         limits=[ { "regexp":"^[^\\.]", "fetchesperminute":str(4) } ] )

     job_id = WebConnectorHelpers.define_web_job_ui( username,
                         password,
                         "WEBJob",
                         "WEBConnection",
                         url_list,
                         inclusions = [ "^http://%s:%s" % (webServerName,webServerPort) ] )

     # Run the job to completion
     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )
     ConnectorHelpers.delete_job( job_id )
     ConnectorHelpers.wait_job_deleted( job_id )

     max_activity_results = ConnectorHelpers.run_max_activity_history_report_ui( username, password, "WEBConnection", [ "fetch" ], entity_bin_regexp="()" )
     if len(max_activity_results) != 1:
         raise Exception("Expecting 1 row in max activity report; got %d" % len(max_activity_results))
     rate_column = float(max_activity_results[0]["Highest Activity Rate [per min]"])
     if rate_column > 4.5:
         raise Exception("Maximum fetch rate exceeded the 1-sigma limit of 4.5 documents per minute; got %f" % rate_column)


     ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     # PHASE 7: Max bandwidth throttle test

     # Define repository connection
     WebConnectorHelpers.define_web_repository_connection_ui( username,
                                         password,
                                         "WEBConnection",
                                         "WEB Connection",
                                         "kwright@metacarta.com",
                                         limits=[ { "regexp":"^[^\\.]", "kbpersecond":str(1) } ] )

     job_id = WebConnectorHelpers.define_web_job_ui( username,
                         password,
                         "WEBJob",
                         "WEBConnection",
                         url_list,
                         inclusions = [ "^http://%s:%s" % (webServerName,webServerPort) ] )

     # Run the job to completion
     ConnectorHelpers.start_job( job_id )
     ConnectorHelpers.wait_job_complete( job_id )

     ConnectorHelpers.delete_job( job_id )
     ConnectorHelpers.wait_job_deleted( job_id )

     max_bandwidth_results = ConnectorHelpers.run_max_bandwidth_history_report_ui( username, password, "WEBConnection", [ "fetch" ], entity_bin_regexp="()", window_size_minutes="5" )
     if len(max_bandwidth_results) != 1:
         raise Exception("Expecting 1 row in max bandwidth report; got %d" % len(max_bandwidth_results))
     rate_column = float(max_bandwidth_results[0]["Highest Bandwidth [bps]"])
     if rate_column > 1200.0:
         raise Exception("Bandwidth has exceeded the one-sigma maximum value of 1200 bps: %f" % rate_column)

     ConnectorHelpers.delete_repository_connection_ui( username, password, "WEBConnection" )

     # Clean up the documents we dumped into the folders on server
     for document in transferable_documents:
         WebConnectorHelpers.remove_document(webServerName, webUser+"@"+webDomain, webPassword, "rss\\"+document )

     ConnectorHelpers.delete_gts_outputconnection( )

     LicenseMakerClient.revoke_license()

     ConnectorHelpers.delete_crawler_user( username )

     ConnectorHelpers.teardown_connector_environment( )

     print "Basic WebConnector tests PASSED"