blob: 63736e7a54d0ecfe64a3939768e409bd9bf961d8 [file] [log] [blame]
#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import ConnectorHelpers
import sqatools.appliance
from wintools import sqa_domain_info
from wintools import filetools
from wintools import ambassador_client
from sqatools import LicenseMakerClient
import VirtualBrowser
# Create a web repository connection via the UI
def define_web_repository_connection_ui( username,
                                         password,
                                         connection_name,
                                         connection_description,
                                         email_address,
                                         robots_value="all",
                                         max_repository_connections=None,
                                         throttles=None,
                                         limits=None,
                                         page_access_credentials=None,
                                         session_access_credentials=None,
                                         certificates=None ):
    """Create a web repository connection by driving the crawler UI.

    Opens http://localhost/crawler/index.jsp in a VirtualBrowser, walks through
    the tabs of the "editconnection" form, saves the connection, and checks the
    connection name reported by the page that comes back.

    username, password -- credentials handed to the VirtualBrowser.
    connection_name, connection_description -- values entered on the "Name" tab.
    email_address -- value entered on the "Email" tab.
    robots_value -- value selected in the "robotsusage" select box; None leaves
        the selection untouched.
    max_repository_connections -- if not None, entered (via str()) into the
        "maxconnections" field.
    throttles -- optional sequence of (regexp, description, rate) tuples.  The
        description may be None, in which case it is not entered.  The rate is
        passed to the field as given (it is not converted with str()).
    limits -- optional sequence of dictionaries with the optional keys "regexp",
        "insensitive", "connections", "kbpersecond" and "fetchesperminute".  A
        missing "regexp" is entered as "()".
    page_access_credentials -- optional sequence of dictionaries with the
        optional keys "regexp", "type", "domain", "username" and "password".
        The understood types are "basic" and "ntlm".
    session_access_credentials -- optional sequence of dictionaries with the
        optional keys "regexp" and "loginpages".  "loginpages" is a sequence of
        dictionaries with the optional keys "regexp", "pagetype", "matchexpr"
        and "parameters"; the understood page types are "form", "link" and
        "redirection".  "parameters" is a sequence of dictionaries with the
        optional keys "nameregexp", "value" and "password".
    certificates -- optional sequence of dictionaries with the optional keys
        "regexp" and "certificate"; "certificate" contains a file name.

    Raises Exception if the connection name found on the page after saving
    differs from connection_name.
    """
    # None stands in for "no entries" so that no list object is shared between calls
    if page_access_credentials is None:
        page_access_credentials = []
    if session_access_credentials is None:
        session_access_credentials = []
    if certificates is None:
        certificates = []

    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    def current_page():
        """Return the (window, form) pair for the page the browser currently shows."""
        page = vb.find_window("")
        return page, page.find_form("editconnection")

    def open_tab(page, link_text):
        """Click the named tab link on page and return the resulting (window, form)."""
        page.find_link(link_text).click()
        return current_page()

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for repository connection management and click it
    vb.find_window("").find_link("List repository connections").click()

    # Click "add a connection"
    vb.find_window("").find_link("Add a connection").click()

    # The window and form handles are fetched again after every click from here on
    window, form = current_page()

    # "Name" tab
    namefield = form.find_textarea("connname")
    descriptionfield = form.find_textarea("description")
    namefield.set_value( connection_name )
    descriptionfield.set_value( connection_description )

    # "Type" tab
    window, form = open_tab(window, "Type tab")
    connectortypefield = form.find_selectbox("classname")
    connectortypefield.select_value( "com.metacarta.crawler.connectors.webcrawler.WebcrawlerConnector" )
    # Click the "Continue" button
    window.find_button("Continue to next page").click()
    window = vb.find_window("")

    # "Throttling" tab
    window, form = open_tab(window, "Throttling tab")
    if max_repository_connections is not None:
        form.find_textarea("maxconnections").set_value( str(max_repository_connections) )
    if throttles is not None:
        for regexp, description, rate in throttles:
            # Add a throttle with the specified parameters
            regexpfield = form.find_textarea("throttle")
            descfield = form.find_textarea("throttledesc")
            valuefield = form.find_textarea("throttlevalue")
            regexpfield.set_value( regexp )
            if description is not None:
                descfield.set_value( description )
            valuefield.set_value( rate )
            window.find_button("Add throttle").click()
            window, form = current_page()

    # "Email" tab
    window, form = open_tab(window, "Email tab")
    # Set web-specific stuff
    form.find_textarea("email").set_value( email_address )

    # "Robots" tab
    window, form = open_tab(window, "Robots tab")
    # WEB robots selection
    if robots_value is not None:
        form.find_selectbox("robotsusage").select_value(robots_value)

    # "Bandwidth" tab
    window, form = open_tab(window, "Bandwidth tab")
    # Do the throttles, if any
    if limits is not None:
        for throttle in limits:
            form.find_textarea("regexp_bandwidth").set_value(throttle.get("regexp", "()"))
            # Only a value that compares equal to True ticks the box, exactly as before
            if "insensitive" in throttle and throttle["insensitive"] == True:
                form.find_checkbox("insensitive_bandwidth").check()
            if "connections" in throttle:
                form.find_textarea("connections_bandwidth").set_value(str(throttle["connections"]))
            if "kbpersecond" in throttle:
                form.find_textarea("rate_bandwidth").set_value(str(throttle["kbpersecond"]))
            if "fetchesperminute" in throttle:
                form.find_textarea("fetches_bandwidth").set_value(str(throttle["fetchesperminute"]))
            # Click the "add" button
            window.find_button("Add bin regular expression").click()
            window, form = current_page()

    # Access credentials tab
    window, form = open_tab(window, "Access Credentials tab")

    # Do the individual page credentials, if any
    for credential in page_access_credentials:
        if "regexp" in credential:
            form.find_textarea("regexp_acredential").set_value(credential["regexp"])
        if "type" in credential:
            form.find_radiobutton("type_acredential",credential["type"]).select()
        if "domain" in credential:
            form.find_textarea("domain_acredential").set_value(credential["domain"])
        if "username" in credential:
            form.find_textarea("username_acredential").set_value(credential["username"])
        if "password" in credential:
            form.find_textarea("password_acredential").set_value(credential["password"])
        window.find_button("Add page authentication url regular expression").click()
        window, form = current_page()

    # Do the individual session credentials, if any
    for credential_index, credential in enumerate(session_access_credentials):
        if "regexp" in credential:
            form.find_textarea("scredential_regexp").set_value(credential["regexp"])
        window.find_button("Add session authentication url regular expression").click()
        window, form = current_page()
        # Form elements of the row just added are named after the row's zero-based index
        prefix = "scredential_%d" % credential_index
        for login_page_index, loginpage in enumerate(credential.get("loginpages", [])):
            if "regexp" in loginpage:
                form.find_textarea("%s_loginpageregexp" % prefix).set_value(loginpage["regexp"])
            if "pagetype" in loginpage:
                form.find_radiobutton("%s_loginpagetype" % prefix, loginpage["pagetype"]).select()
            if "matchexpr" in loginpage:
                form.find_textarea("%s_loginpagematchregexp" % prefix).set_value(loginpage["matchexpr"])
            # Click the 'add' button; the button labels count from 1
            window.find_button("Add login page to credential #%d" % (credential_index+1)).click()
            window, form = current_page()
            login_prefix = "%s_%d" % (prefix,login_page_index)
            for parameter in loginpage.get("parameters", []):
                if "nameregexp" in parameter:
                    form.find_textarea("%s_loginparamname" % login_prefix).set_value(parameter["nameregexp"])
                if "value" in parameter:
                    form.find_textarea("%s_loginparamvalue" % login_prefix).set_value(parameter["value"])
                if "password" in parameter:
                    form.find_textarea("%s_loginparampassword" % login_prefix).set_value(parameter["password"])
                # Click the "add" button
                window.find_button("Add parameter to login page #%d for credential #%d" % (login_page_index+1,credential_index+1)).click()
                window, form = current_page()

    # Certificates tab
    window, form = open_tab(window, "Certificates tab")
    # Set the certificates
    for certificate in certificates:
        if "regexp" in certificate:
            form.find_textarea("regexp_trust").set_value(certificate["regexp"])
        if "certificate" in certificate:
            form.find_filebrowser("certificate_trust").setfile(certificate["certificate"],"application/octet-stream")
        window.find_button("Add url regular expression for truststore").click()
        window, form = current_page()

    # Now, save this page
    window.find_button("Save this connection").click()

    # See if the connector saved
    window = vb.find_window("")
    found_connection_name = window.find_match("<!--connection=(.*)-->",1)
    if found_connection_name != connection_name:
        raise Exception("Created connection doesn't match")
# Define a standard web job using the UI
def define_web_job_ui( username,
                       password,
                       job_name,
                       connection_name,
                       URLs,
                       inclusions=None,
                       exclusions=None,
                       user_metadata=None,
                       canonicalization_rules=None,
                       collection_name=None,
                       type="specified",
                       startmethod="windowbegin",
                       recrawlinterval=0 ):
    """Define a standard web job by driving the crawler UI, and return its job id.

    connection_name is the name of the web connection.  URLs is the array of seed urls.
    job_name is entered into the job form's "description" field.
    inclusions and exclusions, when given, are arrays of strings entered one per line.
    collection_name, when given, is entered on the "Collections" tab.
    Legal values for type are: "specified" or "continuous"
    Legal values for start method are: "windowbegin", "windowinside", or "disable".
    recrawlinterval is only used for continuous jobs; it is multiplied by 60000
    before being entered (presumably minutes to milliseconds -- confirm).
    user_metadata is an array of tuples, each tuple having the form:
      ( name, value )
    canonicalization_rules is an array of tuples, each tuple having the form:
      ( regexp, description, allow_reordering, java_session_removal, asp_session_removal, php_session_removal, bv_session_removal)
    ... where the regexp and description fields are strings, and the other fields are None, or "yes", or "no".

    Raises Exception for an illegal type or start method; both are checked
    before the browser is started.  The returned job id is the text matched by
    "<!--jobid=(.*)-->" on the page shown after saving.
    """
    # Validate the enumerated arguments up front, so that a bad value fails
    # before any page has been loaded or any form field has been touched.
    if startmethod == "windowbegin":
        startmethod_value = 0
    elif startmethod == "windowinside":
        startmethod_value = 1
    elif startmethod == "disable":
        startmethod_value = 2
    else:
        raise Exception("Illegal start method value: '%s'" % startmethod )

    if type == "specified":
        type_value = 1
    elif type == "continuous":
        type_value = 0
    else:
        raise Exception("Illegal type value: '%s'" % type )

    # Set up virtual browser instance
    vb = VirtualBrowser.VirtualBrowser( username=username, password=password )

    def current_page():
        """Return the (window, form) pair for the page the browser currently shows."""
        page = vb.find_window("")
        return page, page.find_form("editjob")

    def open_tab(page, link_text):
        """Click the named tab link on page and return the resulting (window, form)."""
        page.find_link(link_text).click()
        return current_page()

    # First, go to main page
    vb.load_main_window( "http://localhost/crawler/index.jsp" )

    # Find the link for job management and click it
    vb.find_window("").find_link("List jobs").click()

    # Add a job
    vb.find_window("").find_link("Add a job").click()

    # Grab the edit window and start setting stuff in the form
    window, form = current_page()

    # "Name" tab
    # textarea for setting description
    form.find_textarea("description").set_value( job_name )

    # "Connection" tab
    window, form = open_tab(window, "Connection tab")
    # start method
    form.find_selectbox("startmethod").select_value( str(startmethod_value) )
    # connection name
    form.find_selectbox("connectionname").select_value( connection_name )
    form.find_selectbox("outputname").select_value( "GTS" )
    # Click the "Continue" button
    window.find_button("Continue to next screen").click( )
    window, form = current_page()

    # "Collections" tab
    window, form = open_tab(window, "Collections tab")
    # textarea for setting collection
    if collection_name is not None:
        form.find_textarea("gts_collectionname").set_value( collection_name )

    # "Scheduling" tab
    window, form = open_tab(window, "Scheduling tab")
    # type
    form.find_selectbox("scheduletype").select_value( str(type_value) )
    # Recrawl interval
    if type == "continuous":
        form.find_textarea("recrawlinterval").set_value( str(recrawlinterval * 1000 * 60) )

    # "Seeds" tab
    window, form = open_tab(window, "Seeds tab")
    # Now, set up seed urls: every url is followed by a newline
    form.find_textarea("seeds").set_value("".join([ url + "\n" for url in URLs ]))

    # "Inclusions" tab
    window, form = open_tab(window, "Inclusions tab")
    if inclusions is not None:
        form.find_textarea("inclusions").set_value("".join([ inclusion + "\n" for inclusion in inclusions ]))

    # "Exclusions" tab
    window, form = open_tab(window, "Exclusions tab")
    if exclusions is not None:
        form.find_textarea("exclusions").set_value("".join([ exclusion + "\n" for exclusion in exclusions ]))

    # "Metadata" tab
    window, form = open_tab(window, "Metadata tab")
    if user_metadata is not None:
        for name, value in user_metadata:
            form.find_textarea("specmetaname").set_value(str(name))
            form.find_textarea("specmetavalue").set_value(str(value))
            window.find_button("Add metadata").click()
            window, form = current_page()

    # "Canonicalization" tab
    window, form = open_tab(window, "Canonicalization tab")
    if canonicalization_rules is not None:
        # Loop through the tuples and add them one at a time
        for rule in canonicalization_rules:
            regexp, description, allow_reorder, remove_java, remove_asp, remove_php, remove_bv = rule
            form.find_textarea("urlregexp").set_value(regexp)
            if description is not None:
                form.find_textarea("urlregexpdesc").set_value(description)
            # Each flag is None (leave the checkbox alone), "yes" (select it),
            # or anything else (deselect it).
            checkbox_settings = ( ("urlregexpreorder", allow_reorder),
                                  ("urlregexpjava", remove_java),
                                  ("urlregexpasp", remove_asp),
                                  ("urlregexpphp", remove_php),
                                  ("urlregexpbv", remove_bv) )
            for checkbox_name, setting in checkbox_settings:
                if setting is None:
                    continue
                checkbox = form.find_checkbox(checkbox_name, "yes")
                if setting == "yes":
                    checkbox.select()
                else:
                    checkbox.deselect()
            # Click the "Add" button
            window.find_button("Add url regexp").click()
            window, form = current_page()

    # Finally, submit the form
    window.find_button("Save this job").click( )
    window = vb.find_window("")
    jobid = window.find_match("<!--jobid=(.*)-->",1)
    return jobid
# Method to add a document to a jcifs share
def add_document(server_servername, server_user, server_password, targetpath, sourcepath, map=None):
    """Create a document on the server from a local file and return its path.

    The file at sourcepath is read in full.  When map is given, every
    occurrence of each of its keys in a line is replaced by the corresponding
    value, one key after another.  Each line is then stripped of leading and
    trailing whitespace and the lines are joined with newlines.  The result is
    created through an AmbassadorClient connected to port 8000 of
    server_servername, at targetpath placed under the C: drive with forward
    slashes turned into backslashes, with a single "+" permission entry for
    the SID returned by ConnectorHelpers.get_everyone_sid().

    Returns the Windows path the document was created at.
    """
    # NOTE: an earlier implementation invoked /usr/lib/metacarta/jcifs-adddoc
    # through ConnectorHelpers.invoke_script.  It was abandoned because
    # creating the file failed with an access violation (cause unknown).
    client = ambassador_client.AmbassadorClient(server_servername+":8000", server_user, server_password)
    targetpath = "C:\\"+targetpath.replace("/","\\")
    everyone_access = [ ("+", ConnectorHelpers.get_everyone_sid()) ]

    def substitute(text):
        """Apply every replacement in map to text; a map of None changes nothing."""
        if map is not None:
            for key in map.keys():
                # split-then-join replaces every occurrence of key
                pieces = text.split(key)
                text = map[key].join(pieces)
        return text

    with open(sourcepath, "r") as source:
        raw_lines = source.readlines()
        cleaned = [ substitute(raw).strip() for raw in raw_lines ]
        content = "\n".join(cleaned)
        filetools.create_windows_file(targetpath, everyone_access, content, client)
        return targetpath
# Method to remove a document from a jcifs share
def remove_document(server_servername, server_user, server_password, targetpath):
    """Remove a document from the server.

    Prints the path being erased, connects an AmbassadorClient to port 8000 of
    server_servername, and runs an "erase" command on targetpath placed under
    the C: drive with forward slashes turned into backslashes.

    Removal is best effort: any exception raised by the erase command is
    reported as a printed warning and not propagated.  Errors raised while
    creating the client are not caught.
    """
    # NOTE: an earlier implementation invoked /usr/lib/metacarta/jcifs-removedoc
    # through ConnectorHelpers.invoke_script, with the same warn-and-continue
    # error handling.
    print("Erasing %s" % targetpath)
    amb = ambassador_client.AmbassadorClient(server_servername+":8000", server_user, server_password)
    targetpath = "C:\\"+targetpath.replace("/","\\")
    try:
        amb.run('erase "%s"' % targetpath)
    except Exception as e:
        # Deliberately broad: a failed erase only produces a warning
        print("Warning: Error deleting document: %s" % str(e))
# Method to update a document in the jcifs repository
def version_document(server_servername, server_user, server_password, targetpath, sourcepath):
    """Create a new version of an existing document.

    Connects an AmbassadorClient to port 8000 of server_servername, erases the
    existing file at targetpath (placed under the C: drive with forward slashes
    turned into backslashes), and creates it again from the file at sourcepath.
    Each source line is stripped of leading and trailing whitespace and the
    lines are joined with single spaces.  The new file gets a single "+"
    permission entry for the SID returned by ConnectorHelpers.get_everyone_sid().

    A failure of the erase command is reported as a printed warning and does
    not stop the new version from being written.  Returns None.
    """
    amb = ambassador_client.AmbassadorClient(server_servername+":8000", server_user, server_password)
    targetpath = "C:\\"+targetpath.replace("/","\\")
    try:
        amb.run('erase "%s"' % targetpath)
    except Exception as e:
        # Deliberately broad: a failed erase only produces a warning
        print("Warning: Error deleting document: %s" % str(e))
    permissions = [ ("+", ConnectorHelpers.get_everyone_sid()) ]
    with open(sourcepath, "r") as fd:
        newlines = [ line.strip() for line in fd.readlines() ]
        # NOTE(review): lines are joined with a space here, whereas add_document
        # joins with "\n" -- confirm this difference is intentional.
        string = " ".join(newlines)
        filetools.create_windows_file(targetpath, permissions, string, amb)