Merge pull request #185 from apache/python3
Update scripts for Python 3
diff --git a/crawler/src/main/resources/bin/dump_repo_details.py b/crawler/src/main/resources/bin/dump_repo_details.py
index ee68f4e..bd17d65 100755
--- a/crawler/src/main/resources/bin/dump_repo_details.py
+++ b/crawler/src/main/resources/bin/dump_repo_details.py
@@ -21,18 +21,18 @@
def main(argv=None):
if len(argv) == 0:
- print "No Repo details to dump"
+ print("No Repo details to dump")
sys.exit()
if os.getenv("DRAT_HOME")==None:
- print "Please add DRAT_HOME environment variable and try again";
+ print("Please add DRAT_HOME environment variable and try again");
sys.exit()
default_repo_file_url = os.getenv("DRAT_HOME") + "/conf/repo.default.txt"
with open(default_repo_file_url,'rb')as repoFile:
data = ''
for line in repoFile:
- data+=line
+ data+=line.strip().decode('utf-8')
rep = eval(data)
reponame = os.path.basename(os.path.normpath(argv[0]))
@@ -45,7 +45,7 @@
file.write(json.dumps(rep))
file.close()
- print rep
+ print(rep)
if __name__ == "__main__":
main(sys.argv[1:])
diff --git a/distribution/src/main/resources/bin/dratstats.py b/distribution/src/main/resources/bin/dratstats.py
index c4dd0c5..0ac89ab 100755
--- a/distribution/src/main/resources/bin/dratstats.py
+++ b/distribution/src/main/resources/bin/dratstats.py
@@ -26,27 +26,23 @@
import time
import shutil
import datetime
-import csv
-import urllib2
+from urllib.request import urlopen, Request
import json
-import xmlrpclib
-import getopt
-import glob
-import md5
+import xmlrpc.client
# Check for environment variables
def check_env_var():
if os.getenv("DRAT_HOME") == None:
- print "Environment variable $DRAT_HOME is not set."
+ print("Environment variable $DRAT_HOME is not set.")
sys.exit(1)
if os.getenv("JAVA_HOME") == None:
- print "Environment variable $JAVA_HOME is not set."
+ print("Environment variable $JAVA_HOME is not set.")
sys.exit(1)
if os.getenv("SOLR_DRAT_URL") == None:
- print "Environment variable $SOLR_DRAT_URL is not set."
+ print("Environment variable $SOLR_DRAT_URL is not set.")
sys.exit(1)
if os.getenv("WORKFLOW_URL") == None:
- print "Environment variable $WORKFLOW_URL is not set."
+ print("Environment variable $WORKFLOW_URL is not set.")
sys.exit(1)
@@ -83,7 +79,7 @@
# Printing out on Console
def printnow(string):
- print string
+ print(string)
sys.stdout.flush()
@@ -176,7 +172,7 @@
# Check if there are any pending PGE jobs in the queue
def job_in_queue(job_name):
status = "PGE EXEC"
- server = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
+ server = xmlrpc.client.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
for x in range(0,6):
@@ -225,9 +221,9 @@
# Index into Solr
def index_solr(json_data):
printnow(json_data)
- request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
+ request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
request.add_header('Content-type', 'application/json')
- urllib2.urlopen(request, json_data)
+ urlopen(request, json_data.encode('utf-8'))
# Run DRAT and collect statistics
diff --git a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
index 87d52b6..335b567 100755
--- a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
+++ b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
@@ -24,11 +24,12 @@
import sys
import json
-import os
import getopt
-import urllib2
-import xmlrpclib
-urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
+import urllib.error
+from urllib.request import urlopen, Request
+from xmlrpc import client
+
+#urllib.request.build_opener(urllib.HTTPHandler(debuglevel=1))
solrPostfix = "/select/?q=mimetype:$type&version=2.2&start=0&rows=10&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"
solrPostfixByPage = "/select/?q=mimetype:$type&version=2.2&start=$i&rows=$num&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"
@@ -41,26 +42,26 @@
if not url.endswith("/"):
url = url + "/"
solrUrl = url+solrPostfix.replace("$type", type)
- print "GET "+solrUrl
+ print("GET "+solrUrl)
numFound = 0
- req = urllib2.Request(solrUrl)
+ req = Request(solrUrl)
try:
- f = urllib2.urlopen(req)
- jsonResp = json.loads(f.read())
+ f = urlopen(req)
+ jsonResp = json.loads(f.read().decode('utf-8'))
numFound = int(jsonResp["response"]["numFound"])
- except urllib2.HTTPError, (err):
- print "HTTP error(%s)" % (err)
- print "Aborting RAT execution"
+ except urllib.error.HTTPError as err:
+ print("HTTP error(%s)" % (err))
+ print("Aborting RAT execution")
return
- wm = xmlrpclib.Server(workflowUrl)
+ wm = client.Server(workflowUrl)
for i in range(0, numFound, num):
ratSolrUrl = url + solrPostfixByPage.replace("$type", type).replace("$i", str(i)).replace("$num",str(num))
- req = urllib2.Request(ratSolrUrl)
- f = urllib2.urlopen(req)
- jsonResp = json.loads(f.read())
+ req = Request(ratSolrUrl)
+ f = urlopen(req)
+ jsonResp = json.loads(f.read().decode('utf-8'))
docs = jsonResp["response"]["docs"]
metadata = {}
metadata["MimeType"] = type
@@ -75,13 +76,13 @@
metadata["InputFiles"] = []
metadata["InputFiles"].append(fullpath)
- print "Metadata is "+str(metadata)
+ print("Metadata is "+str(metadata))
wm.workflowmgr.executeDynamicWorkflow([taskIds], metadata)
def get_mime_types(solrUrl):
neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
- connection = urllib2.urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
+ connection = urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
response = eval(connection.read())
mime_count = response["facet_counts"]["facet_fields"]["mimetype"]
stats = {}
@@ -101,11 +102,11 @@
try:
opts, args = getopt.getopt(argv,"hu:c:w:t:",["solrUrl=", "numFilesPerJob=", "workflowUrl=", "ratTaskId="])
except getopt.GetoptError:
- print usage
+ print(usage)
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
- print usage
+ print(usage)
sys.exit()
elif opt in ("-u", "--solrUrl"):
solrUrl = arg
@@ -117,15 +118,15 @@
ratTaskId = arg
if solrUrl == "" or numFilesPerJob == 0 or workflowUrl == "" or ratTaskId == "":
- print usage
+ print(usage)
sys.exit()
- print "Configured SOLR url: ["+solrUrl+"]"
+ print("Configured SOLR url: ["+solrUrl+"]")
mimeTypes = get_mime_types(solrUrl)
for type in mimeTypes:
- print "Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]"
+ print("Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]")
executeRatJobs(solrUrl, numFilesPerJob, type, workflowUrl, ratTaskId)
if __name__ == "__main__":
diff --git a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
index 0b1b04d..e7969b8 100755
--- a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
+++ b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
@@ -25,31 +25,24 @@
import sys
import os
-import getopt
-import subprocess
-import time
-import shutil
-import datetime
-import csv
-import urllib2
+
+from urllib.request import urlopen, Request
import json
-import xmlrpclib
-import getopt
import glob
-import md5
+import hashlib
import requests
def parse_license(s):
li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}
if s and not s.isspace():
- arr = s.split("/", 1)
+ arr = s.decode('utf-8').split("/", 1)
li = arr[0].strip()
if li in li_dict:
li = li_dict[li]
+ if len(arr) > 1 and len(arr[1].split("/")) > 0:
+ return [arr[1].split("/")[-1], li]
else:
#print('split not correct during license parsing '+str(arr))
return ["/dev/null", li_dict['!?????']]
@@ -98,9 +91,9 @@
def index_solr(json_data):
#print(json_data)
- request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
+ request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
request.add_header('Content-type', 'application/json')
- urllib2.urlopen(request, json_data)
+ urlopen(request, json_data.encode('utf-8'))
def main(argv=None):
usage = 'rat_aggregator.py logfile1 logfile2 ... logfileN'
@@ -110,13 +103,13 @@
with open(repo_file_url,'rb')as repoFile:
data = ''
for line in repoFile:
- data+=line
+ data+=line.decode('utf-8')
rep = eval(data)
index_solr(json.dumps([rep]))
if len(argv) == 0:
- print usage
+ print(usage)
sys.exit()
totalNotes = 0
@@ -193,7 +186,7 @@
with open(filename, 'rb') as f:
for line in f:
- if '*****************************************************' in line:
+ if b'*****************************************************' in line:
l = 0
h = 0
if cur_section == 'licenses':
@@ -204,9 +197,9 @@
cur_file = ''
cur_header = ''
cur_section = ''
- if line.startswith(' Files with Apache') and not parsedLicenses:
+ if line.startswith(b' Files with Apache') and not parsedLicenses:
cur_section = 'licenses'
- if line.startswith(' Printing headers for ') and not parsedHeaders:
+ if line.startswith(b' Printing headers for ') and not parsedHeaders:
cur_section = 'headers'
if cur_section == 'licenses':
l += 1
@@ -218,12 +211,12 @@
rat_license[li[0]] = li[1]
#print(li)
if cur_section == 'headers':
- if '=====================================================' in line or '== File:' in line:
+ if b'=====================================================' in line or b'== File:' in line:
h += 1
if h == 2:
- cur_file = line.split("/")[-1].strip()
+ cur_file = line.split(b"/")[-1].strip().decode('utf-8')
if h == 3:
- cur_header += line
+ cur_header += line.decode('utf-8')
if h == 4:
rat_header[cur_file] = cur_header.split("\n", 1)[1]
cur_file = ''
@@ -248,8 +241,7 @@
for doc in docs:
fdata = {}
fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
- m = md5.new()
- m.update(fdata['id'])
+ m = hashlib.md5(fdata['id'].encode('utf-8'))
hashId = m.hexdigest()
fileId = hashId+"-"+doc['filename'][0]
@@ -275,7 +267,7 @@
# Copying data to Output Directory
print ("Notes,Binaries,Archives,Standards,Apache,Generated,Unknown")
- print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown)
+ print(str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown))
#print("\nData copied to Solr and Output Directory: OK\n")
diff --git a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
index 1cda5d0..4721fce 100644
--- a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
+++ b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
@@ -9,7 +9,7 @@
<cmd>echo "Creating working dirs"</cmd>
<cmd>mkdir [JobInputDir] ; mkdir [JobOutputDir]; mkdir [JobLogDir]</cmd>
<cmd>echo "Running RAT aggregator"</cmd>
- <cmd>[RatAggregatorScript] `python -c "print ' '.join('[InputFiles]'.split(','))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
+ <cmd>[RatAggregatorScript] `python -c "print(' '.join('[InputFiles]'.split(',')))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
</exe>
<!-- Files to ingest -->