Merge pull request #185 from apache/python3

Update scripts for Python 3
diff --git a/crawler/src/main/resources/bin/dump_repo_details.py b/crawler/src/main/resources/bin/dump_repo_details.py
index ee68f4e..bd17d65 100755
--- a/crawler/src/main/resources/bin/dump_repo_details.py
+++ b/crawler/src/main/resources/bin/dump_repo_details.py
@@ -21,18 +21,18 @@
 
 def main(argv=None):
 	if len(argv) == 0:
-		print "No Repo details to dump"
+		print("No Repo details to dump")
 		sys.exit()
 
 	if os.getenv("DRAT_HOME")==None:
-		print "Please add DRAT_HOME environment variable and try again";
+		print("Please add DRAT_HOME environment variable and try again");
 		sys.exit()
 	
 	default_repo_file_url = os.getenv("DRAT_HOME") + "/conf/repo.default.txt"
 	with open(default_repo_file_url,'rb')as repoFile:
 		data = ''
 		for line in repoFile:
-			data+=line
+			data+=line.strip().decode('utf-8')
 	rep = eval(data)
 
 	reponame = os.path.basename(os.path.normpath(argv[0]))
@@ -45,7 +45,7 @@
 	file.write(json.dumps(rep))
 	file.close()
 
-	print rep
+	print(rep)
 
 if __name__ == "__main__":
 	main(sys.argv[1:])
diff --git a/distribution/src/main/resources/bin/dratstats.py b/distribution/src/main/resources/bin/dratstats.py
index c4dd0c5..0ac89ab 100755
--- a/distribution/src/main/resources/bin/dratstats.py
+++ b/distribution/src/main/resources/bin/dratstats.py
@@ -26,27 +26,23 @@
 import time
 import shutil
 import datetime
-import csv
-import urllib2
+from urllib.request import urlopen, Request
 import json
-import xmlrpclib
-import getopt
-import glob
-import md5
+import xmlrpc.client
 
 # Check for environment variables
 def check_env_var():
 	if os.getenv("DRAT_HOME") == None:
-		print "Environment variable $DRAT_HOME is not set."
+		print("Environment variable $DRAT_HOME is not set.")
 		sys.exit(1)
 	if os.getenv("JAVA_HOME") == None:
-		print "Environment variable $JAVA_HOME is not set."
+		print("Environment variable $JAVA_HOME is not set.")
 		sys.exit(1)
 	if os.getenv("SOLR_DRAT_URL") == None:
-		print "Environment variable $SOLR_DRAT_URL is not set."
+		print("Environment variable $SOLR_DRAT_URL is not set.")
 		sys.exit(1)
 	if os.getenv("WORKFLOW_URL") == None:
-		print "Environment variable $WORKFLOW_URL is not set."
+		print("Environment variable $WORKFLOW_URL is not set.")
 		sys.exit(1)
 
 
@@ -83,7 +79,7 @@
 
 # Printing out on Console
 def printnow(string):
-	print string
+	print(string)
 	sys.stdout.flush()
 
 
@@ -176,7 +172,7 @@
 # Check if there are any pending PGE jobs in the queue
 def job_in_queue(job_name):
 	status = "PGE EXEC"
-	server = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
+	server = xmlrpc.client.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
 	
 
 	for x in range(0,6):
@@ -225,9 +221,9 @@
 # Index into Solr
 def index_solr(json_data):
 	printnow(json_data)
-	request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
+	request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
 	request.add_header('Content-type', 'application/json')
-	urllib2.urlopen(request, json_data)
+	urlopen(request, json_data.encode('utf-8'))
 
 
 # Run DRAT and collect statistics
diff --git a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
index 87d52b6..335b567 100755
--- a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
+++ b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py
@@ -24,11 +24,12 @@
 
 import sys
 import json
-import os
 import getopt
-import urllib2
-import xmlrpclib
-urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
+import urllib.error
+from urllib.request import urlopen, Request
+from xmlrpc import client
+
+#urllib.request.build_opener(urllib.request.HTTPHandler(debuglevel=1))
 solrPostfix = "/select/?q=mimetype:$type&version=2.2&start=0&rows=10&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"
 solrPostfixByPage = "/select/?q=mimetype:$type&version=2.2&start=$i&rows=$num&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename"
 
@@ -41,26 +42,26 @@
     if not url.endswith("/"):
         url = url + "/"
     solrUrl = url+solrPostfix.replace("$type", type)
-    print "GET "+solrUrl
+    print("GET "+solrUrl)
     numFound = 0
-    req = urllib2.Request(solrUrl)
+    req = Request(solrUrl)
     try:
-        f = urllib2.urlopen(req)
-        jsonResp = json.loads(f.read())
+        f = urlopen(req)
+        jsonResp = json.loads(f.read().decode('utf-8'))
         numFound = int(jsonResp["response"]["numFound"])
-    except urllib2.HTTPError, (err):
-        print "HTTP error(%s)" % (err)
-        print "Aborting RAT execution"
+    except urllib.error.HTTPError as err:
+        print("HTTP error(%s)" % (err))
+        print("Aborting RAT execution")
         return
 
-    wm = xmlrpclib.Server(workflowUrl)
+    wm = client.Server(workflowUrl)
 
 
     for i in range(0, numFound, num):
         ratSolrUrl = url + solrPostfixByPage.replace("$type", type).replace("$i", str(i)).replace("$num",str(num))
-        req = urllib2.Request(ratSolrUrl)
-        f = urllib2.urlopen(req)
-        jsonResp = json.loads(f.read())
+        req = Request(ratSolrUrl)
+        f = urlopen(req)
+        jsonResp = json.loads(f.read().decode('utf-8'))
         docs = jsonResp["response"]["docs"]
         metadata = {}
         metadata["MimeType"] = type
@@ -75,13 +76,13 @@
                 metadata["InputFiles"] = []
             metadata["InputFiles"].append(fullpath)
 
-        print "Metadata is "+str(metadata)
+        print("Metadata is "+str(metadata))
         wm.workflowmgr.executeDynamicWorkflow([taskIds], metadata)
         
 
 def get_mime_types(solrUrl):
     neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
-    connection = urllib2.urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
+    connection = urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
     response = eval(connection.read())
     mime_count = response["facet_counts"]["facet_fields"]["mimetype"]
     stats = {}
@@ -101,11 +102,11 @@
    try:
       opts, args = getopt.getopt(argv,"hu:c:w:t:",["solrUrl=", "numFilesPerJob=", "workflowUrl=", "ratTaskId="])
    except getopt.GetoptError:
-      print usage
+      print(usage)
       sys.exit(2)
    for opt, arg in opts:
       if opt == '-h':
-         print usage
+         print(usage)
          sys.exit()
       elif opt in ("-u", "--solrUrl"):
          solrUrl = arg
@@ -117,15 +118,15 @@
           ratTaskId = arg
 
    if solrUrl == "" or numFilesPerJob == 0 or workflowUrl == "" or ratTaskId == "":
-       print usage
+       print(usage)
        sys.exit()
 
 
-   print "Configured SOLR url: ["+solrUrl+"]"
+   print("Configured SOLR url: ["+solrUrl+"]")
    mimeTypes = get_mime_types(solrUrl)
 
    for type in mimeTypes:
-       print "Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]"
+       print("Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]")
        executeRatJobs(solrUrl, numFilesPerJob, type, workflowUrl, ratTaskId)
 
 if __name__ == "__main__":
diff --git a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
index 0b1b04d..e7969b8 100755
--- a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
+++ b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py
@@ -25,31 +25,24 @@
 
 import sys
 import os
-import getopt
-import subprocess
-import time
-import shutil
-import datetime
-import csv
-import urllib2
+
+from urllib.request import urlopen, Request
 import json
-import xmlrpclib
-import getopt
 import glob
-import md5
+import hashlib
 import requests
 
 
 def parse_license(s):
    li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}
    if s and not s.isspace():
-      arr = s.split("/", 1)
+      arr = s.split(b"/", 1)
       li = arr[0].strip()
       if li in li_dict:
          li = li_dict[li]
 
-      if len(arr) > 1 and len(arr[1].split("/")) > 0:
-         return [arr[1].split("/")[-1], li]
+      if len(arr) > 1 and len(arr[1].split(b"/")) > 0:
+         return [arr[1].split(b"/")[-1], li]
       else:
          #print('split not correct during license parsing '+str(arr))
          return ["/dev/null", li_dict['!?????']]
@@ -98,9 +91,9 @@
 
 def index_solr(json_data):
    #print(json_data)
-   request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
+   request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true")
    request.add_header('Content-type', 'application/json')
-   urllib2.urlopen(request, json_data)
+   urlopen(request, json_data.encode('utf-8'))
 
 def main(argv=None):
    usage = 'rat_aggregator.py logfile1 logfile2 ... logfileN'
@@ -110,13 +103,13 @@
    with open(repo_file_url,'rb')as repoFile:
       data = ''
       for line in repoFile:
-          data+=line
+          data+=line.decode('utf-8')
    rep = eval(data)
    
    index_solr(json.dumps([rep]))
 
    if len(argv) == 0:
-      print usage
+      print(usage)
       sys.exit()
 
    totalNotes = 0
@@ -193,7 +186,7 @@
 
          with open(filename, 'rb') as f:
             for line in f:
-               if '*****************************************************' in line:
+               if b'*****************************************************' in line:
                   l = 0
                   h = 0
                   if cur_section == 'licenses':
@@ -204,9 +197,9 @@
                   cur_file = ''
                   cur_header = ''
                   cur_section = ''
-               if line.startswith('  Files with Apache') and not parsedLicenses:
+               if line.startswith(b'  Files with Apache') and not parsedLicenses:
                   cur_section = 'licenses'
-               if line.startswith(' Printing headers for ') and not parsedHeaders:
+               if line.startswith(b' Printing headers for ') and not parsedHeaders:
                   cur_section = 'headers'
                if cur_section == 'licenses':
                   l += 1
@@ -218,12 +211,12 @@
                         rat_license[li[0]] = li[1]
                         #print(li)
                if cur_section == 'headers':
-                  if '=====================================================' in line or '== File:' in line:
+                  if b'=====================================================' in line or b'== File:' in line:
                      h += 1
                   if h == 2:
-                     cur_file = line.split("/")[-1].strip()
+                     cur_file = line.split(b"/")[-1].strip()
                   if h == 3:
-                     cur_header += line
+                     cur_header += line.decode('utf-8')
                   if h == 4:
                      rat_header[cur_file] = cur_header.split("\n", 1)[1]
                      cur_file = ''
@@ -248,8 +241,7 @@
       for doc in docs:
          fdata = {}
          fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
-         m = md5.new()
-         m.update(fdata['id'])
+         m = hashlib.md5(fdata['id'].encode('utf-8'))
          hashId = m.hexdigest()
          fileId = hashId+"-"+doc['filename'][0]
 
@@ -275,7 +267,7 @@
 
       # Copying data to Output Directory
       print ("Notes,Binaries,Archives,Standards,Apache,Generated,Unknown")
-      print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+"    ,"+str(totalGenerated)+","+str(totalUnknown)
+      print(str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+"    ,"+str(totalGenerated)+","+str(totalUnknown))
       
       #print("\nData copied to Solr and Output Directory: OK\n")
 
diff --git a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
index 1cda5d0..4721fce 100644
--- a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
+++ b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml
@@ -9,7 +9,7 @@
       <cmd>echo "Creating working dirs"</cmd>
      <cmd>mkdir [JobInputDir] ; mkdir [JobOutputDir]; mkdir [JobLogDir]</cmd>
      <cmd>echo "Running RAT aggregator"</cmd>
-     <cmd>[RatAggregatorScript] `python -c "print ' '.join('[InputFiles]'.split(','))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
+     <cmd>[RatAggregatorScript] `python -c "print(' '.join('[InputFiles]'.split(',')))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd>
   </exe>
 
   <!-- Files to ingest -->