blob: c4dd0c51297ccad5bea9e48af4f99477b9ef2c15 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script collects statistics from DRAT run on multiple repositories.
# Please see help() method to understand the usage
# author: karanjeets
# author: mattmann
import sys
import os
import subprocess
import time
import shutil
import datetime
import csv
import urllib2
import json
import xmlrpclib
import getopt
import glob
import md5
# Check for environment variables
def check_env_var():
    """Abort the script unless all required environment variables are set.

    The variables are checked in a fixed order.  For the first one that is
    not set, a message is printed to stdout and the interpreter exits with
    status 1.  Returns None when every variable is present.
    """
    required = ("DRAT_HOME", "JAVA_HOME", "SOLR_DRAT_URL", "WORKFLOW_URL")
    for name in required:
        # Identity test against None: only an unset variable is rejected,
        # a variable that is set to an empty value still passes.
        if os.getenv(name) is None:
            print("Environment variable $" + name + " is not set.")
            sys.exit(1)
# Returns Current Date Time
def current_datetime():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # The trailing "Z" designates UTC, so the value has to be read from the
    # UTC clock; the local clock would yield a mislabelled timestamp on any
    # machine whose time zone is not UTC.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
# Returns a normalized path.
# Drops everything up to and including the first "/", replaces the remaining "/" with "_"
# and appends "_" plus the current timestamp
def normalize_path(repository):
    """Turn a repository path into a flat, timestamped directory name.

    Everything up to and including the first "/" is dropped, every remaining
    "/" becomes "_", and "_" plus the value of current_datetime() is appended.

    Args:
        repository: Repository path as a string.

    Returns:
        The flattened name, e.g. "/home/me/repo" -> "home_me_repo_<timestamp>".
    """
    head, slash, tail = repository.partition("/")
    # A path without any "/" has nothing to cut off; use it unchanged instead
    # of failing with the ValueError that str.index() would raise.
    relative = tail if slash else head
    return relative.replace("/", "_") + "_" + current_datetime()
# Count the number of files in a directory recursively
# Files whose joined path (directory + file name) contains the "exclude" substring are not counted
def count_num_files(path, exclude):
    """Count the files below ``path`` recursively.

    Args:
        path: Directory to walk.
        exclude: Substring; a file is left out when its joined path
            (walked directory + file name) contains it.  None or an empty
            string disables the filter, so every file is counted.

    Returns:
        Number of files that were not excluded.
    """
    total = 0
    for root, _dirs, files in os.walk(path):
        for filename in files:
            # "not exclude" comes first: None cannot be used with "in", and
            # an empty string is contained in every path.
            if not exclude or exclude not in os.path.join(root, filename):
                total += 1
    return total
# Prints usage of this script
def help():
    """Write the command-line usage of this script to stderr."""
    usage = "\n\nUsage: python dratstats.py <path to list of repository URLs> <path to output directory>\n"
    # One extra newline terminates the line, exactly as a print statement would.
    sys.stderr.write(usage + "\n")
# Printing out on Console
def printnow(string):
    """Print ``string`` to stdout and flush the stream immediately.

    Args:
        string: Value to print; it is handed to print unchanged.
    """
    print(string)
    # Flush so the message is not held back in the output buffer.
    sys.stdout.flush()
# Parsing RAT log files
def parseFile(filepath):
    """Extract the summary counters from a RAT log file.

    Lines starting with one of the known labels ("Notes:", "Binaries:",
    "Archives:", "Standards:", "Apache Licensed:", "Generated:") add the
    integer after the first ":" to the matching counter.  A line containing
    "Unknown Licenses" contributes its leading integer and ends the scan.

    Args:
        filepath: Path of the RAT log file.

    Returns:
        The tuple (notes, binaries, archives, standards, apachelicensed,
        generated, unknown) once an "Unknown Licenses" line was read, or a
        tuple of seven -1 values when the file contains no such line.

    Raises:
        ValueError: If a matched line does not carry a parsable integer.
    """
    # Label at the start of a line -> index of its counter in the result.
    labels = (
        ('Notes:', 0),
        ('Binaries:', 1),
        ('Archives:', 2),
        ('Standards:', 3),
        ('Apache Licensed:', 4),
        ('Generated:', 5),
    )
    unknown_slot = 6
    counts = [0] * 7

    # The with-block closes the file on every exit path, including the early
    # return and a ValueError raised by int().
    with open(filepath, 'r') as rat_log:
        for line in rat_log:
            for label, slot in labels:
                if line.startswith(label):
                    counts[slot] += int(line.split(':')[1].strip())
            if 'Unknown Licenses' in line:
                # split() without an argument tolerates leading whitespace
                # and tabs in front of the number.
                counts[unknown_slot] += int(line.split()[0])
                return tuple(counts)
    return (-1,) * 7
# OODT Process (start, stop)
def oodt_process(command):
    """Run ``${DRAT_HOME}/bin/oodt <command>`` through the shell.

    Args:
        command: Argument handed to the oodt script; this file calls it with
            "start" and "stop".

    Returns:
        None when the script exits with status 0.

    Exits:
        Writes a message to stderr and terminates the interpreter with
        status 1 when the script returns a non-zero status, is terminated by
        a signal, or cannot be launched (OSError).
    """
    tail = ". OODT failed to " + command + ". Aborting..."
    # Only the call itself can raise OSError, so nothing else is guarded.
    try:
        retcode = subprocess.call("${DRAT_HOME}/bin/oodt" + " " + command, shell=True)
    except OSError as e:
        sys.stderr.write("OODT execution failed: %s %s\n" % (e, tail))
        sys.exit(1)

    if retcode < 0:
        # subprocess reports death by signal N as return code -N.
        sys.stderr.write("OODT process was terminated by signal %s %s\n" % (-retcode, tail))
        sys.exit(1)
    if retcode > 0:
        sys.stderr.write("OODT process returned %s %s\n" % (retcode, tail))
        sys.exit(1)
# DRAT process (crawl, index, map, reduce)
def drat_process(command, repository):
    """Run one DRAT stage through ``${DRAT_HOME}/bin/drat``.

    Args:
        command: "crawl", "index", "map" or "reduce".  Any other value runs
            nothing and is reported as success.
        repository: Repository path; used by "crawl" and "index" only (this
            file passes None for "map" and "reduce").

    Returns:
        True when the command line returned status 0, False when it returned
        a non-zero status, was terminated by a signal, or could not be
        launched.  Failures are reported on stderr.
    """
    drat = "${DRAT_HOME}/bin/drat "
    if command == "crawl":
        shell_line = drat + command + " --exclude \"\\.git\" " + _shell_quote(repository)
    elif command == "index":
        shell_line = drat + command + " " + _shell_quote(repository)
    elif command == "map" or command == "reduce":
        # The trailing "&" makes the shell start the stage in the background,
        # so the status seen here is the shell's, not the stage's.
        shell_line = drat + command + " &"
    else:
        return True

    try:
        retcode = subprocess.call(shell_line, shell=True)
    except OSError as e:
        sys.stderr.write("DRAT %s execution failed: %s . Aborting...\n" % (command, e))
        return False

    if retcode < 0:
        # subprocess reports death by signal N as return code -N.
        sys.stderr.write("DRAT %s process was terminated by signal %s . Aborting...\n" % (command, -retcode))
        return False
    if retcode > 0:
        sys.stderr.write("DRAT %s process returned %s . Aborting...\n" % (command, retcode))
        return False
    return True


def _shell_quote(text):
    """Return ``text`` as one single-quoted POSIX shell word.

    Inside single quotes the shell treats every character literally; an
    embedded single quote is written as '"'"' (close, quoted quote, reopen).
    This keeps spaces, parentheses, ";", "&", "$" and similar characters in a
    repository path from being interpreted by the shell.
    """
    return "'" + text.replace("'", "'\"'\"'") + "'"
# Reset DRAT
def drat_reset():
    """Delete the DRAT working directories below ``$DRAT_HOME``.

    The directories are removed in a fixed order and each removal is
    announced on stdout first.  ``data/archive`` and ``data/jobs`` are
    recreated as empty directories right after they were removed; the other
    three are not recreated here.
    """
    home = os.getenv("DRAT_HOME")
    # (path below $DRAT_HOME, recreate as an empty directory afterwards)
    targets = (
        ("/data/workflow", False),
        ("/filemgr/catalog", False),
        ("/solr/drat/data", False),
        ("/data/archive", True),
        ("/data/jobs", True),
    )
    for suffix, recreate in targets:
        victim = home + suffix
        printnow ("Removing " + victim)
        shutil.rmtree(victim)
        if recreate:
            os.mkdir(victim)
# Check if there are any pending PGE jobs in the queue
def job_in_queue(job_name):
    """Tell whether a workflow instance for ``job_name`` is in status "PGE EXEC".

    The workflow manager at ``$WORKFLOW_URL`` is asked over XML-RPC up to six
    times, with a three second pause after every unsuccessful attempt.

    Args:
        job_name: Task id to look for; compared with the first entry of each
            instance's ``sharedContext["TaskId"]``.

    Returns:
        True as soon as a matching instance is reported, False when none of
        the six attempts found one.
    """
    workflow_mgr = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
    attempts_left = 6
    while attempts_left > 0:
        instances = workflow_mgr.workflowmgr.getWorkflowInstancesByStatus("PGE EXEC")
        for instance in instances:
            if instance["sharedContext"]["TaskId"][0] == job_name:
                return True
        # Nothing found this round: pause before the next query.
        time.sleep(3)
        attempts_left -= 1
    return False
# Wait for job to complete
def wait_for_job(job_name):
    """Block until job_in_queue() no longer reports ``job_name``.

    Between two checks ten dots are written to stdout, one every two
    seconds, as a progress indicator.

    Args:
        job_name: Task id that is handed to job_in_queue().
    """
    while True:
        if not job_in_queue(job_name):
            return
        for _tick in range(10):
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(2)
# Parse license from RAT
def parse_license(s):
    """Split one RAT result line into file name and license label.

    The text in front of the first "/" is the license code, the text after
    the last "/" is the file name.  The codes N, B, A, AL and !????? are
    translated to Notes, Binaries, Archives, Apache and Unknown; any other
    code is returned as it is (stripped).

    Args:
        s: Line of the form "<code> /some/path/file"; may be None or blank.

    Returns:
        [file_name, license_label].  ["/dev/null", "Unknown"] is returned,
        after printing a diagnostic, for None, empty or whitespace-only input
        and for input without any "/".
    """
    li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', '!?????': 'Unknown'}

    if not s or s.isspace():
        # str() keeps the diagnostic printable when s is None; plain
        # concatenation would raise TypeError in exactly this branch.
        printnow('blank line provided to parse license ['+str(s)+']')
        return ["/dev/null", li_dict['!?????']]

    arr = s.split("/", 1)
    if len(arr) < 2:
        printnow('split not correct during license parsing '+str(arr))
        return ["/dev/null", li_dict['!?????']]

    li = arr[0].strip()
    return [arr[1].split("/")[-1], li_dict.get(li, li)]
# Index into Solr
def index_solr(json_data):
    """POST ``json_data`` to ``$SOLR_URL/statistics/update/json?commit=true``.

    The payload is echoed to stdout before it is sent.

    Args:
        json_data: Request body, sent with content type application/json.

    Exits:
        Prints a message and terminates with status 1 when $SOLR_URL is not
        set.
    """
    printnow(json_data)
    solr_url = os.getenv("SOLR_URL")
    if solr_url is None:
        # $SOLR_URL is not one of the variables check_env_var() verifies, so
        # it is checked here to fail with a readable message instead of a
        # TypeError from the string concatenation below.
        print("Environment variable $SOLR_URL is not set.")
        sys.exit(1)
    request = urllib2.Request(solr_url + "/statistics/update/json?commit=true")
    request.add_header('Content-type', 'application/json')
    response = urllib2.urlopen(request, json_data)
    # The response body is not needed; close it to release the connection.
    response.close()
# Run DRAT and collect statistics
def run(repos_list, output_dir):
    """Run DRAT on every repository listed in ``repos_list``.

    Each line of the list file is split on whitespace into
    ``<path> <name> <location url> [description words ...]``.  Blank lines,
    entries whose first token starts with "#", entries with fewer than three
    tokens and entries whose path does not exist are skipped.  For every
    remaining repository OODT is started, the DRAT stages are run, OODT is
    stopped and the DRAT working directories are reset.

    Args:
        repos_list: Path of the text file listing the repositories.
        output_dir: Directory below which the results of each successfully
            scanned repository are copied.
    """
    with open(repos_list) as repositories:
        repo_content = [x.strip() for x in repositories.readlines()]
    print('DRAT stats: inspecting '+str(len(repo_content))+' repositories.')

    for repository in repo_content:
        repo_toks = repository.split()
        if not repo_toks:
            # Blank line (e.g. a trailing newline at the end of the list).
            continue
        # The "#" test comes before the fields are accessed so that short
        # comment lines are skipped instead of raising IndexError.
        if repo_toks[0].startswith('#'):
            print('\nSkipping Repository: ' + repo_toks[0][1:])
            continue
        if len(repo_toks) < 3:
            print("\nEntry '" + repository + "' does not have the form <path> <name> <url> [description]. Skipping and moving on...\n")
            continue

        rep = {
            "id" : repo_toks[2],
            "repo": repo_toks[0],
            "name" : repo_toks[1],
            "loc_url" : repo_toks[2],
            "description": ' '.join(repo_toks[3:]),
            "type" : "project"
        }

        print("\nVerifying repository path...\n")
        if not os.path.exists(rep["repo"]):
            print ("\nPath " + rep["repo"] + " is not valid. Skipping and moving on...\n")
            continue
        print("\nRepository Path: OK\n")

        print ("\nStarting OODT...\n")
        oodt_process("start")
        time.sleep(20)
        print("\nOODT Started: OK\n")

        print('Adding repository: '+str(rep)+' to Solr')
        # NOTE(review): the actual indexing call is disabled in this file:
        # index_solr(json.dumps([rep]))

        print("\nRunning DRAT on " + rep["repo"] + " ...\n")
        _run_drat_stages(rep, output_dir)

        time.sleep(5)
        print ("\nStopping OODT...\n")
        oodt_process("stop")
        time.sleep(20)
        print ("\nOODT Stopped: OK\n")

        print ("\nReseting DRAT...\n")
        drat_reset()
        time.sleep(5)
        print ("\nDRAT Reset: OK\n")

    print("\nDRAT SCAN COMPLETED!!!\n")


def _run_drat_stages(rep, output_dir):
    """Run crawl, index, map, reduce and the result copy for one repository.

    A stage only runs when every stage before it succeeded.  ``rep`` is the
    repository record built by run(); its "id" is rewritten and the record is
    stored as JSON in ``$DRAT_HOME/data/repo`` right after the crawl.
    """
    drat_home = os.getenv("DRAT_HOME")
    # NOTE(review): the timestamps are recorded but not read anywhere in
    # this function.
    stats = {'id': rep["repo"]}

    stats['crawl_start'] = current_datetime()
    retval = drat_process("crawl", rep["repo"])
    stats['crawl_end'] = current_datetime()

    rep["id"] = "id:"+os.path.normpath(rep["repo"])
    with open(drat_home + "/data/repo", "w") as repo_file:
        repo_file.write(json.dumps(rep))

    if retval:
        time.sleep(5)
        stats['index_start'] = current_datetime()
        retval = drat_process("index", rep["repo"])
        stats['index_end'] = current_datetime()

    if retval:
        time.sleep(5)
        stats['map_start'] = current_datetime()
        retval = drat_process("map", None)
        time.sleep(10)
        # "map" is started in the background, so its jobs are awaited here.
        wait_for_job("urn:drat:MimePartitioner")
        wait_for_job("urn:drat:RatCodeAudit")
        stats['map_end'] = current_datetime()

    if retval:
        wait_for_job("urn:drat:RatAggregator")
        time.sleep(10)
        retval = drat_process("reduce", None)
        print ("\nwaiting for Rat Aggregator...\n")
        wait_for_job("urn:drat:RatAggregator")
        time.sleep(5)

    if retval:
        printnow ("\nCopying data to Solr and Output Directory...\n")
        repos_out = output_dir + "/" + normalize_path(rep["repo"])
        for subdir in ("archive", "jobs", "workflow"):
            shutil.copytree(drat_home + "/data/" + subdir, repos_out + "/data/" + subdir)
        print("\nData copied to Solr and Output Directory: OK\n")
# This is where it all begins
def main():
    """Validate the command line and the environment, then start the scan.

    Expects exactly two arguments: the path of the repository list file and
    the path of an existing output directory.  Every validation failure is
    reported on stderr (or by check_env_var) and ends the process with
    status 1.
    """
    # Both positional arguments are read below, so anything but exactly two
    # (argv has three entries including the script name) is an error.
    if len(sys.argv) != 3:
        sys.stderr.write("\nIncorrect number of arguments passed. Aborting...\n")
        help()
        sys.exit(1)

    repos_list = sys.argv[1]
    output_dir = sys.argv[2]

    if not os.path.isfile(repos_list):
        sys.stderr.write("\nRepository list doesn't exists at the path:  " + repos_list + "\n")
        help()
        sys.exit(1)

    if not os.path.isdir(output_dir):
        sys.stderr.write("\nOutput Directory doesn't exist at the path:  " + output_dir + "\n")
        help()
        sys.exit(1)

    # The environment has to be verified before $DRAT_HOME is used below;
    # otherwise an unset variable surfaces as a TypeError.
    check_env_var()

    dratData = os.getenv("DRAT_HOME") + "/data"
    # Compare resolved paths on a path-component boundary: drat_reset()
    # deletes directories below $DRAT_HOME/data, so the output must not live
    # there, while a sibling such as ".../data2" is fine.
    data_real = os.path.realpath(dratData)
    output_real = os.path.realpath(output_dir)
    if output_real == data_real or output_real.startswith(data_real + os.sep):
        sys.stderr.write("\nOutput dir cannot be a sub directory of "+dratData + "\n")
        help()
        sys.exit(1)

    run(repos_list, output_dir)


if __name__ == "__main__":
    main()