blob: 5ec0fe707426ef43277f0a2fd17265725e7821a1 [file] [log] [blame]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
#the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import subprocess
import time
import tempfile
import hashlib
import email.utils
import datetime, time
title = "Census Scanner for Git"
version = "0.1.0"
def accepts(source):
""" Do we accept this source?? """
if source['type'] == 'git':
return True
# There are cases where we have a github repo, but don't wanna annalyze the code, just issues
if source['type'] == 'github' and source.get('issuesonly', False) == False:
return True
return False
def scan(KibbleBit, source):
""" Conduct a census scan """
people = {}
idseries = {}
lcseries = {}
alcseries = {}
ctseries = {}
atseries = {}
rid = source['sourceID']
url = source['sourceURL']
rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation'])
gpath = os.path.join(rootpath, rid)
if 'steps' in source and source['steps']['sync']['good'] and os.path.exists(gpath):
source['steps']['census'] = {
'time': time.time(),
'status': 'Census count started at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
'running': True,
'good': True,
}
KibbleBit.updateSource(source)
gname = rid
inp = ""
modificationDates = {}
# Did we do a census before?
if 'census' in source and source['census'] > 0:
# Go back 2 months, meh...
ts = source['census'] - (62*86400)
pd = time.gmtime(ts)
date = time.strftime("%Y-%b-%d 0:00", pd)
inp = subprocess.check_output("git --git-dir %s/.git log --after=\"%s\" --all \"--pretty=format:::%%H|%%ce|%%cn|%%ae|%%an|%%ct\" --numstat" % (gpath, date), shell = True)
else:
inp = subprocess.check_output("git --git-dir %s/.git log --all \"--pretty=format:::%%H|%%ce|%%cn|%%ae|%%an|%%ct\" --numstat" % gpath, shell = True)
tmp = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
tmp.write(inp)
tmp.flush()
tmp.close()
with open(tmp.name, mode="r", encoding="utf-8", errors='replace') as f:
inp = f.read()
f.close()
os.unlink(tmp.name)
edone = 0
KibbleBit.pprint("Parsing log for %s (%s)..." % (rid, url))
for m in re.finditer(u":([a-f0-9]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([\d+]+)\r?\n([^:]+?):", inp, flags=re.MULTILINE):
if m:
ch = m.group(1)
ce = m.group(2)
cn = m.group(3)
ae = m.group(4)
an = m.group(5)
ct = int(m.group(6))
diff = m.group(7)
insert = 0
delete = 0
files_touched = set()
# Diffs
for l in re.finditer(u"(\d+)[ \t]+(\d+)[ \t]+([^\r\n]+)", diff, flags=re.MULTILINE):
insert += int(l.group(1))
delete += int(l.group(2))
filename = l.group(3)
if filename:
files_touched.update([filename])
if filename and len(filename) > 0 and (not filename in modificationDates or modificationDates[filename]['timestamp'] < ct):
modificationDates[filename] = {
'hash': ch,
'filename': filename,
'timestamp': ct,
'created': ct if (not filename in modificationDates or not 'created' in modificationDates[filename] or modificationDates[filename]['created'] > ct) else modificationDates[filename]['created'],
'author_email': ae,
'committer_email': ce
}
if insert > 100000000:
insert = 0
if delete > 100000000:
delete = 0
if delete > 1000000 or insert > 1000000:
KibbleBit.pprint("gigantic diff for %s (%s), ignoring" % (gpath, source['sourceURL']))
pass
if not gname in idseries:
idseries[gname] = {}
if not gname in lcseries:
lcseries[gname] = {}
if not gname in alcseries:
alcseries[gname] = {}
if not gname in ctseries:
ctseries[gname] = {}
if not gname in atseries:
atseries[gname] = {}
ts = ct - (ct % 86400)
if not ts in idseries[gname]:
idseries[gname][ts] = [0,0]
idseries[gname][ts][0] += insert
idseries[gname][ts][1] += delete
if not ts in lcseries[gname]:
lcseries[gname][ts] = {}
if not ts in alcseries[gname]:
alcseries[gname][ts] = {}
if not ce in lcseries[gname][ts]:
lcseries[gname][ts][ce] = [0,0]
lcseries[gname][ts][ce][0] = lcseries[gname][ts][ce][0] + insert
lcseries[gname][ts][ce][1] = lcseries[gname][ts][ce][0] + delete
if not ae in alcseries[gname][ts]:
alcseries[gname][ts][ae] = [0,0]
alcseries[gname][ts][ae][0] = alcseries[gname][ts][ae][0] + insert
alcseries[gname][ts][ae][1] = alcseries[gname][ts][ae][0] + delete
if not ts in ctseries[gname]:
ctseries[gname][ts] = {}
if not ts in atseries[gname]:
atseries[gname][ts] = {}
if not ce in ctseries[gname][ts]:
ctseries[gname][ts][ce] = 0
ctseries[gname][ts][ce] += 1
if not ae in atseries[gname][ts]:
atseries[gname][ts][ae] = 0
atseries[gname][ts][ae] += 1
# Committer
if not ce in people or len(people[ce]['name']) < len(cn):
people[ce] = people[ce] if ce in people else {'projects': [gname]}
people[ce]['name'] = cn
if not gname in people[ce]['projects']:
people[ce]['projects'].append(gname)
# Author
if not ae in people or len(people[ae]['name']) < len(an):
people[ae] = people[ae] if ae in people else {'projects': [gname]}
people[ae]['name'] = an
if not gname in people[ae]['projects']:
people[ae]['projects'].append(gname)
# Make a list of changed files, max 1024
filelist = list(touched_files)
filelist = filelist[:1023]
# ES commit documents
tsd = ts - (ts % 86400)
js = {
'id': rid + "/" + ch,
'sourceID': rid,
'sourceURL': source['sourceURL'],
'organisation': source['organisation'],
'ts': ct,
'tsday': tsd,
'date': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(ts)),
'committer_name': cn,
'committer_email': ce,
'author_name': an,
'author_email': ae,
'insertions': insert,
'deletions': delete,
'vcs': 'git',
'files_changed': filelist
}
jsx = {
'id': ch,
'organisation': source['organisation'],
'sourceID': source['sourceID'], # Only ever the last source with this
'ts': ct,
'tsday': tsd,
'date': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(ts)),
'committer_name': cn,
'committer_email': ce,
'author_name': an,
'author_email': ae,
'insertions': insert,
'deletions': delete,
'repository': rid, # This will always ever only be the last repo that had it!
'vcs': 'git',
'files_changed': filelist
}
KibbleBit.append ( 'person', {
'upsert': True,
'name': cn,
'email': ce,
'address': ce,
'organisation': source['organisation'],
'id' : hashlib.sha1( ("%s%s" % (source['organisation'], ce)).encode('ascii', errors='replace')).hexdigest()
})
KibbleBit.append ( 'person',
{
'upsert': True,
'name': an,
'email': ae,
'address': ae,
'organisation': source['organisation'],
'id' :hashlib.sha1( ("%s%s" % (source['organisation'], ae)).encode('ascii', errors='replace')).hexdigest()
}
)
KibbleBit.append('code_commit', js)
KibbleBit.append('code_commit_unique', jsx)
if True: # Do file changes?? Might wanna make this optional
KibbleBit.pprint("Scanning file changes for %s" % source['sourceURL'])
for filename in modificationDates:
fid = hashlib.sha1( ("%s/%s" % (source['sourceID'], filename)).encode('ascii', errors='replace')).hexdigest()
jsfe = {
'upsert': True,
'id': fid,
'organisation': source['organisation'],
'sourceID': source['sourceID'],
'ts': modificationDates[filename]['timestamp'],
'date': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(modificationDates[filename]['timestamp'])),
'committer_email': modificationDates[filename]['committer_email'],
'author_email': modificationDates[filename]['author_email'],
'hash': modificationDates[filename]['hash'],
'created': modificationDates[filename]['created'],
'createdDate': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(modificationDates[filename]['created']))
}
found = KibbleBit.exists('file_history', fid)
if found:
del jsfe['created']
del jsfe['createdDate']
KibbleBit.append('file_history', jsfe)
source['steps']['census'] = {
'time': time.time(),
'status': 'Census count completed at ' + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
'running': False,
'good': True,
}
source['census'] = time.time()
KibbleBit.updateSource(source)