src/plugins/scanners/git-census.py - kibble-scanners - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import re
 import subprocess
 import time
 import tempfile
 import hashlib
 import email.utils
 import datetime, time

 title = "Census Scanner for Git"
 version = "0.1.0"


 def accepts(source):
     """Return True when this scanner should process ``source``.

     Plain ``git`` sources are always accepted.  ``github`` sources are
     accepted unless their ``issuesonly`` flag is set to something other
     than ``False``; every other source type is rejected.
     """
     kind = source['type']
     if kind == 'git':
         return True
     # A GitHub repo may be registered for its issues only, in which case
     # the code itself must not be analyzed.
     return bool(kind == 'github' and source.get('issuesonly', False) == False)


 # Seconds per day; commit timestamps are bucketed on multiples of this.
 _DAY = 86400

 # An incremental census re-reads this much history before the previous run.
 _RESCAN_WINDOW = 62 * _DAY

 # Running per-commit line totals above this are treated as bogus and reset.
 _ABSURD_LINE_COUNT = 100000000

 # Running per-commit line totals above this only trigger a log message.
 _HUGE_LINE_COUNT = 1000000

 # Pretty-format handed to ``git log``; each commit becomes one
 # "::<hash>|<committer email>|<committer name>|<author email>|<author name>|<commit time>"
 # header line followed by its --numstat lines.
 _GIT_LOG_FORMAT = "--pretty=format:::%H|%ce|%cn|%ae|%an|%ct"

 # One commit record: the header fields plus everything up to the next ':'
 # (the start of the following header) or the end of the log.  Accepting the
 # end of input is what lets the final record in the log match.
 _COMMIT_RE = re.compile(
     r":([a-f0-9]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|(\d+)"
     r"\r?\n([^:]+?)(?::|\Z)"
 )

 # One --numstat line: "<insertions> <deletions> <path>".  Lines whose counts
 # are not plain digits do not match and are skipped.
 _NUMSTAT_RE = re.compile(r"(\d+)[ \t]+(\d+)[ \t]+([^\r\n]+)")

 _DATE_FORMAT = "%Y/%m/%d %H:%M:%S"


 def _census_step(phase, running):
     """Build the ``steps['census']`` status record for the given phase."""
     stamp = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
     return {
         'time': time.time(),
         'status': 'Census count %s at ' % phase + stamp,
         'running': running,
         'good': True,
     }


 def _sha1_hex(text):
     """Return the SHA-1 hex digest of ``text``, non-ASCII characters replaced."""
     return hashlib.sha1(text.encode('ascii', errors='replace')).hexdigest()


 def _read_git_log(gpath, since=None):
     """Run ``git log --all --numstat`` for the checkout at ``gpath``.

     When ``since`` (epoch seconds) is given, only commits after that moment
     are requested.  The output is decoded as UTF-8 with undecodable bytes
     replaced, and line endings are normalized to ``\\n``.
     """
     # An argument list (no shell) keeps paths with spaces or shell
     # metacharacters from being reinterpreted.
     cmd = ["git", "--git-dir", os.path.join(gpath, ".git"), "log"]
     if since is not None:
         cmd.append("--after=%s" % time.strftime("%Y-%b-%d 0:00", time.gmtime(since)))
     cmd.extend(["--all", _GIT_LOG_FORMAT, "--numstat"])
     raw = subprocess.check_output(cmd)
     text = raw.decode("utf-8", errors="replace")
     return text.replace("\r\n", "\n").replace("\r", "\n")


 def _record_file_change(history, filename, commit, stamp, author_email, committer_email):
     """Fold one touched file into ``history`` (filename -> latest/earliest info).

     The hash, timestamp and e-mail fields follow the newest commit seen for
     the file; ``created`` follows the oldest commit seen for it.
     """
     entry = history.get(filename)
     if entry is None:
         history[filename] = {
             'hash': commit,
             'filename': filename,
             'timestamp': stamp,
             'created': stamp,
             'author_email': author_email,
             'committer_email': committer_email,
         }
         return
     if entry['timestamp'] < stamp:
         entry['hash'] = commit
         entry['timestamp'] = stamp
         entry['author_email'] = author_email
         entry['committer_email'] = committer_email
     if entry['created'] > stamp:
         entry['created'] = stamp


 def _person_doc(organisation, name, address):
     """Build the upsert document describing one committer or author."""
     return {
         'upsert': True,
         'name': name,
         'email': address,
         'address': address,
         'organisation': organisation,
         'id': _sha1_hex("%s%s" % (organisation, address)),
     }


 def scan(KibbleBit, source):
     """Conduct a census scan of a synced git checkout.

     Reads the commit log of the repository checked out under
     ``<scratchdir>/<organisation>/git/<sourceID>`` and, for every commit
     parsed from it, appends two ``person`` documents (committer, author), a
     ``code_commit`` and a ``code_commit_unique`` document.  Afterwards one
     ``file_history`` document is appended per file seen in the log.

     Nothing happens unless ``source`` has a ``steps`` entry whose sync step
     is marked good and the checkout directory exists.

     Args:
         KibbleBit: scanner context; this function uses its ``config``
             mapping and its ``updateSource``, ``pprint``, ``append`` and
             ``exists`` methods.
         source: source record (dict).  ``sourceID``, ``sourceURL`` and
             ``organisation`` are read; ``steps['census']`` and ``census``
             are written.  A positive ``census`` value from an earlier run
             limits the log to commits after that time minus 62 days.

     Returns:
         None.

     Raises:
         subprocess.CalledProcessError: if ``git log`` exits non-zero.
     """
     rid = source['sourceID']
     url = source['sourceURL']
     rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation'])
     org = source['organisation']
     gpath = os.path.join(rootpath, rid)

     synced = 'steps' in source and source['steps']['sync']['good']
     if not (synced and os.path.exists(gpath)):
         return

     source['steps']['census'] = _census_step('started', running=True)
     KibbleBit.updateSource(source)

     # Did we do a census before?  If so, only re-read the recent history.
     previous = source.get('census') or 0
     since = previous - _RESCAN_WINDOW if previous > 0 else None
     log_text = _read_git_log(gpath, since)

     KibbleBit.pprint("Parsing log for %s (%s)..." % (rid, url))
     modificationDates = {}
     for m in _COMMIT_RE.finditer(log_text):
         ch, ce, cn, ae, an = m.group(1, 2, 3, 4, 5)
         ct = int(m.group(6))
         insert = 0
         delete = 0
         files_touched = set()
         for stat in _NUMSTAT_RE.finditer(m.group(7)):
             insert += int(stat.group(1))
             delete += int(stat.group(2))
             filename = stat.group(3)
             files_touched.add(filename)
             _record_file_change(modificationDates, filename, ch, ct, ae, ce)
             # Absurd running totals are zeroed; merely huge ones are only
             # reported (the counts are still recorded despite the message).
             if insert > _ABSURD_LINE_COUNT:
                 insert = 0
             if delete > _ABSURD_LINE_COUNT:
                 delete = 0
             if delete > _HUGE_LINE_COUNT or insert > _HUGE_LINE_COUNT:
                 KibbleBit.pprint("gigantic diff for %s (%s), ignoring" % (gpath, source['sourceURL']))

         # Cap the list of changed files at 1023 entries.
         filelist = list(files_touched)[:1023]

         day = ct - (ct % _DAY)
         # NOTE(review): 'date' is rendered from the day bucket, so its time
         # of day is always 00:00:00 while 'ts' keeps the exact commit time.
         # Kept as-is; confirm whether the exact time was intended.
         common = {
             'organisation': org,
             'sourceID': rid,
             'ts': ct,
             'tsday': day,
             'date': time.strftime(_DATE_FORMAT, time.gmtime(day)),
             'committer_name': cn,
             'committer_email': ce,
             'author_name': an,
             'author_email': ae,
             'insertions': insert,
             'deletions': delete,
             'vcs': 'git',
             'files_changed': filelist,
         }
         # Per-repository commit document, keyed by repository and hash.
         js = dict(common, id=rid + "/" + ch, sourceURL=url)
         # Hash-keyed commit document: only ever names the last source/repo
         # that reported this hash.
         jsx = dict(common, id=ch, repository=rid)

         KibbleBit.append('person', _person_doc(org, cn, ce))
         KibbleBit.append('person', _person_doc(org, an, ae))
         KibbleBit.append('code_commit', js)
         KibbleBit.append('code_commit_unique', jsx)

     KibbleBit.pprint("Scanning file changes for %s" % source['sourceURL'])
     for filename, entry in modificationDates.items():
         fid = _sha1_hex("%s/%s" % (rid, filename))
         jsfe = {
             'upsert': True,
             'id': fid,
             'organisation': org,
             'sourceID': rid,
             'ts': entry['timestamp'],
             'date': time.strftime(_DATE_FORMAT, time.gmtime(entry['timestamp'])),
             'committer_email': entry['committer_email'],
             'author_email': entry['author_email'],
             'hash': entry['hash'],
         }
         # Creation fields are only sent for files without an existing
         # document (presumably so a partial rescan cannot overwrite the
         # stored creation time - confirm against the datastore semantics).
         if not KibbleBit.exists('file_history', fid):
             jsfe['created'] = entry['created']
             jsfe['createdDate'] = time.strftime(_DATE_FORMAT, time.gmtime(entry['created']))
         KibbleBit.append('file_history', jsfe)

     source['steps']['census'] = _census_step('completed', running=False)
     source['census'] = time.time()
     KibbleBit.updateSource(source)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import os
	import re
	import subprocess
	import time
	import tempfile
	import hashlib
	import email.utils
	import datetime, time

	title = "Census Scanner for Git"
	version = "0.1.0"


	def accepts(source):
	    """Return True when this scanner should process ``source``.

	    Plain ``git`` sources are always accepted.  ``github`` sources are
	    accepted unless their ``issuesonly`` flag is set to something other
	    than ``False``; every other source type is rejected.
	    """
	    kind = source['type']
	    if kind == 'git':
	        return True
	    # A GitHub repo may be registered for its issues only, in which case
	    # the code itself must not be analyzed.
	    return bool(kind == 'github' and source.get('issuesonly', False) == False)


	# Seconds per day; commit timestamps are bucketed on multiples of this.
	_DAY = 86400

	# An incremental census re-reads this much history before the previous run.
	_RESCAN_WINDOW = 62 * _DAY

	# Running per-commit line totals above this are treated as bogus and reset.
	_ABSURD_LINE_COUNT = 100000000

	# Running per-commit line totals above this only trigger a log message.
	_HUGE_LINE_COUNT = 1000000

	# Pretty-format handed to ``git log``; each commit becomes one
	# "::<hash>|<committer email>|<committer name>|<author email>|<author name>|<commit time>"
	# header line followed by its --numstat lines.
	_GIT_LOG_FORMAT = "--pretty=format:::%H|%ce|%cn|%ae|%an|%ct"

	# One commit record: the header fields plus everything up to the next ':'
	# (the start of the following header) or the end of the log.  Accepting the
	# end of input is what lets the final record in the log match.
	_COMMIT_RE = re.compile(
	    r":([a-f0-9]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|([^\r\n|]+)\|(\d+)"
	    r"\r?\n([^:]+?)(?::|\Z)"
	)

	# One --numstat line: "<insertions> <deletions> <path>".  Lines whose counts
	# are not plain digits do not match and are skipped.
	_NUMSTAT_RE = re.compile(r"(\d+)[ \t]+(\d+)[ \t]+([^\r\n]+)")

	_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"


	def _census_step(phase, running):
	    """Build the ``steps['census']`` status record for the given phase."""
	    stamp = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
	    return {
	        'time': time.time(),
	        'status': 'Census count %s at ' % phase + stamp,
	        'running': running,
	        'good': True,
	    }


	def _sha1_hex(text):
	    """Return the SHA-1 hex digest of ``text``, non-ASCII characters replaced."""
	    return hashlib.sha1(text.encode('ascii', errors='replace')).hexdigest()


	def _read_git_log(gpath, since=None):
	    """Run ``git log --all --numstat`` for the checkout at ``gpath``.

	    When ``since`` (epoch seconds) is given, only commits after that moment
	    are requested.  The output is decoded as UTF-8 with undecodable bytes
	    replaced, and line endings are normalized to ``\\n``.
	    """
	    # An argument list (no shell) keeps paths with spaces or shell
	    # metacharacters from being reinterpreted.
	    cmd = ["git", "--git-dir", os.path.join(gpath, ".git"), "log"]
	    if since is not None:
	        cmd.append("--after=%s" % time.strftime("%Y-%b-%d 0:00", time.gmtime(since)))
	    cmd.extend(["--all", _GIT_LOG_FORMAT, "--numstat"])
	    raw = subprocess.check_output(cmd)
	    text = raw.decode("utf-8", errors="replace")
	    return text.replace("\r\n", "\n").replace("\r", "\n")


	def _record_file_change(history, filename, commit, stamp, author_email, committer_email):
	    """Fold one touched file into ``history`` (filename -> latest/earliest info).

	    The hash, timestamp and e-mail fields follow the newest commit seen for
	    the file; ``created`` follows the oldest commit seen for it.
	    """
	    entry = history.get(filename)
	    if entry is None:
	        history[filename] = {
	            'hash': commit,
	            'filename': filename,
	            'timestamp': stamp,
	            'created': stamp,
	            'author_email': author_email,
	            'committer_email': committer_email,
	        }
	        return
	    if entry['timestamp'] < stamp:
	        entry['hash'] = commit
	        entry['timestamp'] = stamp
	        entry['author_email'] = author_email
	        entry['committer_email'] = committer_email
	    if entry['created'] > stamp:
	        entry['created'] = stamp


	def _person_doc(organisation, name, address):
	    """Build the upsert document describing one committer or author."""
	    return {
	        'upsert': True,
	        'name': name,
	        'email': address,
	        'address': address,
	        'organisation': organisation,
	        'id': _sha1_hex("%s%s" % (organisation, address)),
	    }


	def scan(KibbleBit, source):
	    """Conduct a census scan of a synced git checkout.

	    Reads the commit log of the repository checked out under
	    ``<scratchdir>/<organisation>/git/<sourceID>`` and, for every commit
	    parsed from it, appends two ``person`` documents (committer, author), a
	    ``code_commit`` and a ``code_commit_unique`` document.  Afterwards one
	    ``file_history`` document is appended per file seen in the log.

	    Nothing happens unless ``source`` has a ``steps`` entry whose sync step
	    is marked good and the checkout directory exists.

	    Args:
	        KibbleBit: scanner context; this function uses its ``config``
	            mapping and its ``updateSource``, ``pprint``, ``append`` and
	            ``exists`` methods.
	        source: source record (dict).  ``sourceID``, ``sourceURL`` and
	            ``organisation`` are read; ``steps['census']`` and ``census``
	            are written.  A positive ``census`` value from an earlier run
	            limits the log to commits after that time minus 62 days.

	    Returns:
	        None.

	    Raises:
	        subprocess.CalledProcessError: if ``git log`` exits non-zero.
	    """
	    rid = source['sourceID']
	    url = source['sourceURL']
	    rootpath = "%s/%s/git" % (KibbleBit.config['scanner']['scratchdir'], source['organisation'])
	    org = source['organisation']
	    gpath = os.path.join(rootpath, rid)

	    synced = 'steps' in source and source['steps']['sync']['good']
	    if not (synced and os.path.exists(gpath)):
	        return

	    source['steps']['census'] = _census_step('started', running=True)
	    KibbleBit.updateSource(source)

	    # Did we do a census before?  If so, only re-read the recent history.
	    previous = source.get('census') or 0
	    since = previous - _RESCAN_WINDOW if previous > 0 else None
	    log_text = _read_git_log(gpath, since)

	    KibbleBit.pprint("Parsing log for %s (%s)..." % (rid, url))
	    modificationDates = {}
	    for m in _COMMIT_RE.finditer(log_text):
	        ch, ce, cn, ae, an = m.group(1, 2, 3, 4, 5)
	        ct = int(m.group(6))
	        insert = 0
	        delete = 0
	        files_touched = set()
	        for stat in _NUMSTAT_RE.finditer(m.group(7)):
	            insert += int(stat.group(1))
	            delete += int(stat.group(2))
	            filename = stat.group(3)
	            files_touched.add(filename)
	            _record_file_change(modificationDates, filename, ch, ct, ae, ce)
	            # Absurd running totals are zeroed; merely huge ones are only
	            # reported (the counts are still recorded despite the message).
	            if insert > _ABSURD_LINE_COUNT:
	                insert = 0
	            if delete > _ABSURD_LINE_COUNT:
	                delete = 0
	            if delete > _HUGE_LINE_COUNT or insert > _HUGE_LINE_COUNT:
	                KibbleBit.pprint("gigantic diff for %s (%s), ignoring" % (gpath, source['sourceURL']))

	        # Cap the list of changed files at 1023 entries.
	        filelist = list(files_touched)[:1023]

	        day = ct - (ct % _DAY)
	        # NOTE(review): 'date' is rendered from the day bucket, so its time
	        # of day is always 00:00:00 while 'ts' keeps the exact commit time.
	        # Kept as-is; confirm whether the exact time was intended.
	        common = {
	            'organisation': org,
	            'sourceID': rid,
	            'ts': ct,
	            'tsday': day,
	            'date': time.strftime(_DATE_FORMAT, time.gmtime(day)),
	            'committer_name': cn,
	            'committer_email': ce,
	            'author_name': an,
	            'author_email': ae,
	            'insertions': insert,
	            'deletions': delete,
	            'vcs': 'git',
	            'files_changed': filelist,
	        }
	        # Per-repository commit document, keyed by repository and hash.
	        js = dict(common, id=rid + "/" + ch, sourceURL=url)
	        # Hash-keyed commit document: only ever names the last source/repo
	        # that reported this hash.
	        jsx = dict(common, id=ch, repository=rid)

	        KibbleBit.append('person', _person_doc(org, cn, ce))
	        KibbleBit.append('person', _person_doc(org, an, ae))
	        KibbleBit.append('code_commit', js)
	        KibbleBit.append('code_commit_unique', jsx)

	    KibbleBit.pprint("Scanning file changes for %s" % source['sourceURL'])
	    for filename, entry in modificationDates.items():
	        fid = _sha1_hex("%s/%s" % (rid, filename))
	        jsfe = {
	            'upsert': True,
	            'id': fid,
	            'organisation': org,
	            'sourceID': rid,
	            'ts': entry['timestamp'],
	            'date': time.strftime(_DATE_FORMAT, time.gmtime(entry['timestamp'])),
	            'committer_email': entry['committer_email'],
	            'author_email': entry['author_email'],
	            'hash': entry['hash'],
	        }
	        # Creation fields are only sent for files without an existing
	        # document (presumably so a partial rescan cannot overwrite the
	        # stored creation time - confirm against the datastore semantics).
	        if not KibbleBit.exists('file_history', fid):
	            jsfe['created'] = entry['created']
	            jsfe['createdDate'] = time.strftime(_DATE_FORMAT, time.gmtime(entry['created']))
	        KibbleBit.append('file_history', jsfe)

	    source['steps']['census'] = _census_step('completed', running=False)
	    source['census'] = time.time()
	    KibbleBit.updateSource(source)