blob: ba188ea1be930c2d73c0409d792cbbb9e1a97f0f [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
########################################################################
# OPENAPI-URI: /api/mail/map
########################################################################
# get:
# responses:
# '200':
# content:
# application/json:
# schema:
# $ref: '#/components/schemas/Sloc'
# description: 200 Response
# default:
# content:
# application/json:
# schema:
# $ref: '#/components/schemas/Error'
# description: unexpected error
# security:
# - cookieAuth: []
# summary: Shows a breakdown of email author reply mappings
# post:
# requestBody:
# content:
# application/json:
# schema:
# $ref: '#/components/schemas/defaultWidgetArgs'
# responses:
# '200':
# content:
# application/json:
# schema:
# $ref: '#/components/schemas/Sloc'
# description: 200 Response
# default:
# content:
# application/json:
# schema:
# $ref: '#/components/schemas/Error'
# description: unexpected error
# security:
# - cookieAuth: []
# summary: Shows a breakdown of email author reply mappings
#
########################################################################
"""
This is the committer relationship list renderer for Kibble
"""
import json
import time
import hashlib
import copy
import re
import math
# Regex matched against sender/reply-to values to filter out known bots and
# automated mailers (issue trackers, CI systems, review tools, list aliases)
# so they don't show up as "authors" in the reply map.
badBots = r"(JIRA|Hudson|jira|jenkins|GitHub|git@|dev@|bugzilla|gerrit)"
def run(API, environ, indata, session):
    """
    Render a node/link graph of which email authors reply to whom.

    Serves GET/POST /api/mail/map. Yields a single JSON document with
    ``nodes`` (one per author, sized by reply volume) and ``links``
    (one per reply relationship, weighted by shared correspondents),
    suitable for a link chart in the UI.

    :param API: plugin API object; used for raising HTTP-style exceptions
    :param environ: WSGI environment (unused here)
    :param indata: request parameters (view, subfilter, from/to, source,
                   search, email, author, links, collapse)
    :param session: user session, carries the ES connection and view helpers
    :raises API.exception: 403 when the caller is not logged in
    """
    # We need to be logged in for this!
    if not session.user:
        raise API.exception(403, "You must be logged in to use this API endpoint!")

    now = time.time()

    # First, fetch the view if we have such a thing enabled
    viewList = []
    if indata.get("view"):
        viewList = session.getView(indata.get("view"))
    if indata.get("subfilter"):
        viewList = session.subFilter(indata.get("subfilter"), view=viewList)

    dateTo = indata.get("to", int(time.time()))
    dateFrom = indata.get(
        "from", dateTo - (86400 * 30 * 6)
    )  # Default to a 6 month span
    span = dateTo - dateFrom

    ####################################################################
    ####################################################################
    dOrg = session.user["defaultOrganisation"] or "apache"
    query = {
        "query": {
            "bool": {
                "must": [
                    {"range": {"ts": {"from": dateFrom, "to": dateTo}}},
                    {"term": {"organisation": dOrg}},
                ]
            }
        }
    }
    # Source-specific or view-specific??
    if indata.get("source"):
        query["query"]["bool"]["must"].append(
            {"term": {"sourceID": indata.get("source")}}
        )
    elif viewList:
        query["query"]["bool"]["must"].append({"terms": {"sourceID": viewList}})
    if indata.get("search"):
        query["query"]["bool"]["must"].append(
            {"regexp": {"subject": indata.get("search")}}
        )
    if indata.get("email"):
        query["query"]["bool"]["minimum_should_match"] = 1
        query["query"]["bool"]["should"] = [
            {"term": {"replyto.keyword": indata.get("email")}},
            {"term": {"sender": indata.get("email")}},
        ]

    # Count emails, this period, per author (replyto by default, or the
    # sender side when ?author is set)
    query["aggs"] = {
        "per_ml": {
            "terms": {
                "field": "replyto.keyword" if not indata.get("author") else "sender",
                "size": 150,
            }
        }
    }
    res = session.DB.ES.search(
        index=session.DB.dbname, doc_type="email", size=0, body=query
    )

    repos = {}  # author -> list of people they correspond with
    repo_commits = {}  # author -> reply/email count
    max_emails = 0
    max_links = 0
    max_shared = 0
    minLinks = indata.get("links", 1)

    # The should/minimum_should_match clauses were only needed for the
    # top-level aggregation; drop them before the per-author sub-queries.
    if indata.get("email"):
        del query["query"]["bool"]["should"]
        del query["query"]["bool"]["minimum_should_match"]

    # For each author found, count emails and gather their correspondents
    for doc in res["aggregations"]["per_ml"]["buckets"]:
        sourceID = doc["key"]
        emails = doc["doc_count"]
        if re.search(badBots, sourceID):  # No bots
            continue
        if emails > (span / 86400) * 4:  # More than 4/day and we consider you a bot!
            continue
        # Gather the unique correspondents (the opposite field of the
        # top-level aggregation)
        query["aggs"] = {
            "per_ml": {
                "terms": {
                    "field": "sender"
                    if not indata.get("author")
                    else "replyto.keyword",
                    "size": 5000,
                }
            }
        }
        xquery = copy.deepcopy(query)
        xquery["query"]["bool"]["must"].append(
            {
                "term": {
                    "replyto.keyword"
                    if not indata.get("author")
                    else "sender": sourceID
                }
            }
        )
        xres = session.DB.ES.search(
            index=session.DB.dbname, doc_type="email", size=0, body=xquery
        )
        authors = [person["key"] for person in xres["aggregations"]["per_ml"]["buckets"]]
        if emails > max_emails:
            max_emails = emails
        repos[sourceID] = authors
        repo_commits[sourceID] = emails

    # Now, figure out which authors share the same correspondents
    repo_links = {}  # "A||B" -> number of shared correspondents
    repo_notoriety = {}  # author -> set of connected authors
    repodatas = {}  # author -> person profile document from ES
    repo_authors = {}  # author -> set of unique correspondents

    # Grab the person profile of every author we found (keyed by a
    # sha1 of org + address, as written by the scanners)
    for ID, repo in repos.items():
        hID = hashlib.sha1(
            ("%s%s" % (dOrg, ID)).encode("ascii", errors="replace")
        ).hexdigest()
        if not session.DB.ES.exists(index=session.DB.dbname, doc_type="person", id=hID):
            continue
        repodatas[ID] = session.DB.ES.get(
            index=session.DB.dbname, doc_type="person", id=hID
        )

    for ID, repo in repos.items():
        mylinks = {}
        if ID not in repodatas:
            continue
        repodata = repodatas[ID]
        oID = ID
        # ?collapse=regex folds several addresses into one node via the
        # first capture group matched against the profile email
        if indata.get("collapse"):
            m = re.search(indata.get("collapse"), repodata["_source"]["email"])
            if m:
                ID = m.group(1)
        xlinks = []
        for xID, xrepo in repos.items():
            if xID in repodatas:
                xrepodata = repodatas[xID]
                if indata.get("collapse"):
                    m = re.search(indata.get("collapse"), xrepodata["_source"]["email"])
                    if m:
                        xID = m.group(1)
                if xID != ID:
                    if ID in xrepo:
                        xlinks.append(xID)
                    lname = "%s||%s" % (ID, xID)  # Link name
                    rname = "%s||%s" % (xID, ID)  # Reverse link name
                    if (
                        len(xlinks) > 0
                        and rname not in repo_links
                        and len(xlinks) >= minLinks
                    ):
                        mylinks[ID] = mylinks.get(ID, 0) + 1
                        repo_links[lname] = repo_links.get(lname, 0) + len(
                            xlinks
                        )  # How many correspondents in common between author A and B?
                        if repo_links[lname] > max_shared:
                            max_shared = repo_links[lname]
                    elif rname in repo_links:
                        repo_links[rname] = repo_links.get(rname, 0) + len(xlinks)
        if ID not in repo_notoriety:
            repo_notoriety[ID] = set()
        repo_notoriety[ID].update(
            mylinks.keys()
        )  # How many other authors is this author connected to?
        if ID not in repo_authors:
            repo_authors[ID] = set()
        repo_authors[ID].update(repo)  # Unique correspondents for this author
        # When collapsed, fold the original node's counts into the
        # collapsed one
        if ID != oID:
            repo_commits[ID] = repo_commits.get(ID, 0) + repo_commits[oID]
        if repo_commits[ID] > max_emails:
            max_emails = repo_commits[ID]  # Used for calculating max link thickness
        if len(repo_notoriety[ID]) > max_links:
            max_links = len(repo_notoriety[ID])

    # Now, pull it all together!
    nodes = []
    links = []
    existing_repos = []
    for sourceID, ns in repo_notoriety.items():
        lsize = 0
        for k in repo_links.keys():
            fr, to = k.split("||")
            if fr == sourceID or to == sourceID:
                lsize += 1
        asize = len(repo_authors[sourceID])
        doc = {
            "id": sourceID,
            "gravatar": hashlib.md5(sourceID.lower().encode("utf-8")).hexdigest(),
            "name": repodatas[sourceID]["_source"].get("name", sourceID),
            "replies": repo_commits[sourceID],
            "authors": asize,
            "links": lsize,
            # Log-scaled sphere size, clamped to a minimum of 5
            "size": max(
                5, (1 - abs(math.log10(repo_commits[sourceID] / max_emails))) * 45
            ),
            "tooltip": "%u connections, %u fellows, %u replies to"
            % (lsize, asize, repo_commits[sourceID]),
        }
        nodes.append(doc)
        existing_repos.append(sourceID)
    # Only emit links whose both endpoints survived as nodes
    for k, s in repo_links.items():
        size = s
        fr, to = k.split("||")
        if fr in existing_repos and to in existing_repos:
            doc = {
                "source": fr,
                "target": to,
                "value": max(1, (size / max_shared) * 5),
                "name": "%s ↔ %s" % (fr, to),
                "tooltip": "%u topics exchanged" % size,
            }
            links.append(doc)
    JSON_OUT = {
        "maxLinks": max_links,
        "maxShared": max_shared,
        "widgetType": {"chartType": "link"},  # Recommendation for the UI
        "links": links,
        "nodes": nodes,
        "okay": True,
        "responseTime": time.time() - now,
    }
    yield json.dumps(JSON_OUT)