blob: 484e5acef18778893b5691ba127c360ca061afdf [file] [log] [blame]
#!/usr/bin/env ruby
require 'json'
# Emit the readiness warning on stderr so it never mixes with the JSON report
# that this script prints on stdout.
warn 'DANGER, WILL ROBINSON! THIS IS NOT READY FOR PRODUCTION USE!'
# Map usernames between different systems (id.a.o and JIRA|Confluence)
# Input data is:
# committers = [{ID=>'curcuru', NAME =>'Shane Curcuru', MAIL=>['mail1', 'mail2',...]}, ...]
# other = [{ID=>'curcuru', NAME =>'Shane Curcuru', MAIL=>'mailone'}, ...]
# Analyzes matching IDs and emails, and returns:
# matches = hash by committer id of all committers, and if they matched
# SAME:... committer ID and an email address match exactly
# Most likely committer has same ID on both
# DIFF:... committer ID matches, but none of the emails matched
# Most likely the other account is a different person than committer
# NONE:... there were no ID matches
# Committer's ID is not found in other system
# crossmatches = hash by email address of any other matches from
# any of the other system's emails to any committer's emails
# REVIEW:... email address(es) matched, but IDs did not
# Note: emails may not be unique across accounts on either side
# Need to manually investigate, since may involve multiple people/accounts
# DIFF: an email address matched, but the IDs did not
# Manually investigate to see why
# NOTE: You *must* manually evaluate the results!
# Cross-references committer accounts with the accounts of another system
# (e.g. JIRA or Confluence) by id and by email address.
#
# All methods are callable directly on the module (NameMap.report, ...).
# Accounts are plain hashes keyed by the ID, NAME and MAIL constants below.
module NameMap
  extend self

  # Location of the committer roster; not referenced by the code in this module.
  COMMITTER_JSON = 'https://whimsy.apache.org/roster/committer/index.json'.freeze

  # Keys used in both committer and other-system account hashes
  ID = 'id'.freeze
  MAIL = 'mail'.freeze
  NAME = 'name'.freeze

  TEST_COMMITTERS = [ # Drawn from Whimsy's committer data
    {
      'id' => 'curcuru',
      'name' => 'Shane Curcuru',
      'mail' => [
        'asf@shanecurcuru.org',
        'asfl@shanecurcuru.org',
        'curcuru@apache.org'
      ],
      'member' => true
    },
    {
      'id' => 'makemyday',
      'name' => 'Clint Eastwood',
      'mail' => [
        'clint@eastwood.com',
        'gun@smoke.org'
      ],
    },
    {
      'id' => 'robocop',
      'name' => 'Peter Weller',
      'mail' => [
        'peter@buckaroo.com',
        'both@lists.find',
      ],
    },
    {
      'id' => 'laurel',
      'name' => 'Yanni (musician)',
      'mail' => [
        'both@lists.find',
      ],
    },
    {
      'id' => 'emailUnMatch',
      'name' => 'Test Case1c',
      'mail' => [
        'email@example.com',
      ],
    },
  ].freeze

  TEST_OTHER = [ # Any other system must provide id,name,email for each user
    {
      'id' => 'curcuru',
      'name' => 'Shane Curcuru',
      'mail' => 'asf@shanecurcuru.org',
    },
    {
      'id' => 'bogie',
      'name' => 'Rick Blaine',
      'mail' => 'piano@sam.org',
    },
    {
      'id' => 'makemyday',
      'name' => 'Doris Day',
      'mail' => 'doris@day.movies',
    },
    {
      'id' => 'yanni',
      'name' => 'Laurel Hardy',
      'mail' => 'both@lists.find',
    },
    {
      'id' => 'yannidouble',
      'name' => 'Laurel and Hardy',
      'mail' => 'both@lists.find',
    },
    {
      'id' => 'emailNotMatch',
      'name' => 'Test Case1o',
      'mail' => 'email@example.com',
    },
  ].freeze

  # Read committer accounts
  # @param io JSON text, or an object responding to #read that yields JSON text;
  #   nil/false selects the built-in TEST_COMMITTERS sample data
  # @return parsed json data (array of committer hashes)
  def get_committers(io)
    return TEST_COMMITTERS unless io

    # Accept both a ready-made String and a stream, as the parameter name promises
    text = io.respond_to?(:read) ? io.read : io
    JSON.parse(text)
  end

  # Read other system accounts
  # TODO Depends on file format of exported other system accounts
  # @param f filename to read from (currently ignored; sample data is always returned)
  # @return TEST_OTHER sample data
  def get_other(f)
    TEST_OTHER
  end

  # Transform committer accounts into lookup hashes
  # @param committers array from COMMITTER_JSON
  # @return byid, bymail - hashes for lookups to committer accounts
  #   byid - hash by id of data (a later duplicate id replaces an earlier one)
  #   bymail - hash by email of array of data (in case of non-unique emails)
  def hash_committers(committers)
    byid = {}
    bymail = {}
    committers.each do |hsh|
      byid[hsh[ID]] = hsh
      # Committers can have multiple emails; Array() also tolerates a
      # missing entry (nil) or a single bare address
      Array(hsh[MAIL]).each do |addr|
        (bymail[addr] ||= []) << hsh
      end
    end
    [byid, bymail]
  end

  # Transform other system accounts into lookup hashes
  # @param other array of hashes including 'id', 'name', 'mail' keys
  # @return byid, bymail - hashes for lookups to other system accounts
  #   byid - hash by id of data (a later duplicate id replaces an earlier one)
  #   bymail - hash by email of array of data (in case of non-unique emails)
  def hash_other(other)
    byid = {}
    bymail = {}
    other.each do |hsh|
      byid[hsh[ID]] = hsh
      (bymail[hsh[MAIL]] ||= []) << hsh
    end
    [byid, bymail]
  end

  # Compare committer ids to other system account ids
  # @param cids - hash by id of committer data
  # @param cmails - hash by email of [committer1, ...]
  # @param oids - hash by id of other system account data
  # @param omails - hash by email of [other1, ...]
  # @return matches, crossmatches - hash by committer id of SAME:/DIFF:/NONE:
  #   status strings; hash by email of DIFF:/REVIEW: cross-match strings
  def compare(cids, cmails, oids, omails)
    # For every committer, check for a matching account in other system
    matches = {}
    cids.each do |cid, committer|
      matches[cid] =
        if oids.key?(cid)
          id_match_status(committer, oids[cid])
        else
          'NONE:no id match found'
        end
    end

    # Also cross-check email addresses of other system to all committer emails
    crossmatches = {}
    omails.each do |omail, other_accounts|
      next unless cmails.key?(omail)

      status = mail_crossmatch_status(omail, cmails[omail], other_accounts)
      crossmatches[omail] = status if status
    end

    [matches, crossmatches]
  end

  # Compare a committer list to another system's list
  # @param cio JSON text or readable stream of committer accounts (nil: sample data)
  # @param ofile filename to read other system accounts from
  # @return matches, crossmatches - see #compare
  def report(cio = nil, ofile = nil)
    cids, cmails = hash_committers(get_committers(cio))
    oids, omails = hash_other(get_other(ofile))
    compare(cids, cmails, oids, omails)
  end

  # Check for email duplicates in committer roster
  # @param io passed to #get_committers
  # @return dups - hash by email of "id1,id2," strings, for every email listed
  #   by more than one committer entry
  # @return histogram - hash of (number of emails listed) => (number of committers)
  def committer_dups(io)
    cids, cmails = hash_committers(get_committers(io))

    histogram = Hash.new(0)
    cids.each_value { |hsh| histogram[Array(hsh[MAIL]).length] += 1 }

    dups = {}
    cmails.each do |addr, accounts|
      next unless accounts.length > 1

      # Each id keeps its trailing comma, matching the established output format
      dups[addr] = accounts.map { |hsh| "#{hsh[ID]}," }.join
    end

    [dups, histogram]
  end

  private

  # Status string for a committer whose id also exists in the other system:
  # SAME when any committer email equals the other account's email, else DIFF.
  def id_match_status(committer, other)
    mails = Array(committer[MAIL])
    other_desc = "(#{other[NAME]},#{other[MAIL]})"

    matched = mails.find { |addr| addr.eql?(other[MAIL]) }
    return "SAME:email match:(#{committer[NAME]},#{matched}):#{other_desc}" if matched

    # None of our emails matched: show the address itself when there is only one
    mine = mails.length == 1 ? mails.first : "#{mails.length} addresses"
    "DIFF:email no match:(#{committer[NAME]},#{mine}):#{other_desc}"
  end

  # Status string for an email present on both sides, or nil when exactly one
  # account on each side shares it and their ids agree (already reported by id).
  def mail_crossmatch_status(omail, committers, others)
    if committers.length == 1 && others.length == 1
      # Simple case: a unique email on both sides
      committer = committers.first
      other = others.first
      return nil if committer[ID].eql?(other[ID])

      "DIFF:id no match:#{account_label(committer)}:#{account_label(other)}"
    else
      # Complex case: several accounts share the email; list them all for review
      committer_labels = committers.map { |acct| account_label(acct) }.join
      other_labels = others.map { |acct| account_label(acct) }.join
      "REVIEW:#{omail}:#{committer_labels}:#{other_labels}"
    end
  end

  # "(id,name)" label used in cross-match report strings.
  def account_label(account)
    "(#{account[ID]},#{account[NAME]})"
  end
end
#### MAIN TESTING CODE
# Runs the sample comparison and prints both result hashes as JSON.
# Guarded so it only executes when this file is run directly; requiring the
# file from other code then defines NameMap without printing a report.
if __FILE__ == $PROGRAM_NAME
  matches, crossmatches = NameMap.report
  puts JSON.pretty_generate(matches)
  puts JSON.pretty_generate(crossmatches)
  # dups, histogram = NameMap.committer_dups(File.read('committerlist-from-whimsy.json'))
  # puts JSON.pretty_generate(dups)
  # puts JSON.pretty_generate(histogram)
end