#103 #96: add @mocobeta's scratchy tool with one thorn removed so I can iterate locally on the mapping file; allow commented-out lines in mapping file; log a warning on malformed lines when loading mapping file; add a few more entries to mapping file
diff --git a/migration/mappings-data/account-map.csv.20220722.verified b/migration/mappings-data/account-map.csv.20220722.verified
index dd9422d..ca920fb 100644
--- a/migration/mappings-data/account-map.csv.20220722.verified
+++ b/migration/mappings-data/account-map.csv.20220722.verified
@@ -176,10 +176,12 @@
TimOwen,timatbw,Tim Owen
sarowe,sarowe,Steven Rowe
steve_rowe,sarowe,Steven Rowe
+sarowe@syr.edu,sarowe,Steven Rowe
gf2121,gf2121,Feng Guo
mharwood,markharwood,Mark Harwood
markharw00d@yahoo.co.uk,markharwood,Mark Harwood
hossman,hossman,Chris M. Hostetter
+hossman_lucene@fucit.org,hossman,Chris M. Hostetter
munendrasn,munendrasn,Munendra S N
vajda,ovalhub,Andi Vajda
manish82,manishbafna,Manish
@@ -356,3 +358,8 @@
marcussorealheis,marcussorealheis,Marcus Eagan
pru30,praveennish,Praveen Nishchal
steffkes,steffkes,Stefan Matheis
+ab,sigram,Andrzej Bialecki
+goankur,goankur,Ankur Goel
+# goller@detego-software.de,???,Christoph Goller
+# markus17,???,Markus Jelsma
+# ???,paul.elschot@xs4all.nl,Paul Elschot
diff --git a/migration/src/common.py b/migration/src/common.py
index 0ce828e..9f50564 100644
--- a/migration/src/common.py
+++ b/migration/src/common.py
@@ -142,8 +142,12 @@
with open(account_mapping_file) as fp:
fp.readline() # skip header
for line in fp:
+ line = line.strip()
+ if line.startswith('#'):
+ continue
cols = line.strip().split(",")
if len(cols) < 2:
+ logger.warning(f"Skipping malformed entry {line} (< 2 columns) in {account_mapping_file}")
continue
id_map[cols[0]] = cols[1] # jira name -> github account
return id_map
@@ -235,4 +239,4 @@
"Schema and Analysis": None,
"Tests": None,
"contrib - Solr Cell (Tika extraction)": None,
-}
\ No newline at end of file
+}
diff --git a/migration/src/find_orphans.py b/migration/src/find_orphans.py
new file mode 100644
index 0000000..5a7cff2
--- /dev/null
+++ b/migration/src/find_orphans.py
@@ -0,0 +1,42 @@
+from operator import itemgetter
+from pathlib import Path
+import json
+import re
+import itertools
+from collections import defaultdict
+
+from common import JIRA_DUMP_DIRNAME, MAPPINGS_DATA_DIRNAME, JIRA_USERS_FILENAME, ACCOUNT_MAPPING_FILENAME, read_jira_users_map, read_account_map
+from jira_util import REGEX_MENION_TILDE, extract_description, extract_comments
+
+dump_dir = Path(__file__).resolve().parent.parent.joinpath(JIRA_DUMP_DIRNAME)
+mappings_dir = Path(__file__).resolve().parent.parent.joinpath(MAPPINGS_DATA_DIRNAME)
+jira_users_file = mappings_dir.joinpath(JIRA_USERS_FILENAME)
+jira_users = read_jira_users_map(jira_users_file) if jira_users_file.exists() else {}
+account_mapping_file = mappings_dir.joinpath(ACCOUNT_MAPPING_FILENAME)
+account_map = read_account_map(account_mapping_file) if account_mapping_file.exists() else {}
+
+
+def extract_tilde_mentions(text):
+ mentions = re.findall(REGEX_MENION_TILDE, text)
+ mentions = set(filter(lambda x: x != '', itertools.chain.from_iterable(mentions)))
+ mentions = [x[2:-1] for x in mentions]
+ return mentions
+
+
+orphan_ids = defaultdict(int)
+for dump_file in dump_dir.glob("LUCENE-*.json"):
+ mentions = set([])
+ with open(dump_file) as fp:
+ o = json.load(fp)
+ description = extract_description(o)
+ mentions.update(extract_tilde_mentions(description))
+ comments = extract_comments(o)
+ for (_, _, comment, _, _, _) in comments:
+ mentions.update(extract_tilde_mentions(comment))
+ for m in mentions:
+ if m not in account_map:
+ orphan_ids[m] += 1
+
+orphan_ids = sorted(orphan_ids.items(), key=itemgetter(1), reverse=True)
+for id, count in orphan_ids:
+ print(f'{id}\t{count}')