#103 #96: add @mocobeta's scratchy tool with one thorn removed so I can iterate locally on the mapping file; allow commented-out lines in mapping file; log a warning on malformed lines when loading mapping file; add a few more entries to mapping file

commit: 6ef20aedbe99b6b6945de555313501ad2c52a411 [log] [tgz]
author: Mike McCandless <mikemccand@apache.org> Mon Aug 01 08:20:43 2022 -0400
committer: Mike McCandless <mikemccand@apache.org> Mon Aug 01 08:20:43 2022 -0400
tree: 6d56c6439593b3c1b2d84666391a3bb93ba46005
parent: b91ce0c11cd5006548f352c811655085485aa0a7 [diff]
diff --git a/migration/mappings-data/account-map.csv.20220722.verified b/migration/mappings-data/account-map.csv.20220722.verified
index dd9422d..ca920fb 100644
--- a/migration/mappings-data/account-map.csv.20220722.verified
+++ b/migration/mappings-data/account-map.csv.20220722.verified

@@ -176,10 +176,12 @@
 TimOwen,timatbw,Tim Owen
 sarowe,sarowe,Steven Rowe
 steve_rowe,sarowe,Steven Rowe
+sarowe@syr.edu,sarowe,Steven Rowe
 gf2121,gf2121,Feng Guo
 mharwood,markharwood,Mark Harwood
 markharw00d@yahoo.co.uk,markharwood,Mark Harwood
 hossman,hossman,Chris M. Hostetter
+hossman_lucene@fucit.org,hossman,Chris M. Hostetter
 munendrasn,munendrasn,Munendra S N
 vajda,ovalhub,Andi Vajda
 manish82,manishbafna,Manish
@@ -356,3 +358,8 @@
 marcussorealheis,marcussorealheis,Marcus Eagan
 pru30,praveennish,Praveen Nishchal
 steffkes,steffkes,Stefan Matheis
+ab,sigram,Andrzej Bialecki
+goankur,goankur,Ankur Goel
+# goller@detego-software.de,???,Christoph Goller
+# markus17,???,Markus Jelsma
+# ???,paul.elschot@xs4all.nl,Paul Elschot

diff --git a/migration/src/common.py b/migration/src/common.py
index 0ce828e..9f50564 100644
--- a/migration/src/common.py
+++ b/migration/src/common.py

@@ -142,8 +142,12 @@
     with open(account_mapping_file) as fp:
         fp.readline()  # skip header
         for line in fp:
+            line = line.strip()
+            if line.startswith('#'):
+                continue
             cols = line.strip().split(",")
             if len(cols) < 2:
+                logger.warning(f"Skipping malformed entry {line} (< 2 columns) in {account_mapping_file}")
                 continue
             id_map[cols[0]] = cols[1]  # jira name -> github account
     return id_map
@@ -235,4 +239,4 @@
     "Schema and Analysis": None,
     "Tests": None,
     "contrib - Solr Cell (Tika extraction)": None,
-}
\ No newline at end of file
+}

diff --git a/migration/src/find_orphans.py b/migration/src/find_orphans.py
new file mode 100644
index 0000000..5a7cff2
--- /dev/null
+++ b/migration/src/find_orphans.py

@@ -0,0 +1,42 @@
+from operator import itemgetter
+from pathlib import Path
+import json
+import re
+import itertools
+from collections import defaultdict
+
+from common import JIRA_DUMP_DIRNAME, MAPPINGS_DATA_DIRNAME, JIRA_USERS_FILENAME, ACCOUNT_MAPPING_FILENAME, read_jira_users_map, read_account_map
+from jira_util import REGEX_MENION_TILDE, extract_description, extract_comments
+
+dump_dir = Path(__file__).resolve().parent.parent.joinpath(JIRA_DUMP_DIRNAME)
+mappings_dir = Path(__file__).resolve().parent.parent.joinpath(MAPPINGS_DATA_DIRNAME)
+jira_users_file = mappings_dir.joinpath(JIRA_USERS_FILENAME)
+jira_users = read_jira_users_map(jira_users_file) if jira_users_file.exists() else {}
+account_mapping_file = mappings_dir.joinpath(ACCOUNT_MAPPING_FILENAME)
+account_map = read_account_map(account_mapping_file) if account_mapping_file.exists() else {}
+
+
+def extract_tilde_mentions(text):
+    mentions = re.findall(REGEX_MENION_TILDE, text)
+    mentions = set(filter(lambda x: x != '', itertools.chain.from_iterable(mentions)))
+    mentions = [x[2:-1] for x in mentions]
+    return mentions
+
+
+orphan_ids = defaultdict(int)
+for dump_file in dump_dir.glob("LUCENE-*.json"):
+    mentions = set([])
+    with open(dump_file) as fp:
+        o = json.load(fp)
+        description = extract_description(o)
+        mentions.update(extract_tilde_mentions(description))
+        comments = extract_comments(o)
+        for (_, _, comment, _, _, _) in comments:
+            mentions.update(extract_tilde_mentions(comment))
+    for m in mentions:
+        if m not in account_map:
+            orphan_ids[m] += 1
+
+orphan_ids = sorted(orphan_ids.items(), key=itemgetter(1), reverse=True)
+for id, count in orphan_ids:
+    print(f'{id}\t{count}')
commit	6ef20aedbe99b6b6945de555313501ad2c52a411	[log] [tgz]
author	Mike McCandless <mikemccand@apache.org>	Mon Aug 01 08:20:43 2022 -0400
committer	Mike McCandless <mikemccand@apache.org>	Mon Aug 01 08:20:43 2022 -0400
tree	6d56c6439593b3c1b2d84666391a3bb93ba46005
parent	b91ce0c11cd5006548f352c811655085485aa0a7 [diff]