Show unmentioned images in issue description (#135)

* show unmentioned images in issue description
* support other image file types

commit: ffda38df689ecc8a2fc8ccad97d31fceb0d2c936 [log] [tgz]
author: Tomoko Uchida <tomoko.uchida.1111@gmail.com> Sun Aug 07 19:29:43 2022 +0900
committer: GitHub <noreply@github.com> Sun Aug 07 06:29:43 2022 -0400
tree: cc597371c39125ea1edabae8cae7595a0beed33a
parent: c4c28c273302140626785ba2ecb106f1b3ccedbf [diff]
diff --git a/migration/src/jira2github_import.py b/migration/src/jira2github_import.py
index 18d0ad2..5ba69c0 100644
--- a/migration/src/jira2github_import.py
+++ b/migration/src/jira2github_import.py

@@ -61,6 +61,7 @@
         priority = extract_priority(o)
         vote_count = extract_vote_count(o)
         parent_issue_key = extract_parent_key(o)
+        comments = extract_comments(o)
 
         reporter_gh = account_map.get(reporter_name)
         reporter = f"{reporter_dispname} (@{reporter_gh})" if reporter_gh else f"{reporter_dispname}"
@@ -74,6 +75,14 @@
             attachment_list_items.append(f"[{filename}]({attachment_url(num, filename, att_repo, att_branch)})" + (f" (versions: {cnt})" if cnt > 1 else ""))
             att_replace_map[filename] = attachment_url(num, filename, att_repo, att_branch)
 
+        # detect unmentioned image files
+        # https://github.com/apache/lucene-jira-archive/issues/126
+        image_files = [x[0] for x in attachments if re.match(r"^.+\.(png|jpg|jpeg|gif|svg|bmp|ico|tif|tiff)$", x[0], flags=re.IGNORECASE)]
+        embedded_image_files = extract_embedded_image_files(description, image_files)
+        for (_, _, comment_body, _, _, _) in comments:
+            embedded_image_files.update(extract_embedded_image_files(comment_body, image_files))
+        unmentioned_images = [x for x in image_files if x not in embedded_image_files]
+
         # embed github issue number next to linked issue keys
         linked_issues_list_items = []
         for jira_key in linked_issues:
@@ -93,6 +102,10 @@
 
         try:
             body = f'{convert_text(description, att_replace_map, account_map, jira_users)}\n\n'
+            for image_file in unmentioned_images:
+                # show orphaned (unmentioned) image files in the issue description
+                att_url = att_replace_map.get(image_file)
+                body += f'![{image_file}]({att_url})\n\n'
         except Exception as e:
             logger.error(traceback.format_exc(limit=100))
             logger.error(f"Failed to convert opening issue description on {jira_issue_id(num)} due to above exception, ({str(e)}); falling back to original Jira description as code block.")
@@ -149,7 +162,6 @@
                     lines.append(line)
             return "\n".join(lines)
 
-        comments = extract_comments(o)
         comments_data = []
         for (comment_author_name, comment_author_dispname, comment_body, comment_created, comment_updated, comment_id) in comments:
             # TODO: since we now have accurate created_at reflected in the github comment, mabye we remove these

diff --git a/migration/src/jira_util.py b/migration/src/jira_util.py
index deb36e7..cf76c77 100644
--- a/migration/src/jira_util.py
+++ b/migration/src/jira_util.py

@@ -205,6 +205,8 @@
     return res
 
 
+REGEX_EMBEDDED_IMAGE = r"!([^!\n]+)!"
+
 # space character + Jira emoji + space character
 JIRA_EMOJI_TO_UNICODE = {
     "(?<=\s)\(y\)((?=$)|(?=\s))": "\U0001F44D",
@@ -234,6 +236,23 @@
 REGEX_GITHUB_ISSUE_LINK = re.compile(r"(\s)(#\d+)(\s)")
 
 
+def extract_embedded_image_files(text: str, image_files: list[str]) -> set[str]:
+    """Return the attachment image files that are embedded in the given Jira text.
+
+    Jira embeds an image with ``!name!`` or ``!name|styles!``, see
+    https://jira.atlassian.com/secure/WikiRendererHelpAction.jspa?section=images
+
+    Each file name is searched for on its own instead of first splitting the
+    text into non-overlapping ``!...!`` spans: a stray exclamation mark in
+    ordinary prose ("Great! See !xyz.png!") would otherwise pair up with the
+    opening delimiter of the embed and hide it.
+
+    Args:
+        text: Jira markup (issue description or comment body). Empty or
+            missing text yields an empty set.
+        image_files: File names of the image attachments to look for.
+
+    Returns:
+        The subset of ``image_files`` referenced by an embed in ``text``.
+    """
+    if not text:
+        return set()
+    embedded_image_files = set()
+    for name in image_files:
+        # !xyz.png! or !xyz.png|styles! (styles never span a "!" or a newline)
+        if re.search(r"!" + re.escape(name) + r"(?:\|[^!\n]*)?!", text):
+            embedded_image_files.add(name)
+    return embedded_image_files
+
+
 def convert_text(text: str, att_replace_map: dict[str, str] = {}, account_map: dict[str, str] = {}, jira_users: dict[str, str] = {}) -> str:
     """Convert Jira markup to Markdown
     """
commit	ffda38df689ecc8a2fc8ccad97d31fceb0d2c936	[log] [tgz]
author	Tomoko Uchida <tomoko.uchida.1111@gmail.com>	Sun Aug 07 19:29:43 2022 +0900
committer	GitHub <noreply@github.com>	Sun Aug 07 06:29:43 2022 -0400
tree	cc597371c39125ea1edabae8cae7595a0beed33a
parent	c4c28c273302140626785ba2ecb106f1b3ccedbf [diff]