Embed attachments' text data for known file types (#136)

* embed attachment data for known file types
* add comments
* add comments
* remove trash files
* add comments about known extensions
* add missing ()
* open by default for small files < 5k bytes

commit: 1944b364a794a72f98ec8fe3162295317f171291 [log] [tgz]
author: Tomoko Uchida <tomoko.uchida.1111@gmail.com> Mon Aug 08 19:03:46 2022 +0900
committer: GitHub <noreply@github.com> Mon Aug 08 06:03:46 2022 -0400
tree: 63cf82c696977f6a82a34ddbb27e8858aa8bd22a
parent: 7dd8583b6ab8b192c9c194a225c83b401cd58c55 [diff]
diff --git a/migration/src/jira2github_import.py b/migration/src/jira2github_import.py
index 5ba69c0..3eb9234 100644
--- a/migration/src/jira2github_import.py
+++ b/migration/src/jira2github_import.py

@@ -30,7 +30,7 @@
     return ts[:-9] + "Z"
 
 
-def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, logger: Logger) -> bool:
+def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, att_dir: Optional[Path], logger: Logger) -> bool:
     jira_id = jira_issue_id(num)
     dump_file = jira_dump_file(dump_dir, num)
     if not dump_file.exists():
@@ -101,7 +101,7 @@
             resolutiondate_datetime = None
 
         try:
-            body = f'{convert_text(description, att_replace_map, account_map, jira_users)}\n\n'
+            body = f'{convert_text(description, att_replace_map, account_map, jira_users, att_dir)}\n\n'
             for image_file in unmentioned_images:
                 # show orphaned (unmentioned) image files in the issue description
                 att_url = att_replace_map.get(image_file)
@@ -172,7 +172,7 @@
             if comment_updated_datetime.date() != comment_created_datetime.date():
                 comment_time += f' [updated: {comment_updated_datetime.strftime("%b %d %Y")}]'
             try:
-                comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users)}\n\n'
+                comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users, att_dir)}\n\n'
                 # apply a special conversion for jira-bot's comments.
                 # see https://github.com/apache/lucene-jira-archive/issues/54
                 if comment_author_name == "jira-bot":
@@ -311,7 +311,9 @@
     def task(num):
         logger = logging.getLogger(name)
         try:
-            convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, logger)
+            # if attachment files exist under $JIRA_ATTACHMENTS_DIRPATH, text attachments' data may be embedded in the issue description and comments.
+            att_dir = Path(JIRA_ATTACHMENTS_DIRPATH).joinpath(jira_issue_id(num)) if JIRA_ATTACHMENTS_DIRPATH else None
+            convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, att_dir, logger)
         except Exception as e:
             logger.error(traceback.format_exc(limit=100))
             logger.error(f"Failed to convert Jira issue. An error '{str(e)}' occurred; skipped {jira_issue_id(num)}.")

diff --git a/migration/src/jira_util.py b/migration/src/jira_util.py
index d02c974..985ebfb 100644
--- a/migration/src/jira_util.py
+++ b/migration/src/jira_util.py

@@ -1,3 +1,4 @@
+from pathlib import Path
 import re
 import itertools
 from dataclasses import dataclass
@@ -236,6 +237,32 @@
 REGEX_GITHUB_ISSUE_LINK = re.compile(r"(\s)(#\d+)(\s)")
 
 
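+# common file extensions in Lucene Jira attachments, mapped to the language tag used for the embedded code fence.
+# these extensions appear at least three times in Lucene Jira; the lookup is an exact (case-sensitive) match.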
+FILE_EXT_TO_LANG = {
+    ".patch": "diff",
+    ".PATCH": "diff",
+    ".pat": "diff",
+    ".diff": "diff",
+    ".java": "java",
+    ".jj": "java",
+    ".jflex": "java",  # text?
+    ".txt": "text",
+    ".log": "text",
+    ".out": "text",
+    ".alg": "text",
+    ".perf": "text",
+    ".benchmark": "text",
+    ".test": "text",
+    ".py": "python",
+    ".html": "html",
+    ".xml": "xml",
+    ".sh": "sh",
+    ".json": "json",
+    ".jsp": "jsp",
+    ".properties": "ini"
+}
+
 def extract_embedded_image_files(text: str, image_files: list[str]) -> set[str]:
     """Extract embedded image files in the given text.
     https://jira.atlassian.com/secure/WikiRendererHelpAction.jspa?section=images
@@ -253,7 +280,7 @@
     return embedded_image_files
 
 
-def convert_text(text: str, att_replace_map: dict[str, str] = {}, account_map: dict[str, str] = {}, jira_users: dict[str, str] = {}) -> str:
+def convert_text(text: str, att_replace_map: dict[str, str] = {}, account_map: dict[str, str] = {}, jira_users: dict[str, str] = {}, att_dir: Optional[Path] = None) -> str:
     """Convert Jira markup to Markdown
     """
     def repl_att(m: re.Match):
@@ -318,6 +345,28 @@
     # escape github style cross-issue link (#NN)
     text = re.sub(REGEX_GITHUB_ISSUE_LINK, escape_gh_issue_link, text)
 
+    # embed attachments (patches, etc.) if possible
+    links = re.findall(REGEX_LINK, text)
+    if links and att_dir:
+        paths = list(filter(lambda p: p.exists(), (att_dir.joinpath(x[0]) for x in links)))
+        if paths:
+            path = paths[0]
+            # skip files with unknown extensions; skip files of 50000 bytes or more.
+            if path.suffix in FILE_EXT_TO_LANG and path.stat().st_size < 50000:
+                text += __textdata_as_details(path, FILE_EXT_TO_LANG[path.suffix])
+
+    return text
+
+
+def __textdata_as_details(path: Path, lang: str) -> str:
+    assert path.exists()
+    name = path.name
+    att_open = "open" if path.stat().st_size < 5000 else ""
+    with open(path) as fp:
+        data = fp.read()
+        # use <details> markup so that long texts are collapsed by default (files under 5000 bytes are rendered open)
+        # https://gist.github.com/pierrejoubert73/902cc94d79424356a8d20be2b382e1ab
+        text = f"\n<details {att_open}>\n<summary>{name}</summary>\n\n```{lang}\n{data}\n```\n\n</details>\n\n"
     return text
commit	1944b364a794a72f98ec8fe3162295317f171291	[log] [tgz]
author	Tomoko Uchida <tomoko.uchida.1111@gmail.com>	Mon Aug 08 19:03:46 2022 +0900
committer	GitHub <noreply@github.com>	Mon Aug 08 06:03:46 2022 -0400
tree	63cf82c696977f6a82a34ddbb27e8858aa8bd22a
parent	7dd8583b6ab8b192c9c194a225c83b401cd58c55 [diff]