Embed attachments' text data for known file types (#136)
* embed attachment data for known file types
* add comments
* add comments
* remove trash files
* add comments about known extensions
* add missing ()
* open by default for small files < 5k bytes
diff --git a/migration/src/jira2github_import.py b/migration/src/jira2github_import.py
index 5ba69c0..3eb9234 100644
--- a/migration/src/jira2github_import.py
+++ b/migration/src/jira2github_import.py
@@ -30,7 +30,7 @@
return ts[:-9] + "Z"
-def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, logger: Logger) -> bool:
+def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, att_dir: Optional[Path], logger: Logger) -> bool:
jira_id = jira_issue_id(num)
dump_file = jira_dump_file(dump_dir, num)
if not dump_file.exists():
@@ -101,7 +101,7 @@
resolutiondate_datetime = None
try:
- body = f'{convert_text(description, att_replace_map, account_map, jira_users)}\n\n'
+ body = f'{convert_text(description, att_replace_map, account_map, jira_users, att_dir)}\n\n'
for image_file in unmentioned_images:
# show orphaned (unmentioned) image files in the issue description
att_url = att_replace_map.get(image_file)
@@ -172,7 +172,7 @@
if comment_updated_datetime.date() != comment_created_datetime.date():
comment_time += f' [updated: {comment_updated_datetime.strftime("%b %d %Y")}]'
try:
- comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users)}\n\n'
+ comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users, att_dir)}\n\n'
# apply a special conversion for jira-bot's comments.
# see https://github.com/apache/lucene-jira-archive/issues/54
if comment_author_name == "jira-bot":
@@ -311,7 +311,9 @@
def task(num):
logger = logging.getLogger(name)
try:
- convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, logger)
+ # if attachment files are available under $JIRA_ATTACHMENTS_DIRPATH, text attachments' data may be embedded in the issue description and comments.
+ att_dir = Path(JIRA_ATTACHMENTS_DIRPATH).joinpath(jira_issue_id(num)) if JIRA_ATTACHMENTS_DIRPATH else None
+ convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, att_dir, logger)
except Exception as e:
logger.error(traceback.format_exc(limit=100))
logger.error(f"Failed to convert Jira issue. An error '{str(e)}' occurred; skipped {jira_issue_id(num)}.")
diff --git a/migration/src/jira_util.py b/migration/src/jira_util.py
index d02c974..985ebfb 100644
--- a/migration/src/jira_util.py
+++ b/migration/src/jira_util.py
@@ -1,3 +1,4 @@
+from pathlib import Path
import re
import itertools
from dataclasses import dataclass
@@ -236,6 +237,32 @@
REGEX_GITHUB_ISSUE_LINK = re.compile(r"(\s)(#\d+)(\s)")
+# Maps common Lucene Jira attachment extensions (each appears at least three times in Lucene Jira) to the code-fence language tag used when embedding the file.
+# The lookup in convert_text uses path.suffix as-is (case-sensitive), hence the separate ".PATCH" entry.
+FILE_EXT_TO_LANG = {
+ ".patch": "diff",
+ ".PATCH": "diff",
+ ".pat": "diff",
+ ".diff": "diff",
+ ".java": "java",
+ ".jj": "java",
+ ".jflex": "java", # text?
+ ".txt": "text",
+ ".log": "text",
+ ".out": "text",
+ ".alg": "text",
+ ".perf": "text",
+ ".benchmark": "text",
+ ".test": "text",
+ ".py": "python",
+ ".html": "html",
+ ".xml": "xml",
+ ".sh": "sh",
+ ".json": "json",
+ ".jsp": "jsp",
+ ".properties": "ini"
+}
+
def extract_embedded_image_files(text: str, image_files: list[str]) -> set[str]:
"""Extract embedded image files in the given text.
https://jira.atlassian.com/secure/WikiRendererHelpAction.jspa?section=images
@@ -253,7 +280,7 @@
return embedded_image_files
-def convert_text(text: str, att_replace_map: dict[str, str] = {}, account_map: dict[str, str] = {}, jira_users: dict[str, str] = {}) -> str:
+def convert_text(text: str, att_replace_map: dict[str, str] = {}, account_map: dict[str, str] = {}, jira_users: dict[str, str] = {}, att_dir: Optional[Path] = None) -> str:
"""Convert Jira markup to Markdown
"""
def repl_att(m: re.Match):
@@ -318,6 +345,28 @@
# escape github style cross-issue link (#NN)
text = re.sub(REGEX_GITHUB_ISSUE_LINK, escape_gh_issue_link, text)
+ # embed the first linked attachment (patch, etc.) that exists under att_dir, if any
+ links = re.findall(REGEX_LINK, text)
+ if links and att_dir:
+ paths = list(filter(lambda p: p.exists(), (att_dir.joinpath(x[0]) for x in links)))
+ if paths:
+ path = paths[0]
+ # skip unknown file extensions; skip files of 50,000 bytes or more.
+ if path.suffix in FILE_EXT_TO_LANG and path.stat().st_size < 50000:
+ text += __textdata_as_details(path, FILE_EXT_TO_LANG[path.suffix])
+
+ return text
+
+
+def __textdata_as_details(path: Path, lang: str) -> str:  # render the file at `path` as a Markdown <details> block holding a fenced code block tagged with `lang`
+ assert path.exists()  # NOTE(review): assert is stripped under `python -O`; the visible caller already filters on exists()
+ name = path.name  # file name, shown as the <summary> label
+ att_open = "open" if path.stat().st_size < 5000 else ""  # files smaller than 5000 bytes are rendered expanded
+ with open(path) as fp:  # NOTE(review): no explicit encoding is given, so decoding uses the platform default; confirm attachments always decode
+ data = fp.read()
+ # wrap the text in <details> markup so that it is collapsed by default (expanded only when att_open is "open")
+ # https://gist.github.com/pierrejoubert73/902cc94d79424356a8d20be2b382e1ab
+ text = f"\n<details {att_open}>\n<summary>{name}</summary>\n\n```{lang}\n{data}\n```\n\n</details>\n\n"  # NOTE(review): emits "<details >" when att_open is empty; a ``` inside data would presumably end the fence early - confirm
return text