migration/src/jira2github_import.py - lucene-jira-archive - Git at Google

 #
 # Convert Jira issues to GitHub issues for Import Issues API (https://gist.github.com/jonmagic/5282384165e0f86ef105)
 # Usage:
 #   python src/jira2github_import.py --issues <jira issue number list> [--num-workers <# worker processes>]
 #   python src/jira2github_import.py --min <min issue number> --max <max issue number> [--num-workers <# worker processes>]
 #

 import argparse
 from logging import Logger
 from pathlib import Path
 import json
 import sys
 from urllib.parse import quote
 import dateutil.parser
 import os
 import traceback
 import multiprocessing

 from common import *
 from jira_util import *


 def attachment_url(issue_num: int, filename: str, att_repo: str, att_branch: str) -> str:
     """Return the raw.githubusercontent.com URL of an attachment file.

     The URL points into the ``attachments/`` tree of ``att_repo`` at
     ``att_branch``, under the sub-directory named by
     ``jira_issue_id(issue_num)``. The file name is percent-encoded with
     ``urllib.parse.quote`` so it is safe to embed in the URL path.
     """
     issue_dir = jira_issue_id(issue_num)
     encoded_name = quote(filename)
     base = f"https://raw.githubusercontent.com/{att_repo}/{att_branch}/attachments"
     return f"{base}/{issue_dir}/{encoded_name}"


 def jira_timestamp_to_github_timestamp(ts: str) -> str:
     """Convert a Jira timestamp to the UTC "Z" form accepted by GitHub.

     e.g., "2006-06-06T06:24:38.000+0000" -> "2006-06-06T06:24:38Z"

     The UTC offset of the input is honoured: a timestamp such as
     "2006-06-06T06:24:38.000+0900" is shifted to UTC instead of merely being
     relabelled with "Z". Input that does not match the expected
     "YYYY-MM-DDTHH:MM:SS.fff+hhmm" layout falls back to the previous
     behaviour of dropping the last nine characters and appending "Z".
     """
     # Local import: the file's import section does not import datetime explicitly.
     from datetime import datetime, timezone

     try:
         parsed = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f%z")
     except ValueError:
         # Unknown layout: keep the original best-effort conversion.
         return ts[:-9] + "Z"
     return parsed.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


 def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, logger: Logger) -> bool:
     """Convert one dumped Jira issue into a GitHub issue import data file.

     Loads the Jira dump for issue ``num``, builds an
     ``{"issue": {...}, "comments": [...]}`` structure and writes it as JSON
     to the path returned by ``github_data_file(output_dir, num)``.

     Args:
         num: Jira issue number.
         dump_dir: directory passed to ``jira_dump_file`` to locate the dump.
         output_dir: directory passed to ``github_data_file`` for the output.
         account_map: maps Jira user names to GitHub accounts; used for
             "@account" mentions and for the issue assignee.
         jira_users: passed through to ``convert_text``.
         att_repo: repository segment of the attachment URLs.
         att_branch: branch segment of the attachment URLs.
         logger: logger receiving warnings, conversion errors and debug output.

     Returns:
         False when the dump file does not exist, True after the data file
         has been written.
     """
     jira_id = jira_issue_id(num)
     dump_file = jira_dump_file(dump_dir, num)
     if not dump_file.exists():
         logger.warning(f"Jira dump file not found: {dump_file}")
         return False

     # The dump is only needed for parsing, so close it before the conversion work starts.
     with open(dump_file) as fp:
         o = json.load(fp)

     summary = extract_summary(o).strip()
     description = extract_description(o).strip()
     status = extract_status(o)
     issue_type = extract_issue_type(o)
     (reporter_name, reporter_dispname) = extract_reporter(o)
     # Only the assignee's account name is needed (to look up the GitHub assignee).
     (assignee_name, _) = extract_assignee(o)
     created = extract_created(o)
     updated = extract_updated(o)
     resolutiondate = extract_resolutiondate(o)
     fix_versions = extract_fixversions(o)
     versions = extract_versions(o)
     components = extract_components(o)
     attachments = extract_attachments(o)
     linked_issues = extract_issue_links(o)
     subtasks = extract_subtasks(o)
     pull_requests = extract_pull_requests(o)

     reporter_gh = account_map.get(reporter_name)
     reporter = f"{reporter_dispname} (@{reporter_gh})" if reporter_gh else f"{reporter_dispname}"
     assignee_gh = account_map.get(assignee_name)

     # make attachment list; the same URLs are handed to convert_text as a replacement map
     attachment_list_items = []
     att_replace_map = {}
     for (filename, cnt) in attachments:
         url = attachment_url(num, filename, att_repo, att_branch)
         versions_note = f" (versions: {cnt})" if cnt > 1 else ""
         attachment_list_items.append(f"[{filename}]({url}){versions_note}")
         att_replace_map[filename] = url

     # list linked issues and sub-tasks as links back to Jira
     linked_issues_list_items = [f"- [{jira_key}]({jira_issue_url(jira_key)})\n" for jira_key in linked_issues]
     subtasks_list_items = [f"- [{jira_key}]({jira_issue_url(jira_key)})\n" for jira_key in subtasks]

     created_datetime = dateutil.parser.parse(created)
     updated_datetime = dateutil.parser.parse(updated)
     resolutiondate_datetime = dateutil.parser.parse(resolutiondate) if resolutiondate is not None else None

     try:
         body = f'{convert_text(description, att_replace_map, account_map, jira_users)}\n\n'
     except Exception as e:
         logger.error(traceback.format_exc(limit=100))
         logger.error(f"Failed to convert opening issue description on {jira_issue_id(num)} due to above exception, ({str(e)}); falling back to original Jira description as code block.")
         logger.error(f"Original description: {description}")
         # The closing fence must sit on its own line, otherwise the code block
         # never closes and swallows the rest of the issue body.
         body = f"```\n{description}\n```\n\n"

     body += (
         "\n\n---\n### Legacy Jira details\n\n"
         f"[{jira_id}]({jira_issue_url(jira_id)}) by {reporter} on {created_datetime.strftime('%b %d %Y')}"
     )

     if resolutiondate_datetime is not None:
         body += f", resolved {resolutiondate_datetime.strftime('%b %d %Y')}"
     elif created_datetime.date() != updated_datetime.date():
         body += f", updated {updated_datetime.strftime('%b %d %Y')}"

     if attachment_list_items:
         body += f'\nAttachments: {", ".join(attachment_list_items)}'

     if linked_issues_list_items:
         body += f'\nLinked issues:\n {"".join(linked_issues_list_items)}'

     if subtasks_list_items:
         body += f'\nSub-tasks:\n {"".join(subtasks_list_items)}'

     if pull_requests:
         body += f'\nPull requests: {", ".join([str(x) for x in pull_requests])}'

     body += '\n'

     def comment_author(author_name, author_dispname):
         """Return the display name, followed by "(@account)" when a GitHub account is mapped."""
         author_gh = account_map.get(author_name)
         return f"{author_dispname} (@{author_gh})" if author_gh else author_dispname

     def enable_hyperlink_to_commit(comment_body: str):
         """Unwrap lines of the form "[ <url> ]" so the bare URL is rendered as a hyperlink."""
         # Imported here so this helper does not depend on `re` leaking in through the wildcard imports.
         import re

         lines = []
         for line in comment_body.split("\n"):
             # remove '[' and ']' iff it contains a URL (i.e. link to a commit in ASF GitBox repo).
             m = re.match(r"^\[\s?(https?://\S+)\s?\]$", line.strip())
             lines.append(m.group(1) if m else line)
         return "\n".join(lines)

     comments = extract_comments(o)
     comments_data = []
     for (comment_author_name, comment_author_dispname, comment_body, comment_created, comment_updated, comment_id) in comments:
         # TODO: since we now have accurate created_at reflected in the github comment, maybe we remove these
         #       timestamps?  also, if the account id mapped over to known GH account, we can drop Jira footer entirely?
         comment_created_datetime = dateutil.parser.parse(comment_created)
         comment_time = f'{comment_created_datetime.strftime("%b %d %Y")}'
         comment_updated_datetime = dateutil.parser.parse(comment_updated)
         if comment_updated_datetime.date() != comment_created_datetime.date():
             comment_time += f' [updated: {comment_updated_datetime.strftime("%b %d %Y")}]'
         try:
             comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users)}\n\n'
             # apply a special conversion for jira-bot's comments.
             # see https://github.com/apache/lucene-jira-archive/issues/54
             if comment_author_name == "jira-bot":
                 comment_body = enable_hyperlink_to_commit(comment_body)
         except Exception as e:
             logger.error(traceback.format_exc(limit=100))
             logger.error(f"Failed to convert comment on {jira_issue_id(num)} due to above exception ({str(e)}); falling back to original Jira comment as code block.")
             logger.error(f"Original text: {comment_body}")
             # Keep the closing fence on its own line so the footer below stays outside the code block.
             comment_body = f"```\n{comment_body}\n```\n\n"

         jira_comment_link = f'https://issues.apache.org/jira/browse/{jira_id}?focusedCommentId={comment_id}&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-{comment_id}'

         comment_body += f'[Legacy Jira: {comment_author(comment_author_name, comment_author_dispname)} on [{comment_time}]({jira_comment_link})]\n'
         comment_data = {
             "body": comment_body
         }
         if comment_created:
             comment_data["created_at"] = jira_timestamp_to_github_timestamp(comment_created)
         comments_data.append(comment_data)

     labels = []
     issue_type_label = ISSUE_TYPE_TO_LABEL_MAP.get(issue_type) if issue_type else None
     if issue_type_label:
         labels.append(issue_type_label)
     # milestone?
     labels.extend(f"fix-version:{v}" for v in fix_versions if v)
     labels.extend(f"affects-version:{v}" for v in versions if v)
     for c in components:
         if c.startswith("core"):
             labels.append(f"module:{c}")
         elif c in COMPONENT_TO_LABEL_MAP:
             # A component mapped to None is known but deliberately gets no label.
             component_label = COMPONENT_TO_LABEL_MAP[c]
             if component_label is not None:
                 labels.append(component_label)
         else:
             logger.error(f"Unknown Component: {c}")

     data = {
         "issue": {
             "title": make_github_title(summary, jira_id),
             "body": body,
             "closed": status in ["Closed", "Resolved"],
             "labels": labels,
         },
         "comments": comments_data
     }
     # Optional fields are only emitted when the source value is present.
     if created:
         data["issue"]["created_at"] = jira_timestamp_to_github_timestamp(created)
     if updated:
         data["issue"]["updated_at"] = jira_timestamp_to_github_timestamp(updated)
     if resolutiondate:
         data["issue"]["closed_at"] = jira_timestamp_to_github_timestamp(resolutiondate)
     if assignee_gh:
         data["issue"]["assignee"] = assignee_gh

     data_file = github_data_file(output_dir, num)
     with open(data_file, "w") as fp:
         json.dump(data, fp, indent=2)

     logger.debug(f"GitHub issue data created: {data_file}")
     return True


 if __name__ == "__main__":
     # `logging` is used below; import it explicitly rather than relying on the wildcard imports.
     import logging

     # Both environment variables are required to build attachment URLs.
     github_att_repo = os.getenv("GITHUB_ATT_REPO")
     if not github_att_repo:
         print("Please set your GitHub attachment repo to GITHUB_ATT_REPO environment variable.")
         sys.exit(1)
     github_att_branch = os.getenv("GITHUB_ATT_BRANCH")
     # Check the branch variable itself; an unset branch would otherwise end up as "None" in attachment URLs.
     if not github_att_branch:
         print("Please set your GitHub attachment branch to GITHUB_ATT_BRANCH environment variable.")
         sys.exit(1)

     parser = argparse.ArgumentParser()
     parser.add_argument('--issues', type=int, required=False, nargs='*', help='Jira issue number list to be converted')
     parser.add_argument('--min', type=int, dest='min', required=False, default=1, help='Minimum Jira issue number to be converted')
     parser.add_argument('--max', type=int, dest='max', required=False, help='Maximum Jira issue number to be converted')
     parser.add_argument('--num-workers', type=int, dest='num_workers', required=False, default=1, help='Number of worker processes')
     args = parser.parse_args()

     # All data directories are resolved relative to the parent of this script's directory.
     base_dir = Path(__file__).resolve().parent.parent

     dump_dir = base_dir.joinpath(JIRA_DUMP_DIRNAME)
     if not dump_dir.exists():
         print(f"Jira dump dir not exists: {dump_dir}")
         sys.exit(1)

     mappings_dir = base_dir.joinpath(MAPPINGS_DATA_DIRNAME)
     account_mapping_file = mappings_dir.joinpath(ACCOUNT_MAPPING_FILENAME)
     jira_users_file = mappings_dir.joinpath(JIRA_USERS_FILENAME)

     output_dir = base_dir.joinpath(GITHUB_IMPORT_DATA_DIRNAME)
     if not output_dir.exists():
         output_dir.mkdir()
     assert output_dir.exists()

     # Missing mapping files are tolerated; conversion then runs with empty maps.
     account_map = read_account_map(account_mapping_file) if account_mapping_file.exists() else {}
     jira_users = read_jira_users_map(jira_users_file) if jira_users_file.exists() else {}

     # An explicit --issues list wins; otherwise convert the range min..max, or just min when no max is given.
     if args.issues:
         issues = args.issues
     elif args.max:
         issues = list(range(args.min, args.max + 1))
     else:
         issues = [args.min]
     num_workers = args.num_workers

     log_dir = base_dir.joinpath(LOG_DIRNAME)
     name = "jira2github_import"
     (listener, queue) = log_listener(log_dir, name)
     listener.start()
     logging_setup_worker(queue)
     logger = logging.getLogger(name)

     logger.info(f"Converting Jira issues to GitHub issues in {output_dir}. num_workers={num_workers}")

     # NOTE(review): `task` only exists when this file runs as __main__. With the "spawn"
     # start method (the default on Windows) workers re-import the module without running
     # this block, so the function may not be resolvable there -- confirm on that platform.
     def task(num):
         """Convert a single issue, logging (rather than propagating) any failure."""
         logger = logging.getLogger(name)
         try:
             convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, logger)
         except Exception as e:
             logger.error(traceback.format_exc(limit=100))
             logger.error(f"Failed to convert Jira issue. An error '{str(e)}' occurred; skipped {jira_issue_id(num)}.")

     # Try to support Windows: The worker configuration is done at the start of the worker process run.
     with multiprocessing.Pool(num_workers, initializer=logging_setup_worker, initargs=(queue,)) as pool:
         results = [pool.apply_async(task, (num,)) for num in issues]
         # Wait for every task to finish before the pool is torn down.
         for res in results:
             res.get()

     logger.info("Done.")
     # Sentinel for the log listener so it shuts down; then wait for it to finish.
     queue.put_nowait(None)
     listener.join()
	#
	# Convert Jira issues to GitHub issues for Import Issues API (https://gist.github.com/jonmagic/5282384165e0f86ef105)
	# Usage:
	# python src/jira2github_import.py --issues <jira issue number list> [--num-workers <# worker processes>]
	# python src/jira2github_import.py --min <min issue number> --max <max issue number> [--num-workers <# worker processes>]
	#

	import argparse
	from logging import Logger
	from pathlib import Path
	import json
	import sys
	from urllib.parse import quote
	import dateutil.parser
	import os
	import traceback
	import multiprocessing

	from common import *
	from jira_util import *


	def attachment_url(issue_num: int, filename: str, att_repo: str, att_branch: str) -> str:
	    """Return the raw.githubusercontent.com URL of an attachment file.

	    The URL points into the ``attachments/`` tree of ``att_repo`` at
	    ``att_branch``, under the sub-directory named by
	    ``jira_issue_id(issue_num)``. The file name is percent-encoded with
	    ``urllib.parse.quote`` so it is safe to embed in the URL path.
	    """
	    issue_dir = jira_issue_id(issue_num)
	    encoded_name = quote(filename)
	    base = f"https://raw.githubusercontent.com/{att_repo}/{att_branch}/attachments"
	    return f"{base}/{issue_dir}/{encoded_name}"


	def jira_timestamp_to_github_timestamp(ts: str) -> str:
	    """Convert a Jira timestamp to the UTC "Z" form accepted by GitHub.

	    e.g., "2006-06-06T06:24:38.000+0000" -> "2006-06-06T06:24:38Z"

	    The UTC offset of the input is honoured: a timestamp such as
	    "2006-06-06T06:24:38.000+0900" is shifted to UTC instead of merely being
	    relabelled with "Z". Input that does not match the expected
	    "YYYY-MM-DDTHH:MM:SS.fff+hhmm" layout falls back to the previous
	    behaviour of dropping the last nine characters and appending "Z".
	    """
	    # Local import: the file's import section does not import datetime explicitly.
	    from datetime import datetime, timezone

	    try:
	        parsed = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f%z")
	    except ValueError:
	        # Unknown layout: keep the original best-effort conversion.
	        return ts[:-9] + "Z"
	    return parsed.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


	def convert_issue(num: int, dump_dir: Path, output_dir: Path, account_map: dict[str, str], jira_users: dict[str, str], att_repo: str, att_branch: str, logger: Logger) -> bool:
	    """Convert one dumped Jira issue into a GitHub issue import data file.

	    Loads the Jira dump for issue ``num``, builds an
	    ``{"issue": {...}, "comments": [...]}`` structure and writes it as JSON
	    to the path returned by ``github_data_file(output_dir, num)``.

	    Args:
	        num: Jira issue number.
	        dump_dir: directory passed to ``jira_dump_file`` to locate the dump.
	        output_dir: directory passed to ``github_data_file`` for the output.
	        account_map: maps Jira user names to GitHub accounts; used for
	            "@account" mentions and for the issue assignee.
	        jira_users: passed through to ``convert_text``.
	        att_repo: repository segment of the attachment URLs.
	        att_branch: branch segment of the attachment URLs.
	        logger: logger receiving warnings, conversion errors and debug output.

	    Returns:
	        False when the dump file does not exist, True after the data file
	        has been written.
	    """
	    jira_id = jira_issue_id(num)
	    dump_file = jira_dump_file(dump_dir, num)
	    if not dump_file.exists():
	        logger.warning(f"Jira dump file not found: {dump_file}")
	        return False

	    # The dump is only needed for parsing, so close it before the conversion work starts.
	    with open(dump_file) as fp:
	        o = json.load(fp)

	    summary = extract_summary(o).strip()
	    description = extract_description(o).strip()
	    status = extract_status(o)
	    issue_type = extract_issue_type(o)
	    (reporter_name, reporter_dispname) = extract_reporter(o)
	    # Only the assignee's account name is needed (to look up the GitHub assignee).
	    (assignee_name, _) = extract_assignee(o)
	    created = extract_created(o)
	    updated = extract_updated(o)
	    resolutiondate = extract_resolutiondate(o)
	    fix_versions = extract_fixversions(o)
	    versions = extract_versions(o)
	    components = extract_components(o)
	    attachments = extract_attachments(o)
	    linked_issues = extract_issue_links(o)
	    subtasks = extract_subtasks(o)
	    pull_requests = extract_pull_requests(o)

	    reporter_gh = account_map.get(reporter_name)
	    reporter = f"{reporter_dispname} (@{reporter_gh})" if reporter_gh else f"{reporter_dispname}"
	    assignee_gh = account_map.get(assignee_name)

	    # make attachment list; the same URLs are handed to convert_text as a replacement map
	    attachment_list_items = []
	    att_replace_map = {}
	    for (filename, cnt) in attachments:
	        url = attachment_url(num, filename, att_repo, att_branch)
	        versions_note = f" (versions: {cnt})" if cnt > 1 else ""
	        attachment_list_items.append(f"[{filename}]({url}){versions_note}")
	        att_replace_map[filename] = url

	    # list linked issues and sub-tasks as links back to Jira
	    linked_issues_list_items = [f"- [{jira_key}]({jira_issue_url(jira_key)})\n" for jira_key in linked_issues]
	    subtasks_list_items = [f"- [{jira_key}]({jira_issue_url(jira_key)})\n" for jira_key in subtasks]

	    created_datetime = dateutil.parser.parse(created)
	    updated_datetime = dateutil.parser.parse(updated)
	    resolutiondate_datetime = dateutil.parser.parse(resolutiondate) if resolutiondate is not None else None

	    try:
	        body = f'{convert_text(description, att_replace_map, account_map, jira_users)}\n\n'
	    except Exception as e:
	        logger.error(traceback.format_exc(limit=100))
	        logger.error(f"Failed to convert opening issue description on {jira_issue_id(num)} due to above exception, ({str(e)}); falling back to original Jira description as code block.")
	        logger.error(f"Original description: {description}")
	        # The closing fence must sit on its own line, otherwise the code block
	        # never closes and swallows the rest of the issue body.
	        body = f"```\n{description}\n```\n\n"

	    body += (
	        "\n\n---\n### Legacy Jira details\n\n"
	        f"[{jira_id}]({jira_issue_url(jira_id)}) by {reporter} on {created_datetime.strftime('%b %d %Y')}"
	    )

	    if resolutiondate_datetime is not None:
	        body += f", resolved {resolutiondate_datetime.strftime('%b %d %Y')}"
	    elif created_datetime.date() != updated_datetime.date():
	        body += f", updated {updated_datetime.strftime('%b %d %Y')}"

	    if attachment_list_items:
	        body += f'\nAttachments: {", ".join(attachment_list_items)}'

	    if linked_issues_list_items:
	        body += f'\nLinked issues:\n {"".join(linked_issues_list_items)}'

	    if subtasks_list_items:
	        body += f'\nSub-tasks:\n {"".join(subtasks_list_items)}'

	    if pull_requests:
	        body += f'\nPull requests: {", ".join([str(x) for x in pull_requests])}'

	    body += '\n'

	    def comment_author(author_name, author_dispname):
	        """Return the display name, followed by "(@account)" when a GitHub account is mapped."""
	        author_gh = account_map.get(author_name)
	        return f"{author_dispname} (@{author_gh})" if author_gh else author_dispname

	    def enable_hyperlink_to_commit(comment_body: str):
	        """Unwrap lines of the form "[ <url> ]" so the bare URL is rendered as a hyperlink."""
	        # Imported here so this helper does not depend on `re` leaking in through the wildcard imports.
	        import re

	        lines = []
	        for line in comment_body.split("\n"):
	            # remove '[' and ']' iff it contains a URL (i.e. link to a commit in ASF GitBox repo).
	            m = re.match(r"^\[\s?(https?://\S+)\s?\]$", line.strip())
	            lines.append(m.group(1) if m else line)
	        return "\n".join(lines)

	    comments = extract_comments(o)
	    comments_data = []
	    for (comment_author_name, comment_author_dispname, comment_body, comment_created, comment_updated, comment_id) in comments:
	        # TODO: since we now have accurate created_at reflected in the github comment, maybe we remove these
	        #       timestamps?  also, if the account id mapped over to known GH account, we can drop Jira footer entirely?
	        comment_created_datetime = dateutil.parser.parse(comment_created)
	        comment_time = f'{comment_created_datetime.strftime("%b %d %Y")}'
	        comment_updated_datetime = dateutil.parser.parse(comment_updated)
	        if comment_updated_datetime.date() != comment_created_datetime.date():
	            comment_time += f' [updated: {comment_updated_datetime.strftime("%b %d %Y")}]'
	        try:
	            comment_body = f'{convert_text(comment_body, att_replace_map, account_map, jira_users)}\n\n'
	            # apply a special conversion for jira-bot's comments.
	            # see https://github.com/apache/lucene-jira-archive/issues/54
	            if comment_author_name == "jira-bot":
	                comment_body = enable_hyperlink_to_commit(comment_body)
	        except Exception as e:
	            logger.error(traceback.format_exc(limit=100))
	            logger.error(f"Failed to convert comment on {jira_issue_id(num)} due to above exception ({str(e)}); falling back to original Jira comment as code block.")
	            logger.error(f"Original text: {comment_body}")
	            # Keep the closing fence on its own line so the footer below stays outside the code block.
	            comment_body = f"```\n{comment_body}\n```\n\n"

	        jira_comment_link = f'https://issues.apache.org/jira/browse/{jira_id}?focusedCommentId={comment_id}&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-{comment_id}'

	        comment_body += f'[Legacy Jira: {comment_author(comment_author_name, comment_author_dispname)} on [{comment_time}]({jira_comment_link})]\n'
	        comment_data = {
	            "body": comment_body
	        }
	        if comment_created:
	            comment_data["created_at"] = jira_timestamp_to_github_timestamp(comment_created)
	        comments_data.append(comment_data)

	    labels = []
	    issue_type_label = ISSUE_TYPE_TO_LABEL_MAP.get(issue_type) if issue_type else None
	    if issue_type_label:
	        labels.append(issue_type_label)
	    # milestone?
	    labels.extend(f"fix-version:{v}" for v in fix_versions if v)
	    labels.extend(f"affects-version:{v}" for v in versions if v)
	    for c in components:
	        if c.startswith("core"):
	            labels.append(f"module:{c}")
	        elif c in COMPONENT_TO_LABEL_MAP:
	            # A component mapped to None is known but deliberately gets no label.
	            component_label = COMPONENT_TO_LABEL_MAP[c]
	            if component_label is not None:
	                labels.append(component_label)
	        else:
	            logger.error(f"Unknown Component: {c}")

	    data = {
	        "issue": {
	            "title": make_github_title(summary, jira_id),
	            "body": body,
	            "closed": status in ["Closed", "Resolved"],
	            "labels": labels,
	        },
	        "comments": comments_data
	    }
	    # Optional fields are only emitted when the source value is present.
	    if created:
	        data["issue"]["created_at"] = jira_timestamp_to_github_timestamp(created)
	    if updated:
	        data["issue"]["updated_at"] = jira_timestamp_to_github_timestamp(updated)
	    if resolutiondate:
	        data["issue"]["closed_at"] = jira_timestamp_to_github_timestamp(resolutiondate)
	    if assignee_gh:
	        data["issue"]["assignee"] = assignee_gh

	    data_file = github_data_file(output_dir, num)
	    with open(data_file, "w") as fp:
	        json.dump(data, fp, indent=2)

	    logger.debug(f"GitHub issue data created: {data_file}")
	    return True


	if __name__ == "__main__":
	    # `logging` is used below; import it explicitly rather than relying on the wildcard imports.
	    import logging

	    # Both environment variables are required to build attachment URLs.
	    github_att_repo = os.getenv("GITHUB_ATT_REPO")
	    if not github_att_repo:
	        print("Please set your GitHub attachment repo to GITHUB_ATT_REPO environment variable.")
	        sys.exit(1)
	    github_att_branch = os.getenv("GITHUB_ATT_BRANCH")
	    # Check the branch variable itself; an unset branch would otherwise end up as "None" in attachment URLs.
	    if not github_att_branch:
	        print("Please set your GitHub attachment branch to GITHUB_ATT_BRANCH environment variable.")
	        sys.exit(1)

	    parser = argparse.ArgumentParser()
	    parser.add_argument('--issues', type=int, required=False, nargs='*', help='Jira issue number list to be converted')
	    parser.add_argument('--min', type=int, dest='min', required=False, default=1, help='Minimum Jira issue number to be converted')
	    parser.add_argument('--max', type=int, dest='max', required=False, help='Maximum Jira issue number to be converted')
	    parser.add_argument('--num-workers', type=int, dest='num_workers', required=False, default=1, help='Number of worker processes')
	    args = parser.parse_args()

	    # All data directories are resolved relative to the parent of this script's directory.
	    base_dir = Path(__file__).resolve().parent.parent

	    dump_dir = base_dir.joinpath(JIRA_DUMP_DIRNAME)
	    if not dump_dir.exists():
	        print(f"Jira dump dir not exists: {dump_dir}")
	        sys.exit(1)

	    mappings_dir = base_dir.joinpath(MAPPINGS_DATA_DIRNAME)
	    account_mapping_file = mappings_dir.joinpath(ACCOUNT_MAPPING_FILENAME)
	    jira_users_file = mappings_dir.joinpath(JIRA_USERS_FILENAME)

	    output_dir = base_dir.joinpath(GITHUB_IMPORT_DATA_DIRNAME)
	    if not output_dir.exists():
	        output_dir.mkdir()
	    assert output_dir.exists()

	    # Missing mapping files are tolerated; conversion then runs with empty maps.
	    account_map = read_account_map(account_mapping_file) if account_mapping_file.exists() else {}
	    jira_users = read_jira_users_map(jira_users_file) if jira_users_file.exists() else {}

	    # An explicit --issues list wins; otherwise convert the range min..max, or just min when no max is given.
	    if args.issues:
	        issues = args.issues
	    elif args.max:
	        issues = list(range(args.min, args.max + 1))
	    else:
	        issues = [args.min]
	    num_workers = args.num_workers

	    log_dir = base_dir.joinpath(LOG_DIRNAME)
	    name = "jira2github_import"
	    (listener, queue) = log_listener(log_dir, name)
	    listener.start()
	    logging_setup_worker(queue)
	    logger = logging.getLogger(name)

	    logger.info(f"Converting Jira issues to GitHub issues in {output_dir}. num_workers={num_workers}")

	    # NOTE(review): `task` only exists when this file runs as __main__. With the "spawn"
	    # start method (the default on Windows) workers re-import the module without running
	    # this block, so the function may not be resolvable there -- confirm on that platform.
	    def task(num):
	        """Convert a single issue, logging (rather than propagating) any failure."""
	        logger = logging.getLogger(name)
	        try:
	            convert_issue(num, dump_dir, output_dir, account_map, jira_users, github_att_repo, github_att_branch, logger)
	        except Exception as e:
	            logger.error(traceback.format_exc(limit=100))
	            logger.error(f"Failed to convert Jira issue. An error '{str(e)}' occurred; skipped {jira_issue_id(num)}.")

	    # Try to support Windows: The worker configuration is done at the start of the worker process run.
	    with multiprocessing.Pool(num_workers, initializer=logging_setup_worker, initargs=(queue,)) as pool:
	        results = [pool.apply_async(task, (num,)) for num in issues]
	        # Wait for every task to finish before the pool is torn down.
	        for res in results:
	            res.get()

	    logger.info("Done.")
	    # Sentinel for the log listener so it shuts down; then wait for it to finish.
	    queue.put_nowait(None)
	    listener.join()