dev/create-release/generate-contributors.py - spark - Git at Google

 #!/usr/bin/env python3

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # This script automates the process of creating release notes.

 import os
 import re
 import sys

 from releaseutils import tag_exists, get_commits, yesOrNoPrompt, get_date, \
     is_valid_author, capitalize_author, JIRA, find_components, translate_issue_type, \
     translate_component, CORE_COMPONENT, contributors_file_name, nice_join

 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
 RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2")
 PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")

 # If the release tags are not provided, prompt the user to provide them
 while not tag_exists(RELEASE_TAG):
     RELEASE_TAG = input("Please provide a valid release tag: ")
 while not tag_exists(PREVIOUS_RELEASE_TAG):
     print("Please specify the previous release tag.")
     PREVIOUS_RELEASE_TAG = input(
         "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")

 # Gather commits found in the new tag but not in the old tag.
 # This filters commits based on both the git hash and the PR number.
 # If either is present in the old tag, then we ignore the commit.
 print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
 release_commits = get_commits(RELEASE_TAG)
 previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
 previous_release_hashes = set()
 previous_release_prs = set()
 for old_commit in previous_release_commits:
     previous_release_hashes.add(old_commit.get_hash())
     if old_commit.get_pr_number():
         previous_release_prs.add(old_commit.get_pr_number())
 new_commits = []
 for this_commit in release_commits:
     this_hash = this_commit.get_hash()
     this_pr_number = this_commit.get_pr_number()
     if this_hash in previous_release_hashes:
         continue
     if this_pr_number and this_pr_number in previous_release_prs:
         continue
     new_commits.append(this_commit)
 if not new_commits:
     sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))

 # Prompt the user for confirmation that the commit range is correct
 print("\n==================================================================================")
 print("JIRA server: %s" % JIRA_API_BASE)
 print("Release tag: %s" % RELEASE_TAG)
 print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
 print("Number of commits in this range: %s" % len(new_commits))
 print("")


 def print_indented(_list):
     for x in _list:
         print("  %s" % x)
 if yesOrNoPrompt("Show all commits?"):
     print_indented(new_commits)
 print("==================================================================================\n")
 if not yesOrNoPrompt("Does this look correct?"):
     sys.exit("Ok, exiting")

 # Filter out special commits
 releases = []
 maintenance = []
 reverts = []
 nojiras = []
 filtered_commits = []


 def is_release(commit_title):
     return ("[release]" in commit_title.lower() or
             "preparing spark release" in commit_title.lower() or
             "preparing development version" in commit_title.lower() or
             "CHANGES.txt" in commit_title)


 def is_maintenance(commit_title):
     return "maintenance" in commit_title.lower() or \
         "manually close" in commit_title.lower()


 def has_no_jira(commit_title):
     return not re.findall("SPARK-[0-9]+", commit_title.upper())


 def is_revert(commit_title):
     return "revert" in commit_title.lower()


 def is_docs(commit_title):
     return re.findall("docs*", commit_title.lower()) or \
         "programming guide" in commit_title.lower()


 for c in new_commits:
     t = c.get_title()
     if not t:
         continue
     elif is_release(t):
         releases.append(c)
     elif is_maintenance(t):
         maintenance.append(c)
     elif is_revert(t):
         reverts.append(c)
     elif is_docs(t):
         filtered_commits.append(c)  # docs may not have JIRA numbers
     elif has_no_jira(t):
         nojiras.append(c)
     else:
         filtered_commits.append(c)

 # Warn against ignored commits
 if releases or maintenance or reverts or nojiras:
     print("\n==================================================================================")
     if releases:
         print("Found %d release commits" % len(releases))
     if maintenance:
         print("Found %d maintenance commits" % len(maintenance))
     if reverts:
         print("Found %d revert commits" % len(reverts))
     if nojiras:
         print("Found %d commits with no JIRA" % len(nojiras))
     print("* Warning: these commits will be ignored.\n")
     if yesOrNoPrompt("Show ignored commits?"):
         if releases:
             print("Release (%d)" % len(releases))
             print_indented(releases)
         if maintenance:
             print("Maintenance (%d)" % len(maintenance))
             print_indented(maintenance)
         if reverts:
             print("Revert (%d)" % len(reverts))
             print_indented(reverts)
         if nojiras:
             print("No JIRA (%d)" % len(nojiras))
             print_indented(nojiras)
     print("==================== Warning: the above commits will be ignored ==================\n")
 prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
 if not yesOrNoPrompt(prompt_msg):
     sys.exit("Ok, exiting.")

 # Keep track of warnings to tell the user at the end
 warnings = []

 # Mapping from the invalid author name to its associated JIRA issues
 # E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
 invalid_authors = {}

 # Populate a map that groups issues and components by author
 # It takes the form: Author name -> { Contribution type -> Spark components }
 # For instance,
 # {
 #   'Andrew Or': {
 #     'bug fixes': ['windows', 'core', 'web ui'],
 #     'improvements': ['core']
 #   },
 #   'Tathagata Das' : {
 #     'bug fixes': ['streaming']
 #     'new feature': ['streaming']
 #   }
 # }
 #
 author_info = {}
 jira_options = {"server": JIRA_API_BASE}
 jira_client = JIRA(options=jira_options)
 print("\n=========================== Compiling contributor list ===========================")
 for commit in filtered_commits:
     _hash = commit.get_hash()
     title = commit.get_title()
     issues = re.findall("SPARK-[0-9]+", title.upper())
     author = commit.get_author()
     date = get_date(_hash)
     # If the author name is invalid, keep track of it along
     # with all associated issues so we can translate it later
     if is_valid_author(author):
         author = capitalize_author(author)
     else:
         if author not in invalid_authors:
             invalid_authors[author] = set()
         for issue in issues:
             invalid_authors[author].add(issue)
     # Parse components from the commit title, if any
     commit_components = find_components(title, _hash)
     # Populate or merge an issue into author_info[author]

     def populate(issue_type, components):
         components = components or [CORE_COMPONENT]  # assume core if no components provided
         if author not in author_info:
             author_info[author] = {}
         if issue_type not in author_info[author]:
             author_info[author][issue_type] = set()
         for component in components:
             author_info[author][issue_type].add(component)
     # Find issues and components associated with this commit
     for issue in issues:
         try:
             jira_issue = jira_client.issue(issue)
             jira_type = jira_issue.fields.issuetype.name
             jira_type = translate_issue_type(jira_type, issue, warnings)
             jira_components = [translate_component(c.name, _hash, warnings)
                                for c in jira_issue.fields.components]
             all_components = set(jira_components + commit_components)
             populate(jira_type, all_components)
         except Exception as e:
             print("Unexpected error:", e)
     # For docs without an associated JIRA, manually add it ourselves
     if is_docs(title) and not issues:
         populate("documentation", commit_components)
     print("  Processed commit %s authored by %s on %s" % (_hash, author, date))
 print("==================================================================================\n")

 # Write to contributors file ordered by author names
 # Each line takes the format " * Author name -- semi-colon delimited contributions"
 # e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core
 # e.g. * Tathagata Das -- Bug fixes and new features in Streaming
 contributors_file = open(contributors_file_name, "w")
 authors = list(author_info.keys())
 authors.sort()
 for author in authors:
     contribution = ""
     components = set()
     issue_types = set()
     for issue_type, comps in author_info[author].items():
         components.update(comps)
         issue_types.add(issue_type)
     # If there is only one component, mention it only once
     # e.g. Bug fixes, improvements in MLlib
     if len(components) == 1:
         contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
     # Otherwise, group contributions by issue types instead of modules
     # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
     else:
         contributions = ["%s in %s" % (issue_type, nice_join(comps))
                          for issue_type, comps in author_info[author].items()]
         contribution = "; ".join(contributions)
     # Do not use python's capitalize() on the whole string to preserve case
     assert contribution
     contribution = contribution[0].capitalize() + contribution[1:]
     # If the author name is invalid, use an intermediate format that
     # can be translated through translate-contributors.py later
     # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
     if author in invalid_authors and invalid_authors[author]:
         author = author + "/" + "/".join(invalid_authors[author])
     # line = " * %s -- %s" % (author, contribution)
     line = author
     contributors_file.write(line + "\n")
 contributors_file.close()
 print("Contributors list is successfully written to %s!" % contributors_file_name)

 # Prompt the user to translate author names if necessary
 if invalid_authors:
     warnings.append("Found the following invalid authors:")
     for a in invalid_authors:
         warnings.append("\t%s" % a)
     warnings.append("Please run './translate-contributors.py' to translate them.")

 # Log any warnings encountered in the process
 if warnings:
     print("\n============ Warnings encountered while creating the contributor list ============")
     for w in warnings:
         print(w)
     print("Please correct these in the final contributors list at %s." % contributors_file_name)
     print("==================================================================================\n")
	#!/usr/bin/env python3

	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# This script automates the process of creating release notes.

	import os
	import re
	import sys

	from releaseutils import tag_exists, get_commits, yesOrNoPrompt, get_date, \
	is_valid_author, capitalize_author, JIRA, find_components, translate_issue_type, \
	translate_component, CORE_COMPONENT, contributors_file_name, nice_join

	# You must set the following before use!
	JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
	RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2")
	PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")

	# If the release tags are not provided, prompt the user to provide them
	while not tag_exists(RELEASE_TAG):
	RELEASE_TAG = input("Please provide a valid release tag: ")
	while not tag_exists(PREVIOUS_RELEASE_TAG):
	print("Please specify the previous release tag.")
	PREVIOUS_RELEASE_TAG = input(
	"For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")

	# Gather commits found in the new tag but not in the old tag.
	# This filters commits based on both the git hash and the PR number.
	# If either is present in the old tag, then we ignore the commit.
	print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
	release_commits = get_commits(RELEASE_TAG)
	previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
	previous_release_hashes = set()
	previous_release_prs = set()
	for old_commit in previous_release_commits:
	previous_release_hashes.add(old_commit.get_hash())
	if old_commit.get_pr_number():
	previous_release_prs.add(old_commit.get_pr_number())
	new_commits = []
	for this_commit in release_commits:
	this_hash = this_commit.get_hash()
	this_pr_number = this_commit.get_pr_number()
	if this_hash in previous_release_hashes:
	continue
	if this_pr_number and this_pr_number in previous_release_prs:
	continue
	new_commits.append(this_commit)
	if not new_commits:
	sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))

	# Prompt the user for confirmation that the commit range is correct
	print("\n==================================================================================")
	print("JIRA server: %s" % JIRA_API_BASE)
	print("Release tag: %s" % RELEASE_TAG)
	print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
	print("Number of commits in this range: %s" % len(new_commits))
	print("")


	def print_indented(_list):
	for x in _list:
	print(" %s" % x)
	if yesOrNoPrompt("Show all commits?"):
	print_indented(new_commits)
	print("==================================================================================\n")
	if not yesOrNoPrompt("Does this look correct?"):
	sys.exit("Ok, exiting")

	# Filter out special commits
	releases = []
	maintenance = []
	reverts = []
	nojiras = []
	filtered_commits = []


	def is_release(commit_title):
	return ("[release]" in commit_title.lower() or
	"preparing spark release" in commit_title.lower() or
	"preparing development version" in commit_title.lower() or
	"CHANGES.txt" in commit_title)


	def is_maintenance(commit_title):
	return "maintenance" in commit_title.lower() or \
	"manually close" in commit_title.lower()


	def has_no_jira(commit_title):
	return not re.findall("SPARK-[0-9]+", commit_title.upper())


	def is_revert(commit_title):
	return "revert" in commit_title.lower()


	def is_docs(commit_title):
	return re.findall("docs*", commit_title.lower()) or \
	"programming guide" in commit_title.lower()


	for c in new_commits:
	t = c.get_title()
	if not t:
	continue
	elif is_release(t):
	releases.append(c)
	elif is_maintenance(t):
	maintenance.append(c)
	elif is_revert(t):
	reverts.append(c)
	elif is_docs(t):
	filtered_commits.append(c) # docs may not have JIRA numbers
	elif has_no_jira(t):
	nojiras.append(c)
	else:
	filtered_commits.append(c)

	# Warn against ignored commits
	if releases or maintenance or reverts or nojiras:
	print("\n==================================================================================")
	if releases:
	print("Found %d release commits" % len(releases))
	if maintenance:
	print("Found %d maintenance commits" % len(maintenance))
	if reverts:
	print("Found %d revert commits" % len(reverts))
	if nojiras:
	print("Found %d commits with no JIRA" % len(nojiras))
	print("* Warning: these commits will be ignored.\n")
	if yesOrNoPrompt("Show ignored commits?"):
	if releases:
	print("Release (%d)" % len(releases))
	print_indented(releases)
	if maintenance:
	print("Maintenance (%d)" % len(maintenance))
	print_indented(maintenance)
	if reverts:
	print("Revert (%d)" % len(reverts))
	print_indented(reverts)
	if nojiras:
	print("No JIRA (%d)" % len(nojiras))
	print_indented(nojiras)
	print("==================== Warning: the above commits will be ignored ==================\n")
	prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
	if not yesOrNoPrompt(prompt_msg):
	sys.exit("Ok, exiting.")

	# Keep track of warnings to tell the user at the end
	warnings = []

	# Mapping from the invalid author name to its associated JIRA issues
	# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
	invalid_authors = {}

	# Populate a map that groups issues and components by author
	# It takes the form: Author name -> { Contribution type -> Spark components }
	# For instance,
	# {
	# 'Andrew Or': {
	# 'bug fixes': ['windows', 'core', 'web ui'],
	# 'improvements': ['core']
	# },
	# 'Tathagata Das' : {
	# 'bug fixes': ['streaming']
	# 'new feature': ['streaming']
	# }
	# }
	#
	author_info = {}
	jira_options = {"server": JIRA_API_BASE}
	jira_client = JIRA(options=jira_options)
	print("\n=========================== Compiling contributor list ===========================")
	for commit in filtered_commits:
	_hash = commit.get_hash()
	title = commit.get_title()
	issues = re.findall("SPARK-[0-9]+", title.upper())
	author = commit.get_author()
	date = get_date(_hash)
	# If the author name is invalid, keep track of it along
	# with all associated issues so we can translate it later
	if is_valid_author(author):
	author = capitalize_author(author)
	else:
	if author not in invalid_authors:
	invalid_authors[author] = set()
	for issue in issues:
	invalid_authors[author].add(issue)
	# Parse components from the commit title, if any
	commit_components = find_components(title, _hash)
	# Populate or merge an issue into author_info[author]

	def populate(issue_type, components):
	components = components or [CORE_COMPONENT] # assume core if no components provided
	if author not in author_info:
	author_info[author] = {}
	if issue_type not in author_info[author]:
	author_info[author][issue_type] = set()
	for component in components:
	author_info[author][issue_type].add(component)
	# Find issues and components associated with this commit
	for issue in issues:
	try:
	jira_issue = jira_client.issue(issue)
	jira_type = jira_issue.fields.issuetype.name
	jira_type = translate_issue_type(jira_type, issue, warnings)
	jira_components = [translate_component(c.name, _hash, warnings)
	for c in jira_issue.fields.components]
	all_components = set(jira_components + commit_components)
	populate(jira_type, all_components)
	except Exception as e:
	print("Unexpected error:", e)
	# For docs without an associated JIRA, manually add it ourselves
	if is_docs(title) and not issues:
	populate("documentation", commit_components)
	print(" Processed commit %s authored by %s on %s" % (_hash, author, date))
	print("==================================================================================\n")

	# Write to contributors file ordered by author names
	# Each line takes the format " * Author name -- semi-colon delimited contributions"
	# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core
	# e.g. * Tathagata Das -- Bug fixes and new features in Streaming
	contributors_file = open(contributors_file_name, "w")
	authors = list(author_info.keys())
	authors.sort()
	for author in authors:
	contribution = ""
	components = set()
	issue_types = set()
	for issue_type, comps in author_info[author].items():
	components.update(comps)
	issue_types.add(issue_type)
	# If there is only one component, mention it only once
	# e.g. Bug fixes, improvements in MLlib
	if len(components) == 1:
	contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
	# Otherwise, group contributions by issue types instead of modules
	# e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
	else:
	contributions = ["%s in %s" % (issue_type, nice_join(comps))
	for issue_type, comps in author_info[author].items()]
	contribution = "; ".join(contributions)
	# Do not use python's capitalize() on the whole string to preserve case
	assert contribution
	contribution = contribution[0].capitalize() + contribution[1:]
	# If the author name is invalid, use an intermediate format that
	# can be translated through translate-contributors.py later
	# E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
	if author in invalid_authors and invalid_authors[author]:
	author = author + "/" + "/".join(invalid_authors[author])
	# line = " * %s -- %s" % (author, contribution)
	line = author
	contributors_file.write(line + "\n")
	contributors_file.close()
	print("Contributors list is successfully written to %s!" % contributors_file_name)

	# Prompt the user to translate author names if necessary
	if invalid_authors:
	warnings.append("Found the following invalid authors:")
	for a in invalid_authors:
	warnings.append("\t%s" % a)
	warnings.append("Please run './translate-contributors.py' to translate them.")

	# Log any warnings encountered in the process
	if warnings:
	print("\n============ Warnings encountered while creating the contributor list ============")
	for w in warnings:
	print(w)
	print("Please correct these in the final contributors list at %s." % contributors_file_name)
	print("==================================================================================\n")