#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description text shown by the command-line help. It is kept as a raw,
# pre-formatted block, so line breaks and indentation below are significant.
HELP = '''
Compares two specified branches, using the Gerrit Change-Id as the
primary identifier. Ignored commits can be added via a JSON
configuration file or with a special string in the commit message.
Changes can be cherrypicked with the --cherry_pick argument.

This script can be used to keep two development branches
(by default, "master" and "2.x", in sync). It is equivalent
to cherry-picking commits one by one, but automates identifying
the commits to cherry-pick. Unlike "git cherry", it uses
the Gerrit Change-Id identifier in the commit message
as a key.

The ignored_commits.json configuration file is of the following
form. Note that commits are the full 20-byte git hashes
(40 hexadecimal characters).

[
  {
    "source": "master",
    "target": "2.x",
    "commits": [
      { "hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "comment": "..."},
      { "hash": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", "comment": "..."}
    ]
  }
]

The --target_remote_name is optional. If not specified, the target remote is set to
the value of the --source_remote_name. Debug logging to stderr can be enabled with
the --verbose flag.

Example:

$bin/<this script> --source_branch master --target_branch 2.x
--------------------------------------------------------------------------------
Commits in asf-gerrit/master but not in asf-gerrit/2.x:
--------------------------------------------------------------------------------
35a3e186d61b8f365b0f7d1127be311758437e16 IMPALA-5478: Run TPCDS queries with decimal_v2 enabled (Thu Jan 18 03:28:51 2018 +0000) - Taras Bobrovytsky
d9b6fd073055b436c7404d49454dc215b2c7a369 IMPALA-6386: Invalidate metadata at table level for dataload (Wed Jan 17 22:52:58 2018 +0000) - Joe McDonnell
dcc7be0ed483b332dac22d6596f56ff2a6cfdaa3 IMPALA-4315: Allow USE and SHOW TABLES if the user has only column privileges (Wed Jan 17 22:40:13 2018 +0000) - Csaba Ringhofer
b6e43133e671773d2757612f72cfcdb0ff303226 IMPALA-6399: Increase timeout in test_observability to reduce flakiness (Wed Jan 17 22:31:33 2018 +0000) - Lars Volker
--------------------------------------------------------------------------------
Jira keys referenced (Note: not all commit messages will reference a jira key):
IMPALA-5478,IMPALA-6386,IMPALA-4315,IMPALA-6399
--------------------------------------------------------------------------------
'''
import argparse
import json
import logging
import os
import re
import subprocess
import sys

from collections import defaultdict
try:
    from collections import OrderedDict
except ImportError:
    # collections.OrderedDict only exists from Python 2.7 on; older
    # interpreters need the separate "ordereddict" backport package.
    from ordereddict import OrderedDict
from pprint import pformat
def create_parser():
    '''Builds the command-line parser for this script.

    The parser uses the module-level HELP text, unreformatted, as its
    description, and appends the default value to the help text of every
    option that has a help string.

    Returns the configured argparse.ArgumentParser.
    '''
    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        '''
        Mix-in to leave the description alone, but show
        command-line defaults.
        '''
        pass

    parser = argparse.ArgumentParser(
        formatter_class=CustomFormatter,
        description=HELP)
    parser.add_argument(
        '--cherry_pick', action='store_true', default=False,
        help='Cherry-pick mismatched commits to current branch. This ' +
             'must match (in the hash sense) the target branch.')
    parser.add_argument(
        '--partial_ok', action='store_true', default=False,
        help='Exit with success if at least one cherrypick succeeded.')
    parser.add_argument('--source_branch', default='master')
    parser.add_argument('--target_branch', default='2.x')
    parser.add_argument(
        '--source_remote_name', default='asf-gerrit',
        help='Name of the source git remote. If set to empty string, ' +
             'this remote is not fetched and branch names are used ' +
             ' as is; otherwise, the source ref is remote/branch.')
    parser.add_argument(
        '--target_remote_name', default=None,
        help='Name of the target git remote; defaults to source remote. ' +
             'Empty strings are handled the same way as --source_remote_name.')
    # By default the ignore list is looked up next to this script, not in the
    # caller's working directory.
    default_ignored_commits_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'ignored_commits.json')
    parser.add_argument(
        '--ignored_commits_file', default=default_ignored_commits_path,
        help='JSON File that contains ignored commits as specified in the help')
    parser.add_argument(
        '--skip_commits_matching',
        default="Cherry-pick.?:.?not (for|to) {branch}",
        help='Regex searched for in commit messages that causes the commit to be ignored.' +
             ' {branch} is replaced with target branch; the search is case-insensitive')
    parser.add_argument(
        '--verbose', '-v', action='store_true', default=False,
        help='Turn on DEBUG and INFO logging')
    return parser
def read_ignored_commits(ignored_commits_file):
    '''Returns a dictionary containing commits that should be ignored.

    ignored_commits_file is a path to a JSON file holding a list of objects,
    each with "source" and "target" branch names and a "commits" list whose
    items carry a "hash" key.

    The returned dictionary's keys are (source_branch, target_branch) tuples
    and its values are sets of git hashes. Entries that repeat the same
    (source, target) pair are merged rather than replacing one another.
    The dictionary is a defaultdict, so looking up a pair that is not in the
    file yields an empty set.
    '''
    ignored_commits = defaultdict(set)
    with open(ignored_commits_file) as f:
        json_data = json.load(f)
    for result_dict in json_data:
        logging.debug("Parsing result_dict: %s", result_dict)
        branch_pair = (result_dict['source'], result_dict['target'])
        # update() instead of assignment, so that a second entry for the same
        # branch pair adds to the set built from the first one.
        ignored_commits[branch_pair].update(
            commit["hash"] for commit in result_dict['commits'])
    return ignored_commits
def build_commit_map(branch, merge_base):
    '''Creates a map from change id to (hash, subject, author, date, body).

    Runs "git log" in the current working directory for the commits that are
    reachable from branch but not from merge_base, and keys each commit by the
    first "Change-Id: ..." line found in its message body. Commits without a
    Change-Id are skipped with a warning.

    Returns an OrderedDict whose iteration order follows the "git log" output.
    Raises subprocess.CalledProcessError if "git log" fails.
    '''
    # Make sure git never hands its output to a pager.
    os.environ['GIT_PAGER'] = ''

    # Fields are joined with the ASCII unit separator and every commit ends
    # with the ASCII record separator, so that multi-line bodies can be split
    # apart unambiguously.
    fields = ['%H', '%s', '%an', '%cd', '%b']
    pretty_format = '\x1f'.join(fields) + '\x1e'
    # universal_newlines=True makes check_output return text rather than bytes
    # on Python 3, which the string splitting below relies on.
    log_output = subprocess.check_output(
        ["git", "log", branch, "^" + merge_base,
         "--pretty=" + pretty_format, "--color=never"],
        universal_newlines=True)

    result = OrderedDict()
    for record in log_output.split('\x1e'):
        # Skips both the empty string produced when git log finds no commits
        # and the trailing newline git log emits after the last record.
        if not record.strip():
            continue
        # At most 4 splits: a stray separator inside the body stays in the body.
        commit_hash, subject, author, date, body = [
            t.strip() for t in record.split('\x1f', 4)]
        change_id_matches = re.findall('Change-Id: (.*)', body)
        if change_id_matches:
            if len(change_id_matches) > 1:
                logging.warning(
                    "Commit %s contains multiple change ids; using first one.",
                    commit_hash)
            change_id = change_id_matches[0]
            result[change_id] = (commit_hash, subject, author, date, body)
        else:
            logging.warning('Commit {0} ({1}...) has no Change-Id.'.format(
                commit_hash, subject[:40]))
    logging.debug("Commit map for branch %s has size %d.", branch, len(result))
    return result
def cherrypick(cherry_pick_hashes, full_target_branch_name, partial_ok):
    """Cherrypicks the given commits.

    Also, checks that full_target_branch_name matches the current HEAD and
    exits the process with status 1 if it does not.

    cherry_pick_hashes is a list of git hashes, in the order to
    be cherry-picked. An empty list is a no-op.

    If partial_ok is true, return gracefully if at least one cherrypick
    has succeeded: the failing cherry-pick is aborted and the remaining
    hashes are left unpicked. Otherwise an Exception is raised on the first
    failure.

    Note that this function does not push to the remote.
    """
    print("Cherrypicking %d changes." % (len(cherry_pick_hashes),))

    if not cherry_pick_hashes:
        return

    # Cherrypicking only makes sense if we're on the equivalent of the target branch.
    # universal_newlines=True yields text, so the hashes print cleanly on Python 3.
    head_sha = subprocess.check_output(
        ['git', 'rev-parse', 'HEAD'], universal_newlines=True).strip()
    target_branch_sha = subprocess.check_output(
        ['git', 'rev-parse', full_target_branch_name],
        universal_newlines=True).strip()
    if head_sha != target_branch_sha:
        print("Cannot cherrypick because %s (%s) and HEAD (%s) are divergent." % (
            full_target_branch_name, target_branch_sha, head_sha))
        sys.exit(1)

    for i, cherry_pick_hash in enumerate(cherry_pick_hashes):
        # call() rather than check_call(): a non-zero exit is handled below.
        ret = subprocess.call(
            ['git', 'cherry-pick', '--keep-redundant-commits', cherry_pick_hash])
        if ret != 0:
            if partial_ok and i > 0:
                # Earlier picks succeeded; undo only the failed one and stop.
                subprocess.check_call(['git', 'cherry-pick', '--abort'])
                print("Failed to cherry-pick %s; stopping picks." % (cherry_pick_hash,))
                return
            raise Exception("Failed to cherry-pick: %s" % (cherry_pick_hash,))
def main():
    """Compares the source and target branches and reports missing commits.

    Parses the command line, fetches the configured remotes (unless a remote
    name is the empty string), and prints every source-branch commit whose
    Change-Id is absent from the target branch and which is not ignored,
    either through the ignored-commits file or through a commit message
    matching --skip_commits_matching. The Jira keys found in the printed
    subjects are listed afterwards. With --cherry_pick, the printed commits
    are then cherry-picked onto the current branch.
    """
    parser = create_parser()
    options = parser.parse_args()

    log_level = logging.DEBUG if options.verbose else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(threadName)s %(levelname)s: %(message)s')

    if options.target_remote_name is None:
        options.target_remote_name = options.source_remote_name

    # Ensure all branches are up to date, unless remotes are disabled
    # by specifying them with an empty string.
    if options.source_remote_name != "":
        subprocess.check_call(['git', 'fetch', options.source_remote_name])
        full_source_branch_name = options.source_remote_name + '/' + options.source_branch
    else:
        full_source_branch_name = options.source_branch
    if options.target_remote_name != "":
        # The shared remote was already fetched above.
        if options.source_remote_name != options.target_remote_name:
            subprocess.check_call(['git', 'fetch', options.target_remote_name])
        full_target_branch_name = options.target_remote_name + '/' + options.target_branch
    else:
        full_target_branch_name = options.target_branch

    # universal_newlines=True yields text (not bytes) on Python 3, so the
    # merge base can be concatenated into git arguments later on.
    merge_base = subprocess.check_output(
        ["git", "merge-base", full_source_branch_name, full_target_branch_name],
        universal_newlines=True).strip()
    source_commits = build_commit_map(full_source_branch_name, merge_base)
    target_commits = build_commit_map(full_target_branch_name, merge_base)

    ignored_commits = read_ignored_commits(options.ignored_commits_file)
    logging.debug("ignored commits from {0}:\n{1}"
                  .format(options.ignored_commits_file, pformat(ignored_commits)))
    commits_ignored = []  # Track commits actually ignored for debug logging

    cherry_pick_hashes = []
    print('-' * 80)
    print('Commits in {0} but not in {1}:'.format(
        full_source_branch_name, full_target_branch_name))
    print('-' * 80)
    jira_keys = []
    jira_key_pat = re.compile(r'(IMPALA-\d+)')
    # The branch name is escaped so that characters such as "." in "2.x" are
    # matched literally instead of being read as regex syntax.
    skip_commits_matching = re.compile(
        options.skip_commits_matching.format(
            branch=re.escape(options.target_branch)),
        re.IGNORECASE)
    ignored_for_branches = ignored_commits[
        (options.source_branch, options.target_branch)]
    for change_id, (commit_hash, msg, author, date, body) in source_commits.items():
        change_in_target = change_id in target_commits
        ignore_by_config = commit_hash in ignored_for_branches
        ignore_by_commit_message = skip_commits_matching.search(
            "\n".join([msg, body])) is not None
        # This conditional block just for debug logging of ignored commits
        if ignore_by_config or ignore_by_commit_message:
            if change_in_target:
                logging.debug("Not ignoring commit because change is already in target: {0}"
                              .format(commit_hash))
            else:
                if ignore_by_commit_message:
                    logging.debug("Ignoring commit {0} by commit message.".format(commit_hash))
                else:
                    logging.debug("Ignoring commit {0} by config file.".format(commit_hash))
                commits_ignored.append(commit_hash)
        else:
            logging.debug("NOT ignoring commit {0} since not in ignored commits ({1},{2})"
                          .format(commit_hash, options.source_branch, options.target_branch))
        if not change_in_target and not ignore_by_config and not ignore_by_commit_message:
            # A plain (non-u'') format string: on Python 2 a unicode template
            # would implicitly ASCII-decode the byte strings read from git and
            # fail on non-ASCII subjects or author names.
            print('{0} {1} ({2}) - {3}'.format(commit_hash, msg, date, author))
            cherry_pick_hashes.append(commit_hash)
            jira_keys += jira_key_pat.findall(msg)

    print('-' * 80)

    print("Jira keys referenced (Note: not all commit messages will reference a jira key):")
    print(','.join(jira_keys))
    print('-' * 80)

    logging.debug("Commits actually ignored (change was not in target): {0}"
                  .format(pformat(commits_ignored)))

    if options.cherry_pick:
        # "git log" lists the newest commit first, while cherry-picks have to
        # be applied oldest first.
        cherry_pick_hashes.reverse()
        cherrypick(cherry_pick_hashes, full_target_branch_name, options.partial_ok)
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()