| #!/usr/bin/python3 |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # Usage: critique-gerrit-review.py [--dryrun] |
| # |
# This script is meant to run on a jenkins.impala.io build slave and post back comments
| # to a code review. It does not need to run on all supported platforms so we use system |
| # python instead of the full Impala virtualenv. |
| # |
| # This script runs in the context of a source checkout. It posts comments for issues |
| # introduced between HEAD^ and HEAD. It picks up metadata from environment variables |
| # set by the jenkins gerrit trigger: GERRIT_CHANGE_NUMBER, GERRIT_PATCHSET_NUMBER, etc. |
| # |
| # It uses the gerrit ssh interface to post the review, connecting as |
| # impala-public-jenkins. |
| # Ref: https://gerrit-review.googlesource.com/Documentation/cmd-review.html |
| # |
| # Dependencies: |
| # ssh, pip, venv |
| # |
| # TODO: generalise to other warnings |
| # * clang-tidy |
| |
from argparse import ArgumentParser
from collections import defaultdict
import json
import os
from os import environ
import os.path
import re
from subprocess import (CalledProcessError, check_call, check_output, DEVNULL, Popen,
                        PIPE)
import sys
import venv
| |
| |
# Versions of the lint tooling installed into the scratch virtualenv by
# setup_virtualenv().
FLAKE8_VERSION = "7.1.1"
FLAKE8_DIFF_VERSION = "0.2.2"
PYPARSING_VERSION = "3.1.4"
FLAKE8_UNUSED_ARG_VERSION = "0.0.13"

# Locations inside the scratch virtualenv created by setup_virtualenv().
VENV_PATH = "gerrit_critic_venv"
VENV_BIN = os.path.join(VENV_PATH, "bin")
PIP_PATH = os.path.join(VENV_BIN, "pip3")
FLAKE8_DIFF_PATH = os.path.join(VENV_BIN, "flake8-diff")

# Limit on length of lines in source files.
LINE_LIMIT = 90

# Source file extensions that we should apply our line limit and whitespace rules to.
SOURCE_EXTENSIONS = {".cc", ".h", ".java", ".py", ".sh", ".thrift"}

# Source file patterns that we exclude from our checks.
EXCLUDE_FILE_PATTERNS = [
    re.compile(r".*be/src/kudu.*"),  # Kudu source code may have different rules.
    re.compile(r".*-benchmark.cc"),  # Benchmark files tend to have long lines.
    re.compile(r".*/function-registry/impala_functions.py"),  # Many long strings.
    re.compile(r".*/catalog/BuiltinsDb.java"),  # Many long strings.
    re.compile(r".*/codegen/gen_ir_descriptions.py"),  # Many long strings.
    re.compile(r".*shell/ext-py/.*"),  # Third-party code.
    re.compile(r".*be/src/thirdparty/.*"),  # Third-party code.
    re.compile(r".*/.*\.xml\.py")  # Long lines in config template files.
]

# Thrift files that are not used in communication between impalad and catalogd/statestore
EXCLUDE_THRIFT_FILES = {
    "BackendGflags.thrift",  # Only used between FE and BE
    "beeswax.thrift",  # Only used between client and impalad
    "DataSinks.thrift",  # Only used in impalads
    "Descriptors.thrift",  # Only used in impalads
    "ExecStats.thrift",  # Only used in impalads
    "LineageGraph.thrift",  # Only used in impalads
    "NetworkTest.thrift",  # Unused in production
    "Planner.thrift",  # Only used in impalads
    "PlanNodes.thrift",  # Only used in impalads
    "parquet.thrift",  # Only used in impalads
    "ResourceProfile.thrift",  # Only used in impalads
    "SystemTables.thrift",  # Only used in impalads
    "Zip.thrift",  # Unused
}

# Canned review messages posted verbatim to Gerrit.
# NOTE(review): THRIFT_FILE_COMMENT is not referenced in this file's visible code;
# confirm whether it is still used before removing.
THRIFT_FILE_COMMENT = (
    "This file is used in communication between impalad and catalogd/statestore. "
    "Please make sure impalads can still work with new/old versions of catalogd and "
    "statestore. Basically only new optional fields can be added.")
FBS_FILE_COMMENT = (
    "This file is used in communication between impalad and catalogd/statestore. "
    "Please make sure impalads can still work with new/old versions of catalogd and "
    "statestore. Basically only new fields can be added and should be added at the end "
    "of a table definition.\n"
    "https://flatbuffers.dev/flatbuffers_guide_writing_schema.html")
PLANNER_TEST_TABLE_STATS = (
    "Table numRows and totalSize is changed in this planner test. It is unnecessary to "
    "change them. Consider running testdata/bin/restore-stats-on-planner-tests.py "
    "script to restore them to consistent values.")
UNCOMMITTED_CHANGE = "Uncommitted change detected!"
# Values for the "side" of a comment: on the new patchset, or on the parent (base)
# revision it is compared against.
COMMENT_REVISION_SIDE = "REVISION"
COMMENT_PARENT_SIDE = "PARENT"

# Matches range information like:
# @@ -229,0 +230,2 @@ struct TColumnStats {
# NOTE(review): RANGE_RE, REQUIRED_FIELD_RE, OPTIONAL_FIELD_RE and INCLUDE_FILE_RE are
# not referenced in this file's visible code -- presumably consumed by thrift_parser;
# confirm before removing.
RANGE_RE = re.compile(r"^@@ -([0-9]*).* \+([0-9]*).*@@\s*(.*)$")
# Matches required/optional fields like:
# 7: required i64 num_trues
REQUIRED_FIELD_RE = re.compile(r"[0-9]*: required \w* \w*")
OPTIONAL_FIELD_RE = re.compile(r"[0-9]*: optional \w* \w*")
# Matches include statements like:
# include "Exprs.thrift"
INCLUDE_FILE_RE = re.compile(r"include \"(\w+\.thrift)\"")
# Appended to every thrift compatibility warning message.
THRIFT_WARNING_SUFFIX = (" might break the compatibility between impalad and "
                         "catalogd/statestore during upgrade")
| |
| |
def setup_virtualenv():
  """Create the scratch virtualenv and install the pinned lint packages into it."""
  venv.create(VENV_PATH, with_pip=True, system_site_packages=True)
  packages = [
      "wheel",
      f"flake8=={FLAKE8_VERSION}",
      f"flake8-diff=={FLAKE8_DIFF_VERSION}",
      f"pyparsing=={PYPARSING_VERSION}",
      f"flake8-unused-arguments=={FLAKE8_UNUSED_ARG_VERSION}",
  ]
  check_call([PIP_PATH, "install"] + packages)
  # Add the libpath of the installed venv to import pyparsing
  site_packages = os.path.join(
      VENV_PATH,
      f"lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages/")
  sys.path.append(site_packages)
| |
| |
def get_comment_input(message, line_number=0, side=COMMENT_REVISION_SIDE,
                      context_line="", dryrun=False):
  """Build a Gerrit CommentInput dict carrying 'message' for 'line_number' on the
  given 'side'. In dryrun mode the offending source line is attached as
  "context_line" so the printed output is easier to inspect."""
  comment_input = {
      "message": message,
      "line": line_number,
      "side": side,
  }
  if dryrun:
    comment_input["context_line"] = context_line
  return comment_input
| |
| |
def get_flake8_comments(base_revision, revision):
  """Get flake8 warnings for code changes made between 'base_revision' and 'revision'.
  Returns a dict with file path as keys and a list of CommentInput objects. See
  https://gerrit-review.googlesource.com/Documentation/rest-api-changes.html#review-input
  for information on the format.
  Raises Exception if flake8-diff writes to stderr or emits an unparsable line."""
  comments = defaultdict(lambda: [])
  # flake8 needs to be on the path.
  flake8_env = os.environ.copy()
  flake8_env["PATH"] = "{0}:{1}".format(VENV_BIN, flake8_env["PATH"])

  flake8_diff_proc = Popen(
      [FLAKE8_DIFF_PATH, "--standard-flake8-output", "--color", "off", base_revision,
       revision],
      stdin=PIPE, stdout=PIPE, stderr=PIPE, env=flake8_env, universal_newlines=True)
  stdout, stderr = flake8_diff_proc.communicate()
  # Ignore the return code since it will be non-zero if any violations are found. We want
  # to continue in that case. Instead check stderr for any errors.
  if stderr:
    raise Exception("Did not expect flake8-diff to write to stderr:\n{0}".format(stderr))

  # Match output lines like:
  # bin/jenkins/flake8-gerrit-review.py:25:1: F401 'json' imported but unused
  violation_re = re.compile(r"^([^:]*):([0-9]*):([0-9]*): (.*)$")

  for violation in stdout.splitlines():
    match = violation_re.match(violation)
    if not match:
      raise Exception("Pattern did not match line:\n{0}".format(violation))
    # Use distinct names instead of shadowing the 'file' builtin and reusing one
    # variable for both the raw line and its parsed integer.
    file_path, line_num, col, details = match.groups()
    line_num = int(line_num)
    col = int(col)
    if any(pattern.match(file_path) for pattern in EXCLUDE_FILE_PATTERNS):
      continue
    comment = {"message": "flake8: {0}".format(details)}
    # Heuristic: if the error is on the first column, assume it applies to the whole line.
    if col == 1:
      comment["line"] = line_num
    else:
      comment["range"] = {"start_line": line_num, "end_line": line_num,
                          "start_character": col - 1, "end_character": col}
    comments[file_path].append(comment)
  return comments
| |
| |
def get_misc_comments(base_revision, revision, dryrun=False):
  """Get miscellaneous warnings for code changes made in the git commit 'revision',
  e.g. long lines and trailing whitespace. These warnings are produced by directly
  parsing the diff output."""
  comments = defaultdict(lambda: [])
  # Matches range information like:
  # @@ -128 +133,2 @@ if __name__ == "__main__":
  RANGE_RE = re.compile(r"^@@ -[0-9,]* \+([0-9]*).*$")

  diff = check_output(["git", "diff", "-U0", "{0}..{1}".format(base_revision, revision),
                       ][:3] + ["{0}..{1}".format(base_revision, revision)][:0] or
                      ["git", "diff", "-U0", "{0}..{1}".format(base_revision, revision)],
                      universal_newlines=True)
  curr_file = None
  check_source_file = False
  curr_line_num = 0
  for diff_line in diff.splitlines():
    if diff_line.startswith("+++ "):
      # Start of diff for a file. Strip off "+++ b/" to get the file path.
      curr_file = diff_line[6:]
      extension = os.path.splitext(curr_file)[1]
      # Only warn on recognized source extensions that are not excluded.
      check_source_file = (extension in SOURCE_EXTENSIONS and not any(
          pattern.match(curr_file) for pattern in EXCLUDE_FILE_PATTERNS))
    elif diff_line.startswith("@@ "):
      # Hunk header of the unified diff, e.g.:
      # @@ -128 +133,2 @@ if __name__ == "__main__":
      # Extract the start line on the added side.
      match = RANGE_RE.match(diff_line)
      if not match:
        raise Exception("Pattern did not match diff line:\n{0}".format(diff_line))
      curr_line_num = int(match.group(1))
    elif diff_line.startswith("+") and check_source_file:
      # An added or modified line - check it to see if we should generate warnings.
      add_misc_comments_for_line(comments, diff_line[1:], curr_file, curr_line_num,
                                 dryrun)
      curr_line_num += 1
  return comments
| |
| |
def add_misc_comments_for_line(comments, line, curr_file, curr_line_num, dryrun=False):
  """Helper for get_misc_comments to generate comments for 'line' at 'curr_line_num' in
  'curr_file' and append them to 'comments'."""
  def add_warning(message):
    # All warnings for this line share the same location and context.
    comments[curr_file].append(get_comment_input(
        message, curr_line_num, context_line=line, dryrun=dryrun))

  # Trailing whitespace.
  if line != line.rstrip():
    add_warning("line has trailing whitespace")

  # Long lines. Skip .py files since flake8 already flags long lines.
  extension = os.path.splitext(curr_file)[1]
  if extension != ".py" and len(line) > LINE_LIMIT:
    add_warning("line too long ({0} > {1})".format(len(line), LINE_LIMIT))

  # Tabs.
  if "\t" in line:
    add_warning("tab used for whitespace")

  # Potentially sensitive debug output in backend code.
  if curr_file.startswith("be/src/") and "ThriftDebugString" in line:
    add_warning(
        "Please make sure you don't output sensitive data with ThriftDebugString(). "
        "If so, use impala::RedactedDebugString() instead.")
| |
| |
def compare_thrift_structs(curr_file, old_structs, curr_structs, comments):
  """Compare the struct definitions of 'curr_file' between two revisions and append
  warning CommentInputs to 'comments' for changes that could break compatibility
  between impalad and catalogd/statestore: deleting a required field, renaming a
  field, changing a field's qualifier or type, or adding a new required field.

  'old_structs'/'curr_structs' map struct name -> dict of field id -> field object
  (as produced by thrift_parser); either may be None if the file does not exist in
  that revision. NOTE(review): matched fields are pop()-ed from the entries of
  'curr_structs', so this mutates the caller's 'curr_structs' in place and treats
  the leftover fields as newly added ones."""
  # Skip if the file is removed or new
  if old_structs is None or curr_structs is None:
    return
  # Adding new structs is OK. Compare fields of existing structs.
  for struct_name, old_fields in old_structs.items():
    if struct_name not in curr_structs:
      # Struct removal is only logged, not commented on.
      print(f"Removed struct {struct_name}")
      continue
    curr_fields = curr_structs[struct_name]
    for fid, old_field in old_fields.items():
      if fid not in curr_fields:
        # Field deleted. Only deleting a *required* field warrants a warning; the
        # comment is anchored on the PARENT side since the line no longer exists.
        if old_field.qualifier == 'required':
          comments[curr_file].append(get_comment_input(
            f"Deleting required field '{old_field.name}' of {struct_name}"
            + THRIFT_WARNING_SUFFIX,
            old_field.line_num, COMMENT_PARENT_SIDE))
        continue
      # pop() so that whatever remains in curr_fields afterwards is a new field.
      curr_field = curr_fields.pop(fid)
      if curr_field.name != old_field.name:
        comments[curr_file].append(get_comment_input(
          f"Renaming field '{old_field.name}' to '{curr_field.name}' in {struct_name}"
          + THRIFT_WARNING_SUFFIX,
          curr_field.line_num))
        continue
      if curr_field.qualifier != old_field.qualifier:
        comments[curr_file].append(get_comment_input(
          f"Changing field '{old_field.name}' from {old_field.qualifier} to "
          f"{curr_field.qualifier} in {struct_name}" + THRIFT_WARNING_SUFFIX,
          curr_field.line_num))
        continue
      if curr_field.type != old_field.type:
        comments[curr_file].append(get_comment_input(
          f"Changing type of field '{old_field.name}' from {old_field.type} to "
          f"{curr_field.type} in {struct_name}" + THRIFT_WARNING_SUFFIX,
          curr_field.line_num))
    # Fields left over after the pop()s above were added in the new revision. Only
    # new *required* fields are warned about.
    if len(curr_fields) > 0:
      for new_field in curr_fields.values():
        if new_field.qualifier == 'required':
          comments[curr_file].append(get_comment_input(
            f"Adding a required field '{new_field.name}' in {struct_name}"
            + THRIFT_WARNING_SUFFIX,
            new_field.line_num))
  # Entirely new structs are fine; just log them.
  new_struct_names = curr_structs.keys() - old_structs.keys()
  if len(new_struct_names) > 0:
    print(f"New structs {new_struct_names}")
| |
| |
def compare_thrift_enums(curr_file, old_enums, curr_enums, comments):
  """Compare the enum definitions of 'curr_file' between two revisions and append
  warning CommentInputs to 'comments' for removed or renamed enum items, which could
  break compatibility between impalad and catalogd/statestore."""
  # A missing side means the file was added or removed; nothing to compare.
  if old_enums is None or curr_enums is None:
    return
  for enum_name, old_items in old_enums.items():
    curr_items = curr_enums.get(enum_name)
    if curr_items is None:
      # Removing a whole enum is only logged.
      print(f"Removed enum {enum_name}")
      continue
    for value, old_item in old_items.items():
      curr_item = curr_items.get(value)
      if curr_item is None:
        # Item removed: anchor the comment on the PARENT side, where the line exists.
        comments[curr_file].append(get_comment_input(
            f"Removing the enum item {enum_name}.{old_item.name}={value}"
            + THRIFT_WARNING_SUFFIX,
            old_item.line_num, COMMENT_PARENT_SIDE))
      elif curr_item.name != old_item.name:
        comments[curr_file].append(get_comment_input(
            f"Enum item {old_item.name}={value} changed to "
            f"{curr_item.name}={value} in {enum_name}. This"
            + THRIFT_WARNING_SUFFIX,
            curr_item.line_num))
| |
| |
def extract_thrift_defs_of_revision(revision, file_name):
  """Extract the thrift definitions of 'file_name' at 'revision' using thrift_parser.
  Returns the (structs, enums) pair from extract_thrift_defs, or (None, None) if the
  file cannot be read at that revision (e.g. it was added or deleted there)."""
  # Importing thrift_parser depends on pyparsing being installed in setup_virtualenv().
  from thrift_parser import extract_thrift_defs
  try:
    contents = check_output(["git", "show", f"{revision}:{file_name}"],
                            universal_newlines=True)
  except CalledProcessError as e:
    # Catch only the expected subprocess failure instead of a blanket Exception, so
    # unrelated bugs are not silently converted into (None, None).
    # Usually it's due to file doesn't exist in that revision
    print(f"Failed to read {file_name} of revision {revision}: {e}")
    return None, None
  return extract_thrift_defs(contents)
| |
| |
def get_catalog_compatibility_comments(base_revision, revision):
  """Get comments on Thrift/FlatBuffers changes that might break the communication
  between impalad and catalogd/statestore"""
  comments = defaultdict(lambda: [])

  changed_thrift_files = check_output(
      ["git", "diff", "--name-only", "{0}..{1}".format(base_revision, revision),
       "--", "common/thrift/*.thrift"],
      universal_newlines=True).splitlines()
  for curr_file in changed_thrift_files:
    # Files not shared with catalogd/statestore are exempt from the checks.
    if os.path.basename(curr_file) in EXCLUDE_THRIFT_FILES:
      continue
    print(f"Parsing {curr_file}")
    curr_structs, curr_enums = extract_thrift_defs_of_revision(revision, curr_file)
    old_structs, old_enums = extract_thrift_defs_of_revision(base_revision, curr_file)
    compare_thrift_structs(curr_file, old_structs, curr_structs, comments)
    compare_thrift_enums(curr_file, old_enums, curr_enums, comments)

  merge_comments(
      comments, get_flatbuffers_compatibility_comments(base_revision, revision))
  return comments
| |
| |
def get_flatbuffers_compatibility_comments(base_revision, revision):
  """Comment on every modified FlatBuffers schema (.fbs) file under common/fbs, which
  are used in communication between impalad and catalogd/statestore. Returns a dict
  of file path -> list of CommentInput dicts."""
  comments = defaultdict(lambda: [])
  diff = check_output(
      ["git", "diff", "--numstat", "{0}..{1}".format(base_revision, revision),
       "--", "common/fbs"],
      universal_newlines=True)
  for diff_line in diff.splitlines():
    # numstat lines are "<added> <deleted> <path>". Limit the split so paths that
    # contain whitespace don't raise "too many values to unpack".
    _, _, path = diff_line.split(None, 2)
    if os.path.splitext(path)[1] == ".fbs":
      comments[path].append(get_comment_input(FBS_FILE_COMMENT))
  return comments
| |
| |
def get_planner_tests_comments():
  """Detect planner test files whose table stats were changed unnecessarily.

  Runs testdata/bin/restore-stats-on-planner-tests.py and comments on each .test file
  it modifies. Requires a clean testdata/ dir; if uncommitted changes are present,
  returns comments flagging those files instead and runs nothing. Restores testdata/
  before returning."""
  comments = defaultdict(lambda: [])
  # First, check that testdata/ dir is clean.
  diff = check_output(
      ["git", "diff", "--numstat", "--", "testdata"],
      universal_newlines=True)
  for diff_line in diff.splitlines():
    # numstat lines are "<added> <deleted> <path>". Limit the split so paths that
    # contain whitespace don't raise "too many values to unpack".
    _, _, path = diff_line.split(None, 2)
    comments[path].append(get_comment_input(UNCOMMITTED_CHANGE))
  if len(comments) > 0:
    return comments

  # All clean. Run script and comment any changes.
  check_call(['./testdata/bin/restore-stats-on-planner-tests.py'], stdout=DEVNULL)
  try:
    diff = check_output(
        ["git", "diff", "--numstat", "--", "testdata"],
        universal_newlines=True)
    for diff_line in diff.splitlines():
      _, _, path = diff_line.split(None, 2)
      if os.path.splitext(path)[1] == ".test":
        comments[path].append(get_comment_input(PLANNER_TEST_TABLE_STATS))
  finally:
    # Restore testdata/ dir even if parsing the diff above fails, so the checkout is
    # never left dirty.
    check_call(['git', 'checkout', 'testdata'])
  return comments
| |
| |
def get_eslint_comments(base_revision, revision):
  """Get comments based on JS code analysis and styling rules for
  all scripts present in the git commit 'revision'. Returns a dict of file path ->
  comments produced by the custom eslint formatter, or an empty dict when no .js
  files changed or eslint found no issues."""
  filenames = check_output(["git", "diff", "{0}..{1}".format(base_revision, revision),
                            "--name-only", "--", "*.js"], universal_newlines=True)
  filenames = filenames.strip().splitlines()
  comments = {}
  if (len(filenames) > 0):
    # Build a minimal environment for the node tooling. NOTE(review): os.environ.get()
    # returns None for unset variables, which the subprocess env would reject -- this
    # assumes IMPALA_HOME (or IMPALA_SCRIPTS_HOME) and HOME are set by the Jenkins job.
    node_env = {}
    if (os.environ.get("IMPALA_SCRIPTS_HOME")):
      node_env["IMPALA_HOME"] = os.environ.get("IMPALA_SCRIPTS_HOME")
    else:
      node_env["IMPALA_HOME"] = os.environ.get("IMPALA_HOME")
    node_env["HOME"] = os.environ.get("HOME")

    # setup_nodejs.sh prints the directory holding the node binaries on its last line.
    NODEJS_SETUP = os.path.join(node_env["IMPALA_HOME"], "bin/nodejs/setup_nodejs.sh")
    NODEJS_BIN = check_output([NODEJS_SETUP], env=node_env).decode("utf-8")
    NODEJS_BIN = NODEJS_BIN.splitlines()[-1]
    NODE = os.path.join(NODEJS_BIN, "node")
    NPM = os.path.join(NODEJS_BIN, "npm")
    LINTING_TESTS_HOME = os.path.join(node_env["IMPALA_HOME"], "tests/webui/linting")
    check_output([NODE, NPM, "--prefix", LINTING_TESTS_HOME, "install"])
    filenames = [os.path.join(node_env["IMPALA_HOME"], filename) for filename in
                 filenames]
    try:
      node_env["IMPALA_NODEJS"] = NODE
      node_env["REVIEW_JS_FILES"] = ' '.join(filenames)
      LINTING_BIN = os.path.join(LINTING_TESTS_HOME, "node_modules/.bin/eslint")
      LINTING_FORMATTER = os.path.join(LINTING_TESTS_HOME,
                                       "eslint/eslint_custom_output_formatter.js")
      LINTING_CONFIG = os.path.join(LINTING_TESTS_HOME, "eslint.config.js")
      check_output([NODE, LINTING_BIN, "--config", LINTING_CONFIG, *filenames, "-f",
                    LINTING_FORMATTER], env=node_env).decode("utf-8")
    except CalledProcessError as e:
      # eslint exits non-zero when it finds violations and the formatter prints them as
      # JSON on stdout. Catch only CalledProcessError: the previous blanket
      # "except Exception" would crash with AttributeError on 'e.output' for any other
      # failure (e.g. a missing eslint binary) and mask the real error.
      comments = json.loads(e.output)
  return comments
| |
def post_review_to_gerrit(review_input):
  """Post a review to the gerrit patchset. 'review_input' is a ReviewInput JSON object
  containing the review comments. The gerrit change and patchset are picked up from
  environment variables set by the gerrit jenkins trigger.
  Raises Exception if the ssh command exits non-zero."""
  change_num = environ["GERRIT_CHANGE_NUMBER"]
  patch_num = environ["GERRIT_PATCHSET_NUMBER"]
  # Review is submitted over the gerrit ssh interface as impala-public-jenkins.
  ssh_cmd = ["ssh", "-p", environ["GERRIT_PORT"],
             "impala-public-jenkins@" + environ["GERRIT_HOST"], "gerrit", "review",
             "--project", environ["GERRIT_PROJECT"], "--json",
             "{0},{1}".format(change_num, patch_num)]
  proc = Popen(ssh_cmd, stdin=PIPE, universal_newlines=True)
  # The ReviewInput JSON is fed to gerrit on stdin.
  proc.communicate(json.dumps(review_input))
  if proc.returncode != 0:
    raise Exception("Error posting review to gerrit.")
| |
| |
def merge_comments(a, b):
  """Merge the comment lists of dict 'b' into dict 'a' in place. 'a' must map each
  key to a list (e.g. a defaultdict(list))."""
  for path, comment_list in b.items():
    a[path].extend(comment_list)
| |
| |
if __name__ == "__main__":
  # Parse CLI options, gather comments from every checker, then print (and, unless
  # --dryrun, post) the resulting ReviewInput.
  parser = ArgumentParser(description="Generate and post gerrit comments")
  parser.add_argument(
      "--dryrun", action='store_true',
      help="Don't post comments back to gerrit. Also shows the context "
           "lines if possible.")
  parser.add_argument(
      "--revision", default="HEAD",
      help="The revision to check. Defaults to HEAD. Note that "
           "flake8-diff only actually works correctly on HEAD. So "
           "specifying other commits might miss the results of "
           "flake8-diff.")
  parser.add_argument(
      "--base-revision",
      help="The base revision to check. Defaults to the parent commit of"
           " revision")
  args = parser.parse_args()

  setup_virtualenv()
  revision = args.revision
  base_revision = args.base_revision or "{0}^".format(revision)
  # Collect comments from all checkers, keyed by file path. Order of merging
  # determines comment order within each file.
  comments = get_flake8_comments(base_revision, revision)
  merge_comments(comments, get_misc_comments(base_revision, revision, args.dryrun))
  merge_comments(comments, get_catalog_compatibility_comments(base_revision, revision))
  merge_comments(comments, get_planner_tests_comments())
  merge_comments(comments, get_eslint_comments(base_revision, revision))
  review_input = {"comments": comments}
  if comments:
    review_input["message"] = (
        "gerrit-auto-critic failed. You can reproduce it locally using command:\n\n"
        "  python3 bin/jenkins/critique-gerrit-review.py --dryrun\n\n"
        "To run it, you might need a virtual env with Python3's venv installed.")
  print(json.dumps(review_input, indent=True))
  if not args.dryrun:
    post_review_to_gerrit(review_input)