blob: 5cc68329ccca4178a932d9072aa80de82a0b2fae [file] [log] [blame]
#!/usr/bin/env python
"""Executable Python script for scanning source code for compliance.
This script checks some (simple) conventions:
- no symlinks
- no tabs
- no trailing whitespace
- files end with EOL
- valid license headers in source files (where applicable)
- general regex. string search
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
import argparse
import collections
try:
import configparser
except ImportError:
import ConfigParser as configparser
import fnmatch
import itertools
import os
import platform
import re
import sys
import textwrap
# import pathspec from local lib path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/lib')
import pathspec
from gitwildmatch import GitWildMatchPattern
VERBOSE = False
# Terminal colors
BLUE = '\033[94m'
CYAN = '\033[36m'
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[33m'
# Translatable messages (error and general)
ERR_REGEX = "line contains forbidden pattern [%s]. line=[%s]"
ERR_GENERAL = "an unspecified error was detected."
ERR_INVALID_CONFIG_FILE = "Invalid configuration file [%s]: %s.\n"
ERR_INVALID_SCAN_FUNCTION = "Config. file filter [%s] lists invalid " \
"function [%s]."
ERR_LICENSE = "file does not include required license header."
ERR_LICENSE_FILE_NOT_FOUND = "License file [%s] could not be found."
ERR_NO_EOL_AT_EOF = "file does not end with EOL."
ERR_PATH_IS_NOT_DIRECTORY = "%s: [%s] is not a valid directory.\n"
ERR_REQUIRED_SECTION = "Configuration file missing required section: [%s]"
ERR_SYMBOLIC_LINK = "file is a symbolic link."
ERR_TABS = "line contains tabs."
ERR_TRAILING_WHITESPACE = "line has trailing whitespace."
HELP_CONFIG_FILE = "provide custom configuration file"
HELP_GITIGNORE_FILE = "provide .gitignore file for additional path exclusions"
HELP_DISPLAY_EXCLUSIONS = "display path exclusion information"
HELP_ROOT_DIR = "the starting directory for the scan"
HELP_VERBOSE = "enable verbose output"
MSG_CHECKING_FILE = " [%s]..."
MSG_CHECKS_PASSED = "All checks passed."
MSG_CONFIG_ADDING_LICENSE_FILE = "Adding valid license from: [%s], value:\n%s"
MSG_ERROR_SUMMARY = "Scan detected %d error(s) in %d file(s):"
MSG_READING_CONFIGURATION = "Reading configuration file [%s]..."
MSG_READING_GITIGNORE = "Reading gitignore file [%s]..."
MSG_READING_LICENSE_FILE = "Reading license file [%s]..."
MSG_RUNNING_FILE_CHECKS = " Running File Check [%s]"
MSG_RUNNING_LINE_CHECKS = " Running Line Check [%s]"
MSG_SCANNING_FILTER = "Scanning files with filter: [%s]:"
MSG_SCANNING_STARTED = "Scanning files starting at [%s]..."
MSG_SKIPPING_FILE = "SKIPPING non-existent file [%s]"
MSG_SKIPPING_BINARY_FILE = "SKIPPING binary file [%s]"
WARN_CONFIG_SECTION_NOT_FOUND = "Configuration file section [%s] not found."
WARN_SCAN_EXCLUDED_PATH_SUMMARY = "Scan excluded (%s) patterns:"
WARN_SCAN_EXCLUDED_FILE_SUMMARY = "Scan excluded (%s) files:"
WARN_SCAN_EXCLUDED_FILE = " Excluded file: %s"
WARN_SCAN_EXCLUDED_PATH = " Excluded pattern: %s"
MSG_DESCRIPTION = "Scans all source code under specified directory for " \
"project compliance using provided configuration."
# Default values for command line arguments
DEFAULT_ROOT_DIR = "."
DEFAULT_PROGRAM_PATH = "./"
DEFAULT_CONFIG_FILE = "ASF-Release.cfg"
DEFAULT_LICENSE_SEARCH_SLACK = 500
# Configuration file sections
SECTION_LICENSE = "Licenses"
SECTION_EXCLUDE = "Excludes"
SECTION_INCLUDE = "Includes"
SECTION_OPTIONS = "Options"
SECTION_REGEX = "Regex"
# Configuration Options known keys
OPT_LICENSE_SLACK_LEN = "license_slack_length"
# Globals
"""Hold valid license headers within an array strings."""
valid_licenses = []
"""Paths to exclude from directory search."""
exclusion_paths = []
"""Regex. patterns to search for."""
regex_patterns = []
"""globals."""
exclusion_files_set = set()
license_search_slack_len = DEFAULT_LICENSE_SEARCH_SLACK
FILE_CHECK_FUNCTIONS = dict()
LINE_CHECK_FUNCTIONS = dict()
FILTERS_WITH_CHECK_FUNCTIONS = []
def print_error(msg):
"""Print error message to stderr."""
sys.stderr.write(col.red(msg) + "\n")
def print_warning(msg):
"""Print warning message to stdout."""
print(col.yellow(msg))
def print_status(msg):
"""Print status message to stdout."""
print(msg)
def print_success(msg):
"""Print success message to stdout."""
print(col.green(msg))
def print_highlight(msg):
"""Print highlighted message to stdout."""
print(col.cyan(msg))
def vprint(s):
"""Conditional print (stdout)."""
if VERBOSE:
print_status(s)
def get_config_section_dict(config, section):
"""Retrieve key-value(s) for requested section of a config. file."""
dict1 = {}
try:
options = config.options(section)
# print_warning("options for section: %s\n%s" % (section, options))
for option in options:
try:
dict1[option] = config.get(section, option)
except:
dict1[option] = None
except:
print_warning(WARN_CONFIG_SECTION_NOT_FOUND % section)
return None
return dict1
def find_license_on_path(filename, path):
"""Find the specified filename in path; return it or raise error."""
filename = os.path.join(path, filename)
if not os.path.exists(filename):
raise Exception(ERR_LICENSE_FILE_NOT_FOUND %
filename)
return filename
def read_license_files(config):
"""Read the license files to use when scanning source files."""
file_dict = get_config_section_dict(config, SECTION_LICENSE)
# vprint("file_dict: " + str(file_dict))
if file_dict is not None:
# for each key (license filename) in license section
for license_filename in file_dict:
# Read and append text of each license (header) to a global array.
# Each 'key' should be a filename containing license text.
try:
# if the file is not in current directory, try to find
# it in the path this script is being executed from.
if not os.path.exists(license_filename):
license_filename = find_license_on_path(
license_filename,
DEFAULT_PROGRAM_PATH)
with open(license_filename, 'r') as temp_file:
vprint(MSG_READING_LICENSE_FILE % license_filename)
str1 = str(temp_file.read())
valid_licenses.append(str(str1))
# vprint(MSG_CONFIG_ADDING_LICENSE_FILE % (license_filename, str1))
except Exception as e:
raise e
else:
raise Exception(ERR_REQUIRED_SECTION % SECTION_LICENSE)
def read_path_exclusions(config, gitignore_file):
"""Read the list of paths to exclude from the scan."""
path_dict = get_config_section_dict(config, SECTION_EXCLUDE)
# vprint("path_dict: " + str(path_dict))
if path_dict is not None:
# each 'key' is an exclusion path
for key in path_dict:
key = str.strip(key)
if key is not None:
exclusion_paths.append(key)
if gitignore_file is not None:
print_highlight(MSG_READING_GITIGNORE % gitignore_file.name)
for line in gitignore_file.read().splitlines():
exclusion_paths.append(line)
def read_scan_options(config):
"""Read the Options from the configuration file."""
options_dict = get_config_section_dict(config, SECTION_OPTIONS)
# vprint("options_dict: " + str(options_dict))
if options_dict is not None:
# Check for license scan slack length option
# Set global variable to value found in config.
if OPT_LICENSE_SLACK_LEN in options_dict:
global license_search_slack_len
license_search_slack_len = int(options_dict[OPT_LICENSE_SLACK_LEN])
else:
raise Exception(ERR_REQUIRED_SECTION % SECTION_OPTIONS)
def read_regex(config):
"""Read the Regular Expressions from the configuration file."""
options_dict = get_config_section_dict(config, SECTION_REGEX)
# vprint("options_dict: " + str(options_dict))
if options_dict is not None:
# each key is a regex string
for pattern in options_dict:
if pattern is not None:
regex_patterns.append(pattern)
else:
raise Exception(ERR_REQUIRED_SECTION % SECTION_REGEX)
def read_config_file(file, gitignore_file):
"""Read in and validate configuration file."""
try:
print_highlight(MSG_READING_CONFIGURATION % file.name)
# Provide for sections that have simply values (not key=value)
config = configparser.ConfigParser(allow_no_value=True)
# This option prevents options from being normalized to lowercase
# by allowing the raw string in the config. to be passed through
config.optionxform = str
if sys.version_info[0] < 3:
config.readfp(file)
else:
config.read_file(file)
read_license_files(config)
read_path_inclusions(config)
read_path_exclusions(config, gitignore_file)
read_scan_options(config)
read_regex(config)
except Exception as e:
print_error(e)
return -1
return 0
def no_tabs(line):
"""Assert line does not contains a TAB character."""
if re.match("\t", line):
return ERR_TABS
else:
return None
def no_trailing_spaces(line):
"""Assert line does not have trailing whitespace."""
if len(line) > 0 and line[-1] == '\n':
line = line[:-1]
if re.match("""^.*\s$""", line):
return ERR_TRAILING_WHITESPACE
else:
return None
def eol_at_eof(line):
"""Assert line at End of File is an End of Line character."""
if len(line) == 0 or line[-1] != '\n':
return ERR_NO_EOL_AT_EOF
else:
return None
def has_block_license(path):
"""Open file and verify it contains a valid license header."""
if not os.path.isfile(path):
if VERBOSE:
print_error(MSG_SKIPPING_FILE % path)
return []
with open(path) as fp:
for license in valid_licenses:
# Assure license string is normalized to remove indentations
# caused by declaration (above) as a string literal.
normalized_license = textwrap.dedent(license)
# Search for license at start of file,
# allowing for some "slack" length
file_head = fp.read(len(normalized_license) +
license_search_slack_len)
if file_head is None:
return [(1, ERR_LICENSE)]
elif normalized_license in file_head:
return []
# reset and try finding the next license
fp.seek(0)
return [(1, ERR_LICENSE)]
def is_not_symlink(path):
"""Assert a file is not a symbolic link."""
if os.path.islink(path):
return [(0, ERR_SYMBOLIC_LINK)]
else:
return None
def regex_check(line):
"""Assert line does not contain strings matching regex. expressions."""
# vprint("regex pattern: " + str(regex_patterns))
for pattern in regex_patterns:
if re.search(pattern, line):
return ERR_REGEX % (pattern, line)
else:
return None
# Note: this function must appear after all "check" functions are defined
def read_path_inclusions(config):
"""Read the list of paths to include in scan tests."""
inclusion_dict = get_config_section_dict(config, SECTION_INCLUDE)
# vprint("inclusion_dict: " + str(inclusion_dict))
for key in inclusion_dict:
all_checks = inclusion_dict[key]
# strip off all whitespace, regardless of index
all_checks = all_checks.replace(' ', '')
# retrieve the names of all functions to scan for
# the respective filename (wildcards allowed)
function_names = all_checks.split(',')
file_check_fxs = []
line_check_fxs = []
for fname in function_names:
try:
fx = globals()[fname]
if fname in FILE_CHECK_FUNCTIONS:
file_check_fxs.append(fx)
elif fname in LINE_CHECK_FUNCTIONS:
line_check_fxs.append(fx)
except Exception:
print_error(ERR_INVALID_SCAN_FUNCTION % (key, fname))
sys.exit(1)
a_tuple = (key, file_check_fxs, line_check_fxs)
FILTERS_WITH_CHECK_FUNCTIONS.append(a_tuple)
# vprint("filters(checks):" + str(FILTERS_WITH_CHECK_FUNCTIONS))
def run_file_checks(file_path, checks):
"""Run a series of file-by-file checks."""
errors = []
# if VERBOSE (True) then print filename being checked
vprint(MSG_CHECKING_FILE % file_path)
for check in checks:
vprint(col.cyan(MSG_RUNNING_FILE_CHECKS % check.__name__))
errs = check(file_path)
if errs:
errors += errs
return errors
def run_line_checks(file_path, checks):
"""Check each line in a file against given list of filters."""
if not os.path.isfile(file_path):
if VERBOSE:
print_error(MSG_SKIPPING_FILE % file_path)
return []
errors = []
line_number = 0
# For each line in the file, run all "line checks"
# open file in text mode; skip any binary files
try:
with open(file_path, 'r') as fp:
for line in fp:
line_number += 1
for check in checks:
if line_number == 1:
vprint(col.cyan(MSG_RUNNING_LINE_CHECKS %
check.__name__))
err = check(line)
if err is not None:
errors.append((line_number, err))
except UnicodeDecodeError:
if VERBOSE:
print_error(MSG_SKIPPING_BINARY_FILE % file_path)
return errors
def all_paths(root_dir):
"""Generator that returns files with known extensions that can be scanned.
Iteration is recursive beginning at the passed root directory and
skipping directories that are listed as exception paths.
"""
spec = pathspec.PathSpec.from_lines(GitWildMatchPattern, exclusion_paths)
exclusion_files_set = set(map(lambda f: os.path.join(root_dir, f), spec.match_tree(root_dir)))
for dir_path, dir_names, files in os.walk(root_dir):
for f in files:
filename = os.path.join(dir_path, f)
if filename not in exclusion_files_set:
yield filename
def colors():
"""Create a collection of helper functions to colorize strings."""
ansi = hasattr(sys.stderr, "isatty") and platform.system() != "Windows"
def colorize(code, string):
# Enable ANSI terminal color only around string provided (if valid)
return "%s%s%s" % (code, string, '\033[0m') if ansi else string
def cyan(s):
return colorize(CYAN, s)
def green(s):
return colorize(GREEN, s)
def red(s):
return colorize(RED, s)
def yellow(s):
return colorize(YELLOW, s)
return collections.namedtuple(
"Colorizer",
"cyan green red yellow")(cyan, green, red, yellow)
# Script entrypoint.
if __name__ == "__main__":
# Prepare message colorization methods
col = colors()
# Parser helpers
def is_dir(path):
"""Check if path is a directory."""
return os.path.isdir(root_dir)
# identify the path (directory) where scanCode.py is located
# Use this as default for finding default configuration
DEFAULT_PROGRAM_PATH = os.path.split(os.path.abspath(__file__))[0]
# vprintf("DEFAULT_PROGRAM_PATH: =[%s]" % DEFAULT_PROGRAM_PATH)
DEFAULT_CONFIG_FILE = os.path.join(DEFAULT_PROGRAM_PATH,
DEFAULT_CONFIG_FILE)
# create / configure our argument parser
# Note: ArgumentParser catches all errors and outputs a message
# to override this behavior you would need to subclass it.
parser = argparse.ArgumentParser(description=MSG_DESCRIPTION)
parser.add_argument("-v", "--verbose",
action="store_true",
dest="verbose",
default=False,
help=HELP_VERBOSE)
parser.add_argument("-x",
action="store_true",
dest="display_exclusions",
default=False,
help=HELP_DISPLAY_EXCLUSIONS)
parser.add_argument("--config",
type=argparse.FileType('r'),
action="store",
dest="config",
default=DEFAULT_CONFIG_FILE,
help=HELP_CONFIG_FILE)
parser.add_argument("--gitignore",
type=argparse.FileType('r'),
action="store",
dest="gitignore",
help=HELP_GITIGNORE_FILE)
parser.add_argument("root_directory",
type=str,
default=DEFAULT_ROOT_DIR,
help=HELP_ROOT_DIR)
# Invoke parser, assign argument values to locals
args = parser.parse_args()
root_dir = args.root_directory
VERBOSE = args.verbose
# Config file at this point is an actual file object
config_file = args.config
gitignore_file = args.gitignore
# Assign supported scan functions to either file or line globals
# These checks run once per-file
FILE_CHECK_FUNCTIONS.update({
"is_not_symlink": is_not_symlink,
"has_block_license": has_block_license
})
# These checks run once per-line, per-file
LINE_CHECK_FUNCTIONS.update({
"no_tabs": no_tabs,
"no_trailing_spaces": no_trailing_spaces,
"eol_at_eof": eol_at_eof,
"regex_check": regex_check
})
# Read / load configuration file from file (pointer)
if read_config_file(config_file, gitignore_file) == -1:
exit(1)
# Verify starting path parameter is valid
if not is_dir(root_dir):
print_error(ERR_PATH_IS_NOT_DIRECTORY % (sys.argv[0], root_dir))
parser.print_help()
exit(1)
# Positive feedback to caller that scanning has started
print_highlight(MSG_SCANNING_STARTED % root_dir)
# Runs all listed checks on all relevant files.
all_errors = []
paths_to_check = set(all_paths(root_dir))
for fltr, chks1, chks2 in FILTERS_WITH_CHECK_FUNCTIONS:
# vprint(col.cyan(MSG_SCANNING_FILTER % fltr))
# vprint("chks1=" + str(chks1))
# vprint("chks2=" + str(chks2))
matches = fnmatch.filter(paths_to_check, fltr)
# vprint("paths=" + str(paths_to_check))
# vprint("matches=" + str(matches))
for path in matches:
# vprint("path=[" + path + "]")
errors = run_file_checks(path, chks1)
errors += run_line_checks(path, chks2)
all_errors += map(lambda p: (path, p[0], p[1]), errors)
# Display directory (path) exclusion details
if VERBOSE:
print_warning(WARN_SCAN_EXCLUDED_PATH_SUMMARY % len(exclusion_paths))
# Display all paths that were excluded (by configuration)
for excluded_path in exclusion_paths:
print_warning(WARN_SCAN_EXCLUDED_PATH % excluded_path)
# Display which files where excluded from these paths
if args.display_exclusions:
print_warning(WARN_SCAN_EXCLUDED_FILE_SUMMARY %
len(exclusion_files_set))
for excluded_file in exclusion_files_set:
print_warning(WARN_SCAN_EXCLUDED_FILE % excluded_file)
def sort_key(p):
"""Define sort key for error listing as the filename."""
# Filename is the 0th entry in tuple
return p[0]
if all_errors:
# Group / sort errors by filename
error_listing = ""
files_with_errors = 0
for path, triples in itertools.groupby(sorted(all_errors,
key=sort_key),
key=sort_key):
files_with_errors += 1
error_listing += " [%s]:\n" % path
pairs = sorted(map(lambda t: (t[1], t[2]), triples),
key=lambda p: p[0])
for line, msg in pairs:
error_listing += col.red(" %4d: %s\n" % (line, msg))
# Summarize errors
summary = MSG_ERROR_SUMMARY % (len(all_errors), files_with_errors)
print_highlight(summary)
print(error_listing)
print_error(summary)
sys.exit(1)
else:
print_success(MSG_CHECKS_PASSED)
sys.exit(0)