blob: c0754cae76b3f9f483f489fa0177253070b960ac [file] [log] [blame]
#!/usr/bin/env python3
############################################################################
# tools/codeowners/codeowners.py
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership. The
# ASF licenses this file to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance with the
# License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
############################################################################
import os
import re
import subprocess
import sys
from multiprocessing import Pool
from typing import TypeAlias
# Size of the worker pool used to query git history in parallel
N_PROCESSES: int = 20

# How many of the most active authors of a file are kept as its code owners
TOP_N_AUTHORS: int = 5

# Directory entry whose presence identifies the root of the git repository
GIT_FOLDER: str = ".git"

# git invocation that reports the author email and change summary of each
# commit. The double quotes are part of the argument itself and are removed
# again when the output is parsed.
CHANGE_STATS_CMD: list[str] = ["git", "--no-pager", "log", '--pretty="%ae"', "--shortstat"]

# git invocation that lists every file tracked by the repository
GIT_FILE_CMD: list[str] = ["git", "ls-files"]

# Path patterns that are left out of code owner generation
IGNORE_LOCS: list[str] = [
    "Documentation/ReleaseNotes/*",  # Generated release notes
    "**/__init__.py",  # Boilerplate files
    "**/Make.defs",  # Files that aren't really necessary to have owners
    "**/CMakeLists.txt",
    "*.png",  # Image files from documentation
    "*.jpg",
]
# Represents a code owner by email
CodeOwner: TypeAlias = str
class Ownership:
"""Represents an ownership relationship for a path."""
def __init__(self, owner: CodeOwner, path: str, changes: int) -> None:
"""Creates a new ownership relationship."""
self.owner: CodeOwner = owner
self.path: str = path
self.changes: int = changes
def __str__(self) -> str:
return f"Ownership(path='{self.path}', owner='{self.owner}', changes={self.changes})"
def __repr__(self) -> str:
return self.__str__()
# Represents all contributors to a file
Roster: TypeAlias = dict[str, list[Ownership]]
def parse_change_count(changestr: str) -> int:
    """Turns a comma-separated change summary into a total number of changes.

    The first comma-separated field (the number of files changed) is ignored.
    From every remaining field the first run of digits, if any, is added to
    the total.
    """
    fields_after_file_count = changestr.split(",")[1:]
    first_numbers = (re.search(r"\d+", field) for field in fields_after_file_count)
    return sum(int(number.group()) for number in first_numbers if number is not None)
def get_owners(path: str) -> list[Ownership]:
    """Gets ownership relationships for `path`.

    Runs `CHANGE_STATS_CMD` for `path` and adds up, per author email, the
    change counts reported for that author's commits.

    Returns one `Ownership` per author that has at least one commit with a
    change summary, in order of first appearance in the log. If the git
    command produces no output, the list is empty.
    """
    # "--" tells git that `path` is a path and never a revision or an option
    cmd = CHANGE_STATS_CMD + ["--", path]
    raw_logs = subprocess.run(cmd, capture_output=True, text=True).stdout

    ownerships: dict[CodeOwner, Ownership] = {}
    pending_owner: CodeOwner | None = None

    for line in raw_logs.split("\n"):
        if line == "":
            continue

        if not line[0].isspace():
            # Author line. Remember it until its summary shows up. If the
            # previous commit had no summary (e.g. a merge commit, for which
            # git shows no diff by default), that author is simply replaced
            # here instead of shifting every following email/summary pair.
            pending_owner = line.replace('"', "")
            continue

        # Summary line: git indents the `--shortstat` output with a space
        if pending_owner is None:
            continue
        change_count = parse_change_count(line)
        known = ownerships.get(pending_owner)
        if known is None:
            ownerships[pending_owner] = Ownership(
                path=path,
                owner=pending_owner,
                changes=change_count,
            )
        else:
            known.changes += change_count
        # Each commit has at most one summary line
        pending_owner = None

    return list(ownerships.values())
def sort_by_changes(owners: list[Ownership]) -> None:
    """Reorders `owners` in place so that the owner with the most changes comes first."""

    def change_count(entry: Ownership) -> int:
        return entry.changes

    owners.sort(reverse=True, key=change_count)
def filter_owners(owners: list[Ownership], ignore: list[CodeOwner]) -> list[Ownership]:
    """Returns a new list with the entries of `owners` whose owner is not in the `ignore` list.

    The relative order of the remaining entries is kept and `owners` itself is
    not modified.
    """
    kept: list[Ownership] = []
    for entry in owners:
        if entry.owner in ignore:
            continue
        kept.append(entry)
    return kept
def get_files() -> list[str]:
    """Returns a list of currently tracked files in the git repository.

    Every pattern in `IGNORE_LOCS` is passed to git as an excluding pathspec,
    so matching files are not part of the result. If git prints nothing, the
    result is an empty list.
    """
    exclusions = [f":!:{loc}" for loc in IGNORE_LOCS]
    cmd = GIT_FILE_CMD + ["--"] + exclusions
    listing = subprocess.run(cmd, text=True, capture_output=True).stdout
    # Drop empty lines rather than stripping the whole output: splitting an
    # empty string would otherwise yield a single bogus "" path, and stripping
    # could remove whitespace that belongs to the first or last file name.
    return [line for line in listing.split("\n") if line]
def parse_ignore_list(path: str) -> list[CodeOwner]:
    """Reads the file at `path` and returns one entry per line.

    Leading and trailing whitespace is removed from every line.
    """
    with open(path, "r") as ignore_file:
        return [entry.strip() for entry in ignore_file]
def generate_codeowners(roster: Roster) -> None:
    """Prints the `roster` to the console in code-owner file form.

    Each entry becomes one line: the file followed by its owners, every item
    followed by a single space.
    """
    for tracked_path, entries in roster.items():
        emails = [entry.owner for entry in entries]
        # The line ends with a space before the newline because every item,
        # including the last one, is followed by a space.
        print(tracked_path, *emails, end=" \n")
def main() -> None:
    """Generates the code-owner listing for the repository in the current directory.

    The optional first command line argument is the path to a file of author
    emails that must not be listed as code owners. The listing is printed
    to standard output; hints and errors go to standard error so that they do
    not end up in the listing when standard output is redirected to a file.
    """
    # Check that we are running in the repository root
    if GIT_FOLDER not in os.listdir():
        print(
            "You should only run this script from the root of the git repository.",
            file=sys.stderr,
        )
        return

    ignore_list: list[CodeOwner] = []
    if len(sys.argv) < 2:
        print(
            "You can pass in the path to the list of emails to ignore when generating the code owner file. "
            "This prevents these authors from getting auto-requested in PR reviews.",
            file=sys.stderr,
        )
        print("Ex: ./tools/codeowners.py ./tools/codeowners-ignore.txt", file=sys.stderr)
    else:
        ignore_list = parse_ignore_list(sys.argv[1])

    source_files = get_files()

    # The git history of every file is queried in a pool of worker processes;
    # `map` returns the results in the same order as `source_files`.
    with Pool(N_PROCESSES) as p:
        all_ownerships = p.map(get_owners, source_files)

    roster: Roster = {}
    for file, ownerships in zip(source_files, all_ownerships):
        sort_by_changes(ownerships)  # Sort by greatest change count
        ownerships = ownerships[:TOP_N_AUTHORS]  # Only use the top `N`
        # Filter authors who wish to be ignored. This happens after the
        # truncation above, so an ignored author still takes up one of the
        # top `N` places.
        ownerships = filter_owners(ownerships, ignore_list)

        # Skip files without owners since such lines do nothing of value
        if not ownerships:
            continue

        # Store in roster
        roster[file] = ownerships

    # Print like code-owner file
    generate_codeowners(roster)


if __name__ == "__main__":
    main()