| #!/usr/bin/env python3 |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| from __future__ import annotations |
| |
| import heapq |
| import logging |
| import math |
| import pickle |
| import re |
| import textwrap |
| from datetime import datetime |
| from functools import cached_property |
| from typing import TYPE_CHECKING |
| |
| import pendulum |
| import rich_click as click |
| from github import Github, UnknownObjectException |
| from rich.console import Console |
| |
| if TYPE_CHECKING: |
| from github.PullRequest import PullRequest |
| |
| logger = logging.getLogger(__name__) |
| |
| console = Console(width=400, color_system="standard") |
| |
| option_github_token = click.option( |
| "--github-token", |
| type=str, |
| required=True, |
| help=textwrap.dedent( |
| """ |
| A GitHub token is required, and can also be provided by setting the GITHUB_TOKEN env variable. |
| Can be generated with: |
| https://github.com/settings/tokens/new?description=Read%20issues&scopes=repo:status""" |
| ), |
| envvar="GITHUB_TOKEN", |
| ) |
| |
| |
| class PrStat: |
| PROVIDER_SCORE = 0.8 |
| REGULAR_SCORE = 1.0 |
| REVIEW_INTERACTION_VALUE = 2.0 |
| COMMENT_INTERACTION_VALUE = 1.0 |
| REACTION_INTERACTION_VALUE = 0.5 |
| |
| def __init__(self, g, pull_request: PullRequest): |
| self.g = g |
| self.pull_request = pull_request |
| self.title = pull_request.title |
| self._users: set[str] = set() |
| self.len_comments: int = 0 |
| self.comment_reactions: int = 0 |
| self.issue_nums: list[int] = [] |
| self.len_issue_comments: int = 0 |
| self.num_issue_comments: int = 0 |
| self.num_issue_reactions: int = 0 |
| self.num_comments: int = 0 |
| self.num_conv_comments: int = 0 |
| self.num_protm: int = 0 |
| self.conv_comment_reactions: int = 0 |
| self.interaction_score = 1.0 |
| |
| @property |
| def label_score(self) -> float: |
| """assigns label score""" |
| labels = self.pull_request.labels |
| for label in labels: |
| if "provider" in label.name: |
| return PrStat.PROVIDER_SCORE |
| return PrStat.REGULAR_SCORE |
| |
| def calc_comments(self): |
| """counts reviewer comments, checks for #protm tag, counts rxns""" |
| for comment in self.pull_request.get_comments(): |
| self._users.add(comment.user.login) |
| lowercase_body = comment.body.lower() |
| if "protm" in lowercase_body: |
| self.num_protm += 1 |
| self.num_comments += 1 |
| if comment.body is not None: |
| self.len_comments += len(comment.body) |
| for reaction in comment.get_reactions(): |
| self._users.add(reaction.user.login) |
| self.comment_reactions += 1 |
| |
| def calc_conv_comments(self): |
| """counts conversational comments, checks for #protm tag, counts rxns""" |
| for conv_comment in self.pull_request.get_issue_comments(): |
| self._users.add(conv_comment.user.login) |
| lowercase_body = conv_comment.body.lower() |
| if "protm" in lowercase_body: |
| self.num_protm += 1 |
| self.num_conv_comments += 1 |
| for reaction in conv_comment.get_reactions(): |
| self._users.add(reaction.user.login) |
| self.conv_comment_reactions += 1 |
| if conv_comment.body is not None: |
| self.len_issue_comments += len(conv_comment.body) |
| |
| @cached_property |
| def num_reviews(self) -> int: |
| """counts reviews""" |
| num_reviews = 0 |
| for review in self.pull_request.get_reviews(): |
| self._users.add(review.user.login) |
| num_reviews += 1 |
| return num_reviews |
| |
| def issues(self): |
| """finds issues in PR""" |
| if self.pull_request.body is not None: |
| regex = r"(?<=closes: #|elated: #)\d{5}" |
| issue_strs = re.findall(regex, self.pull_request.body) |
| self.issue_nums = [eval(s) for s in issue_strs] |
| |
| def issue_reactions(self): |
| """counts reactions to issue comments""" |
| if self.issue_nums: |
| repo = self.g.get_repo("apache/airflow") |
| for num in self.issue_nums: |
| try: |
| issue = repo.get_issue(num) |
| except UnknownObjectException: |
| continue |
| for reaction in issue.get_reactions(): |
| self._users.add(reaction.user.login) |
| self.num_issue_reactions += 1 |
| for issue_comment in issue.get_comments(): |
| self.num_issue_comments += 1 |
| self._users.add(issue_comment.user.login) |
| if issue_comment.body is not None: |
| self.len_issue_comments += len(issue_comment.body) |
| |
| def calc_interaction_score(self): |
| """calculates interaction score""" |
| interactions = ( |
| self.num_comments + self.num_conv_comments + self.num_issue_comments |
| ) * PrStat.COMMENT_INTERACTION_VALUE |
| interactions += ( |
| self.comment_reactions + self.conv_comment_reactions + self.num_issue_reactions |
| ) * PrStat.REACTION_INTERACTION_VALUE |
| self.interaction_score += interactions + self.num_reviews * PrStat.REVIEW_INTERACTION_VALUE |
| |
| @cached_property |
| def num_interacting_users(self) -> int: |
| _ = self.interaction_score # make sure the _users set is populated |
| return len(self._users) |
| |
| @cached_property |
| def num_changed_files(self) -> float: |
| return self.pull_request.changed_files |
| |
| @cached_property |
| def body_length(self) -> int: |
| if self.pull_request.body is not None: |
| return len(self.pull_request.body) |
| else: |
| return 0 |
| |
| @cached_property |
| def num_additions(self) -> int: |
| return self.pull_request.additions |
| |
| @cached_property |
| def num_deletions(self) -> int: |
| return self.pull_request.deletions |
| |
| @property |
| def change_score(self) -> float: |
| lineactions = self.num_additions + self.num_deletions |
| actionsperfile = lineactions / self.num_changed_files |
| if self.num_changed_files > 10: |
| if actionsperfile > 20: |
| return 1.2 |
| if actionsperfile < 5: |
| return 0.7 |
| return 1.0 |
| |
| @cached_property |
| def comment_length(self) -> int: |
| rev_length = 0 |
| for comment in self.pull_request.get_review_comments(): |
| if comment.body is not None: |
| rev_length += len(comment.body) |
| return self.len_comments + self.len_issue_comments + rev_length |
| |
| @property |
| def length_score(self) -> float: |
| score = 1.0 |
| if self.len_comments > 3000: |
| score *= 1.3 |
| if self.len_comments < 200: |
| score *= 0.8 |
| if self.body_length > 2000: |
| score *= 1.4 |
| if self.body_length < 1000: |
| score *= 0.8 |
| if self.body_length < 20: |
| score *= 0.4 |
| return round(score, 3) |
| |
| def adjust_interaction_score(self): |
| self.interaction_score *= min(self.num_protm + 1, 3) |
| |
| @property |
| def score(self): |
| # |
| # Current principles: |
| # |
| # Provider and dev-tools PRs should be considered, but should matter 20% less. |
| # |
| # A review is worth twice as much as a comment, and a comment is worth twice as much as a reaction. |
| # |
| # If a PR changed more than 20 files, it should matter less the more files there are. |
| # |
| # If the avg # of changed lines/file is < 5 and there are > 10 files, it should matter 30% less. |
| # If the avg # of changed lines/file is > 20 and there are > 10 files, it should matter 20% more. |
| # |
| # If there are over 3000 characters worth of comments, the PR should matter 30% more. |
| # If there are fewer than 200 characters worth of comments, the PR should matter 20% less. |
| # If the body contains over 2000 characters, the PR should matter 40% more. |
| # If the body contains fewer than 1000 characters, the PR should matter 20% less. |
| # |
| # Weight PRs with protm tags more heavily: |
| # If there is at least one protm tag, multiply the interaction score by the number of tags, up to 3. |
| # |
| self.calc_comments() |
| self.calc_conv_comments() |
| self.calc_interaction_score() |
| self.adjust_interaction_score() |
| |
| return round( |
| self.interaction_score |
| * self.label_score |
| * self.length_score |
| * self.change_score |
| / (math.log10(self.num_changed_files) if self.num_changed_files > 20 else 1), |
| 3, |
| ) |
| |
| def __str__(self) -> str: |
| if self.num_protm > 0: |
| return ( |
| "[magenta]##Tagged PR## [/]" |
| f"Score: {self.score:.2f}: PR{self.pull_request.number}" |
| f"by @{self.pull_request.user.login}: " |
| f'"{self.pull_request.title}". ' |
| f"Merged at {self.pull_request.merged_at}: {self.pull_request.html_url}" |
| ) |
| else: |
| return ( |
| f"Score: {self.score:.2f}: PR{self.pull_request.number}" |
| f"by @{self.pull_request.user.login}: " |
| f'"{self.pull_request.title}". ' |
| f"Merged at {self.pull_request.merged_at}: {self.pull_request.html_url}" |
| ) |
| |
| def verboseStr(self) -> str: |
| if self.num_protm > 0: |
| console.print("********************* Tagged with '#protm' *********************", style="magenta") |
| return ( |
| f"-- Created at [bright_blue]{self.pull_request.created_at}[/], " |
| f"merged at [bright_blue]{self.pull_request.merged_at}[/]\n" |
| f"-- Label score: [green]{self.label_score}[/]\n" |
| f"-- Length score: [green]{self.length_score}[/] " |
| f"(body length: {self.body_length}, " |
| f"comment length: {self.len_comments})\n" |
| f"-- Interaction score: [green]{self.interaction_score}[/] " |
| f"(users interacting: {self.num_interacting_users}, " |
| f"reviews: {self.num_reviews}, " |
| f"review comments: {self.num_comments}, " |
| f"review reactions: {self.comment_reactions}, " |
| f"non-review comments: {self.num_conv_comments}, " |
| f"non-review reactions: {self.conv_comment_reactions}, " |
| f"issue comments: {self.num_issue_comments}, " |
| f"issue reactions: {self.num_issue_reactions})\n" |
| f"-- Change score: [green]{self.change_score}[/] " |
| f"(changed files: {self.num_changed_files}, " |
| f"additions: {self.num_additions}, " |
| f"deletions: {self.num_deletions})\n" |
| f"-- Overall score: [red]{self.score:.2f}[/]\n" |
| ) |
| |
| |
| DAYS_BACK = 5 |
| # Current (or previous during first few days of the next month) |
| DEFAULT_BEGINNING_OF_MONTH = pendulum.now().subtract(days=DAYS_BACK).start_of("month") |
| DEFAULT_END_OF_MONTH = DEFAULT_BEGINNING_OF_MONTH.end_of("month").add(days=1) |
| |
| MAX_PR_CANDIDATES = 500 |
| DEFAULT_TOP_PRS = 10 |
| |
| |
| @click.command() |
| @option_github_token # TODO: this should only be required if --load isn't provided |
| @click.option( |
| "--date-start", type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_BEGINNING_OF_MONTH.date()) |
| ) |
| @click.option( |
| "--date-end", type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_END_OF_MONTH.date()) |
| ) |
| @click.option("--top-number", type=int, default=DEFAULT_TOP_PRS, help="The number of PRs to select") |
| @click.option("--save", type=click.File("wb"), help="Save PR data to a pickle file") |
| @click.option("--load", type=click.File("rb"), help="Load PR data from a file and recalculate scores") |
| @click.option("--verbose", is_flag="True", help="Print scoring details") |
| @click.option( |
| "--rate-limit", |
| is_flag="True", |
| help="Print API rate limit reset time using system time, and requests remaining", |
| ) |
| def main( |
| github_token: str, |
| date_start: datetime, |
| save: click.File(), # type: ignore |
| load: click.File(), # type: ignore |
| date_end: datetime, |
| top_number: int, |
| verbose: bool, |
| rate_limit: bool, |
| ): |
| g = Github(github_token) |
| |
| if rate_limit: |
| r = g.get_rate_limit() |
| requests_remaining: int = r.core.remaining |
| console.print( |
| f"[blue]GitHub API Rate Limit Info\n" |
| f"[green]Requests remaining: [red]{requests_remaining}\n" |
| f"[green]Reset time: [blue]{r.core.reset.astimezone()}" |
| ) |
| |
| selected_prs: list[PrStat] = [] |
| if load: |
| console.print("Loading PRs from cache and recalculating scores.") |
| selected_prs = pickle.load(load, encoding="bytes") |
| for pr in selected_prs: |
| console.print( |
| f"[green]Loading PR: #{pr.pull_request.number} `{pr.pull_request.title}`.[/]" |
| f" Score: {pr.score}." |
| f" Url: {pr.pull_request.html_url}" |
| ) |
| |
| if verbose: |
| console.print(pr.verboseStr()) |
| |
| else: |
| console.print(f"Finding best candidate PRs between {date_start} and {date_end}.") |
| repo = g.get_repo("apache/airflow") |
| commits = repo.get_commits(since=date_start, until=date_end) |
| pulls: list[PullRequest] = [pull for commit in commits for pull in commit.get_pulls()] |
| scores: dict = {} |
| for issue_num, pull in enumerate(pulls, 1): |
| p = PrStat(g=g, pull_request=pull) # type: ignore |
| scores.update({pull.number: [p.score, pull.title]}) |
| console.print( |
| f"[green]Selecting PR: #{pull.number} `{pull.title}` as candidate.[/]" |
| f" Score: {scores[pull.number][0]}." |
| f" Url: {pull.html_url}" |
| ) |
| |
| if verbose: |
| console.print(p.verboseStr()) |
| |
| selected_prs.append(p) |
| if issue_num == MAX_PR_CANDIDATES: |
| console.print(f"[red]Reached {MAX_PR_CANDIDATES}. Stopping") |
| break |
| |
| console.print(f"Top {top_number} out of {issue_num} PRs:") |
| for pr_scored in heapq.nlargest(top_number, scores.items(), key=lambda s: s[1]): |
| console.print(f"[green] * PR #{pr_scored[0]}: {pr_scored[1][1]}. Score: [magenta]{pr_scored[1][0]}") |
| |
| if save: |
| pickle.dump(selected_prs, save) |
| |
| if rate_limit: |
| r = g.get_rate_limit() |
| console.print( |
| f"[blue]GitHub API Rate Limit Info\n" |
| f"[green]Requests remaining: [red]{r.core.remaining}\n" |
| f"[green]Requests made: [red]{requests_remaining - r.core.remaining}\n" |
| f"[green]Reset time: [blue]{r.core.reset.astimezone()}" |
| ) |
| |
| |
| if __name__ == "__main__": |
| main() |