| #!/usr/bin/env python3 |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| |
| import os |
| import sys |
| import argparse |
| import json |
| from datetime import datetime, timedelta |
| import requests |
| from dateutil.parser import parse |
| import pytz |
| from openai import OpenAI |
| |
| import logging |
| |
| logging.basicConfig(level=logging.DEBUG) |
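# Note: DEBUG-level root logging also emits verbose HTTP logs from libraries such as urllib3.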
| |
| |
| def get_github_api_token(): |
| """Get GitHub API token from environment variables.""" |
| token = os.environ.get("GITHUB_TOKEN") |
| if not token: |
| print( |
| "Warning: GitHub API token not found. Set the GITHUB_TOKEN environment variable." |
| ) |
| print("Without a token, API rate limits will be lower.") |
| return token |
| |
| |
| def get_openai_api_key(): |
| """Get OpenAI API key from environment variables.""" |
| api_key = os.environ.get("OPENAI_API_KEY") |
| if not api_key: |
| print( |
| "Error: OpenAI API key not found. Set the OPENAI_API_KEY environment variable." |
| ) |
| sys.exit(1) |
| return api_key |
| |
| |
| def init_openai_client(): |
| """Initialize the OpenAI client.""" |
| api_key = get_openai_api_key() |
| # Get the OpenAI API base URL from environment variable or use the default |
| api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") |
| # Get the model from environment variable or use a default |
| model = os.environ.get("OPENAI_MODEL", "gpt-4o") |
| |
| client = OpenAI( |
| api_key=api_key, |
| base_url=api_base, |
| default_query={"api-version": "2023-05-15"}, |
| ) |
| return client, model |
| |
| |
| def is_recent(timestamp_str, days=7): |
| """Check if the timestamp is within the last 'days' days.""" |
| now = datetime.now(pytz.utc) |
| timestamp = parse(timestamp_str) |
| delta = now - timestamp |
| return delta.days < days |
| |
| |
| def fetch_issues(repo, token, days=7): |
| """Fetch recent issues from a repository.""" |
    since_date = (datetime.now(pytz.utc) - timedelta(days=days)).strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
| headers = {"Accept": "application/vnd.github.v3+json"} |
| if token: |
| headers["Authorization"] = f"token {token}" |
| |
| url = f"https://api.github.com/repos/{repo}/issues" |
| params = { |
| "state": "all", |
| "since": since_date, |
| "sort": "updated", |
| "direction": "desc", |
| "per_page": 100, |
| } |
| |
| issues = [] |
| prs = [] |
| good_first_issues = [] |
| |
| response = requests.get(url, params=params, headers=headers) |
| if response.status_code != 200: |
| print(f"Error fetching issues: {response.status_code}") |
| return [], [], [] |
| |
| for item in response.json(): |
| if is_recent(item["updated_at"], days): |
| entry = { |
| "id": item["number"], |
| "title": item["title"], |
| "url": item["html_url"], |
| "user": item["user"]["login"], |
| "updated_at": item["updated_at"], |
| "body": item.get("body", "") or "", |
| "labels": [label["name"] for label in item.get("labels", [])], |
| "state": item["state"], |
| "comments": item["comments"], |
| } |
| |
| # Check if it's a good first issue |
| label_names = [label["name"].lower() for label in item.get("labels", [])] |
| is_good_first = any( |
| name |
| in [ |
| "good first issue", |
| "good-first-issue", |
| "beginner friendly", |
| "beginner-friendly", |
| "easy", |
| ] |
| for name in label_names |
| ) |
| |
| if "pull_request" in item: |
| # Get additional PR details |
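                # Each PR detail lookup costs an extra API request, so only do it when a token is available.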
| if token: |
| pr_url = ( |
| f"https://api.github.com/repos/{repo}/pulls/{item['number']}" |
| ) |
| pr_response = requests.get(pr_url, headers=headers) |
| if pr_response.status_code == 200: |
| pr_data = pr_response.json() |
| entry["additions"] = pr_data.get("additions", 0) |
| entry["deletions"] = pr_data.get("deletions", 0) |
| entry["changed_files"] = pr_data.get("changed_files", 0) |
| entry["mergeable"] = pr_data.get("mergeable", None) |
| entry["draft"] = pr_data.get("draft", False) |
| |
| prs.append(entry) |
| else: |
| issues.append(entry) |
| if is_good_first and item["state"] == "open": |
| good_first_issues.append(entry) |
| |
| return issues, prs, good_first_issues |
| |
| |
| def fetch_discussions(repo, token, days=7): |
| """Fetch recent discussions from a repository.""" |
| headers = {"Accept": "application/vnd.github.v3+json"} |
| if token: |
| headers["Authorization"] = f"token {token}" |
| |
| # GraphQL query to fetch discussions |
| query = """ |
| query($owner: String!, $name: String!) { |
| repository(owner: $owner, name: $name) { |
| discussions(first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { |
| nodes { |
| number |
| title |
| url |
| author { |
| login |
| } |
| updatedAt |
| bodyText |
| category { |
| name |
| } |
| comments { |
| totalCount |
| } |
| answerChosenAt |
| } |
| } |
| } |
| } |
| """ |
| |
| owner, name = repo.split("/") |
| variables = {"owner": owner, "name": name} |
| |
| url = "https://api.github.com/graphql" |
| response = requests.post( |
| url, json={"query": query, "variables": variables}, headers=headers |
| ) |
| |
| discussions = [] |
| if response.status_code != 200: |
| print(f"Error fetching discussions: {response.status_code}") |
| return discussions |
| |
    result = response.json()
    # The repository field is null when the repo is missing or inaccessible to the token.
    repository = (result.get("data") or {}).get("repository")
    if repository and "discussions" in repository:
        for discussion in repository["discussions"]["nodes"]:
| if is_recent(discussion["updatedAt"], days): |
| discussions.append( |
| { |
| "id": discussion["number"], |
| "title": discussion["title"], |
| "url": discussion["url"], |
| "user": discussion["author"]["login"] |
| if discussion["author"] |
| else "Anonymous", |
| "updated_at": discussion["updatedAt"], |
| "body": discussion.get("bodyText", "") or "", |
| "category": discussion.get("category", {}).get( |
| "name", "General" |
| ), |
| "comments": discussion.get("comments", {}).get("totalCount", 0), |
| "answered": discussion.get("answerChosenAt") is not None, |
| } |
| ) |
| |
| return discussions |
| |
| |
| def fetch_additional_good_first_issues(repo, token, count=5): |
| """Fetch additional good first issues even if they're older.""" |
| headers = {"Accept": "application/vnd.github.v3+json"} |
| if token: |
| headers["Authorization"] = f"token {token}" |
| |
| url = f"https://api.github.com/repos/{repo}/issues" |
| params = { |
| "state": "open", |
| "labels": "good first issue", |
| "sort": "updated", |
| "direction": "desc", |
| "per_page": count, |
| } |
| |
| additional_issues = [] |
| |
| # Try with 'good first issue' |
| response = requests.get(url, params=params, headers=headers) |
| if response.status_code == 200: |
| additional_issues.extend(response.json()) |
| |
| # If we didn't get enough, try with 'good-first-issue' |
| if len(additional_issues) < count: |
| params["labels"] = "good-first-issue" |
| response = requests.get(url, params=params, headers=headers) |
| if response.status_code == 200: |
| additional_issues.extend(response.json()) |
| |
| # If still not enough, try with 'beginner friendly' |
| if len(additional_issues) < count: |
| params["labels"] = "beginner friendly" |
| response = requests.get(url, params=params, headers=headers) |
| if response.status_code == 200: |
| additional_issues.extend(response.json()) |
| |
| # Format the issues |
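    # The label queries above may overlap; good first issues are deduplicated later by ID in format_data_for_llm().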
| formatted_issues = [] |
| for item in additional_issues[:count]: |
| formatted_issues.append( |
| { |
| "id": item["number"], |
| "title": item["title"], |
| "url": item["html_url"], |
| "user": item["user"]["login"], |
| "updated_at": item["updated_at"], |
| "body": item.get("body", "") or "", |
| "labels": [label["name"] for label in item.get("labels", [])], |
| "state": item["state"], |
| "comments": item["comments"], |
| } |
| ) |
| |
| return formatted_issues |
| |
| |
| def format_data_for_llm(repo, issues, prs, discussions, good_first_issues, days=7): |
| """Format data in a JSON structure that's friendly for LLM processing.""" |
| now = datetime.now() |
| |
    # Deduplicate the combined good first issues (recent + additional) by issue ID
| all_good_first_issues = {} |
| for issue in good_first_issues: |
| all_good_first_issues[issue["id"]] = issue |
| |
| result = { |
| "metadata": { |
| "repository": repo, |
| "date_generated": now.strftime("%Y-%m-%d"), |
| "period_days": days, |
| }, |
| "pull_requests": [ |
| { |
| "id": pr["id"], |
| "title": pr["title"], |
| "url": pr["url"], |
| "author": pr["user"], |
| "updated_at": pr["updated_at"], |
| "description": pr["body"], |
| "labels": pr["labels"], |
| "state": pr["state"], |
| "comments": pr["comments"], |
| "additions": pr.get("additions", "unknown"), |
| "deletions": pr.get("deletions", "unknown"), |
| "changed_files": pr.get("changed_files", "unknown"), |
| "draft": pr.get("draft", False), |
| } |
| for pr in prs |
| ], |
| "issues": [ |
| { |
| "id": issue["id"], |
| "title": issue["title"], |
| "url": issue["url"], |
| "author": issue["user"], |
| "updated_at": issue["updated_at"], |
| "description": issue["body"], |
| "labels": issue["labels"], |
| "state": issue["state"], |
| "comments": issue["comments"], |
| } |
| for issue in issues |
| ], |
| "discussions": [ |
| { |
| "id": discussion["id"], |
| "title": discussion["title"], |
| "url": discussion["url"], |
| "author": discussion["user"], |
| "updated_at": discussion["updated_at"], |
| "description": discussion["body"], |
| "category": discussion["category"], |
| "comments": discussion["comments"], |
| "answered": discussion.get("answered", False), |
| } |
| for discussion in discussions |
| ], |
| "good_first_issues": [ |
| { |
| "id": issue["id"], |
| "title": issue["title"], |
| "url": issue["url"], |
| "author": issue["user"], |
| "updated_at": issue["updated_at"], |
| "description": issue["body"], |
| "labels": issue["labels"], |
| "comments": issue["comments"], |
| } |
| for issue in all_good_first_issues.values() |
| ], |
| } |
| |
| return result |
| |
| |
| def summarize_with_openai(data, client, model): |
| """Use OpenAI to summarize and prioritize the repository activity.""" |
| |
| prompt = f""" |
| You are an open-source community evangelist responsible for reporting GitHub repository activity and encouraging more contributions. |
| |
| I will provide you with JSON data containing recent pull requests, issues, and discussions from |
| the repository {data["metadata"]["repository"]} for the past {data["metadata"]["period_days"]} days. |
| |
| Please analyze this data and provide: |
| 1. A concise summary of the overall activity and key themes |
| 2. The most important ongoing projects or initiatives based on the data |
| 3. Prioritized issues and PRs that need immediate attention |
| 4. Major discussions that should be highlighted |
    5. Any emerging trends or patterns in development
| |
| Additionally, include a section highlighting "Good First Issues" to encourage new contributors to join the project. Summarize what skills might be needed and why these issues are good entry points. |
| |
| IMPORTANT: For each PR you mention, ALWAYS include the contributor's GitHub username with @ symbol (e.g., @username) to properly credit their contributions. This is critical for recognizing contributors' work. |
| |
| CRITICAL FORMATTING INSTRUCTIONS: |
| |
| - When referring to PRs, issues, or discussions, use ONLY the GitHub reference format: #XXXX (number with # prefix) |
| - DO NOT include the title after the reference number |
| - DO NOT repeat the title in your explanation if you've already mentioned the reference number |
| - AVOID listing large numbers of PRs in sequence - instead, summarize them by theme or use bulleted lists with no more than 3-5 items per bullet |
| - For groups of related PRs, summarize the theme and mention 1-2 representative examples instead of listing all of them |
| - When appropriate, use standard Markdown URL syntax [meaningful text](full link) instead of just the reference number |
| |
| Example of correct format: |
| |
| - #1234 by @username implements the core authentication framework |
| - Multiple documentation updates were contributed by @contributor focusing on installation guides and API references |
| |
| Here's the JSON data: |
| ```json |
| {json.dumps(data, ensure_ascii=False)} |
| ``` |
| |
| Format your response as: |
| |
| *This weekly update is generated by LLMs. You're welcome to join our [Discord](https://opendal.apache.org/discord/) for in-depth discussions.* |
| |
| ## Overall Activity Summary |
| [Provide a concise overview of activity] |
| |
| ## Key Ongoing Projects |
| [List major projects/initiatives with brief descriptions - always mention contributors with @ symbol] |
| |
| ## Priority Items |
| [List issues/PRs that need immediate attention - always mention contributors with @ symbol] |
| |
| ## Notable Discussions |
| [Highlight important ongoing discussions - always mention contributors with @ symbol, use format like #1234: brief description] |
| |
| ## Emerging Trends |
| [Identify patterns or trends] |
| |
| ## Good First Issues |
| [List good first issues for new contributors with brief explanations of what makes them approachable, use format like #1234: brief description] |
| """ |
| |
| try: |
| response = client.chat.completions.create( |
| model=model, |
| messages=[ |
| { |
| "role": "system", |
| "content": "You are an open-source community evangelist responsible for reporting GitHub repository activity and encouraging more contributions.", |
| }, |
| {"role": "user", "content": prompt}, |
| ], |
| temperature=0.3, |
| max_tokens=4000, |
| ) |
| return response.choices[0].message.content |
| except Exception as e: |
| print(f"Error with OpenAI API: {e}") |
| return f"Error generating summary: {e}" |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser( |
| description="Generate a weekly summary of GitHub repository activity with OpenAI analysis." |
| ) |
| parser.add_argument("repo", help="GitHub repository in the format owner/repo") |
| parser.add_argument( |
| "--days", type=int, default=7, help="Number of days to look back (default: 7)" |
| ) |
| parser.add_argument("--output", help="Output file path (default: stdout)") |
| parser.add_argument( |
| "--raw", action="store_true", help="Output raw JSON data without OpenAI summary" |
| ) |
| parser.add_argument( |
| "--json-output", help="Path to save raw JSON data (in addition to summary)" |
| ) |
| parser.add_argument( |
| "--gfi-count", |
| type=int, |
| default=5, |
| help="Number of good first issues to include (default: 5)", |
| ) |
| |
| args = parser.parse_args() |
| |
| token = get_github_api_token() |
| |
| # Fetch data from GitHub API |
| print(f"Fetching data from {args.repo} for the last {args.days} days...") |
| issues, prs, recent_good_first_issues = fetch_issues(args.repo, token, args.days) |
| discussions = fetch_discussions(args.repo, token, args.days) |
| |
| # If we don't have enough good first issues from recent activity, fetch additional ones |
| if len(recent_good_first_issues) < args.gfi_count: |
| print("Fetching additional good first issues...") |
| additional_gfi = fetch_additional_good_first_issues( |
| args.repo, token, args.gfi_count - len(recent_good_first_issues) |
| ) |
| good_first_issues = recent_good_first_issues + additional_gfi |
| else: |
| good_first_issues = recent_good_first_issues |
| |
| print(f"Found {len(good_first_issues)} good first issues.") |
| |
| # Generate LLM-friendly structured data |
| structured_data = format_data_for_llm( |
| args.repo, issues, prs, discussions, good_first_issues, args.days |
| ) |
| |
| # Save raw JSON data if requested |
| if args.json_output: |
| with open(args.json_output, "w", encoding="utf-8") as f: |
| json.dump(structured_data, f, ensure_ascii=False, indent=2) |
| print(f"Raw JSON data written to {args.json_output}") |
| |
| # If raw output is requested, just print the JSON and exit |
| if args.raw: |
| if args.output: |
| with open(args.output, "w", encoding="utf-8") as f: |
| json.dump(structured_data, f, ensure_ascii=False, indent=2) |
| print(f"Raw data written to {args.output}") |
| else: |
| print(json.dumps(structured_data, ensure_ascii=False, indent=2)) |
| return |
| |
    # Initialize the OpenAI client
    print("Initializing OpenAI client for summarization...")
| client, model = init_openai_client() |
| |
    # Generate the summary with OpenAI
    print("Generating summary with OpenAI...")
| summary = summarize_with_openai(structured_data, client, model) |
| |
| # Output the result |
| if args.output: |
| with open(args.output, "w", encoding="utf-8") as f: |
| f.write(summary) |
| print(f"Summary written to {args.output}") |
| else: |
| print(summary) |
| |
| |
| if __name__ == "__main__": |
| main() |