| #!/usr/bin/env -S uv run --script |
| # /// script |
| # dependencies = [ |
| # "requests", |
| # ] |
| # /// |
| |
| import argparse |
| import os |
| import sys |
| import subprocess |
| import mailbox |
| import email |
| import requests |
| from datetime import datetime, timedelta |
| from collections import defaultdict |
| from pathlib import Path |
| |
| DATA_DIR = Path(__file__).parent / "DATA" |
| COMMON_LISTS = ["dev", "user", "users", "commits", "issues", "reviews"] |
| |
| |
| def check_list_exists(project, list_name): |
| """Check if an Apache mailing list exists.""" |
| domain = f"{project}.apache.org" |
| date = datetime.now().strftime("%Y-%m") |
| url = f"https://lists.apache.org/api/mbox.lua?list={list_name}@{domain}&date={date}" |
| try: |
| r = requests.get(url, timeout=10) |
| if r.status_code == 200 and len(r.content) > 100: |
| return True |
| except requests.RequestException: |
| pass |
| # Try a few months back |
| for i in range(1, 12): |
| d = datetime.now() - timedelta(days=30 * i) |
| url = f"https://lists.apache.org/api/mbox.lua?list={list_name}@{domain}&date={d.strftime('%Y-%m')}" |
| try: |
| r = requests.get(url, timeout=10) |
| if r.status_code == 200 and len(r.content) > 100: |
| return True |
| except requests.RequestException: |
| continue |
| return False |
| |
| |
| def fetch_mbox(project, list_name, year_month): |
| """Fetch an mbox file for a given list and month. Returns path or None.""" |
| domain = f"{project}.apache.org" |
| dest_dir = DATA_DIR / "mbox" / project / list_name |
| dest_dir.mkdir(parents=True, exist_ok=True) |
| dest = dest_dir / f"{year_month}.mbox" |
| |
| now = datetime.now() |
| ym_parts = year_month.split("-") |
| target_year, target_month = int(ym_parts[0]), int(ym_parts[1]) |
| |
| # Determine if we should skip fetching |
| if dest.exists(): |
| is_current_month = (target_year == now.year and target_month == now.month) |
| if is_current_month: |
| pass # Always refetch current month |
| else: |
| # For past months, check if file was fetched after end of that month |
| end_of_month = datetime(target_year, target_month + 1, 1) if target_month < 12 else datetime(target_year + 1, 1, 1) |
| file_mtime = datetime.fromtimestamp(dest.stat().st_mtime) |
| if file_mtime > end_of_month: |
| return dest # Already complete |
| |
| url = f"https://lists.apache.org/api/mbox.lua?list={list_name}@{domain}&date={year_month}" |
| try: |
| r = requests.get(url, timeout=60) |
| if r.status_code == 200 and len(r.content) > 100: |
| dest.write_bytes(r.content) |
| return dest |
| except requests.RequestException as e: |
| print(f" Warning: Failed to fetch {list_name}/{year_month}: {e}") |
| return None |
| |
| |
| def fetch_mailing_lists(project, months): |
| """Discover lists and fetch mbox archives.""" |
| print("Checking mailing lists...") |
| active_lists = [] |
| for ln in COMMON_LISTS: |
| if check_list_exists(project, ln): |
| print(f" Found list: {ln}") |
| active_lists.append(ln) |
| |
| if not active_lists: |
| print(" No mailing lists found.") |
| return active_lists |
| |
| now = datetime.now() |
| month_list = [] |
| for i in range(months + 1): |
| d = now - timedelta(days=30 * i) |
| month_list.append(d.strftime("%Y-%m")) |
| |
| for ln in active_lists: |
| print(f" Fetching archives for {ln}...") |
| for ym in month_list: |
| result = fetch_mbox(project, ln, ym) |
| if result: |
| print(f" {ym}: OK") |
| else: |
| print(f" {ym}: no data") |
| |
| return active_lists |
| |
| |
| def discover_repos(project): |
| """Discover GitHub repos for apache/{project} and apache/{project}-*.""" |
| repos = [] |
| # Check main repo |
| try: |
| r = requests.get(f"https://api.github.com/repos/apache/{project}", timeout=10) |
| if r.status_code == 200: |
| repos.append(r.json()["name"]) |
| except requests.RequestException: |
| pass |
| |
| # Search for project-* repos |
| try: |
| url = f"https://api.github.com/search/repositories?q=org:apache+{project}-+in:name&per_page=100" |
| r = requests.get(url, timeout=10) |
| if r.status_code == 200: |
| for repo in r.json().get("items", []): |
| name = repo["name"] |
| if name.startswith(f"{project}-") and name not in repos: |
| repos.append(name) |
| except requests.RequestException: |
| pass |
| |
| return repos |
| |
| |
| def fetch_repos(project, repos): |
| """Clone or update repos with metadata only.""" |
| repo_dir = DATA_DIR / "REPOSITORIES" / project |
| repo_dir.mkdir(parents=True, exist_ok=True) |
| |
| for repo_name in repos: |
| path = repo_dir / repo_name |
| git_url = f"https://github.com/apache/{repo_name}.git" |
| if path.exists(): |
| print(f" Updating {repo_name}...") |
| try: |
| subprocess.run(["git", "pull"], cwd=path, capture_output=True, check=True) |
| except subprocess.CalledProcessError as e: |
| print(f" Warning: pull failed for {repo_name}: {e}") |
| else: |
| print(f" Cloning {repo_name} (metadata only)...") |
| try: |
| subprocess.run( |
| ["git", "clone", "--filter=blob:none", "--no-checkout", git_url, str(path)], |
| capture_output=True, check=True, |
| ) |
| except subprocess.CalledProcessError as e: |
| print(f" Warning: clone failed for {repo_name}: {e}") |
| |
| |
| def analyze_mbox_threads(project, list_name, months): |
| """Analyze mbox files and return top threads with first message ID.""" |
| now = datetime.now() |
| threads = defaultdict(int) # normalized subject -> message count |
| thread_msgid = {} # normalized subject -> first Message-ID seen |
| |
| for i in range(months + 1): |
| d = now - timedelta(days=30 * i) |
| ym = d.strftime("%Y-%m") |
| path = DATA_DIR / "mbox" / project / list_name / f"{ym}.mbox" |
| if not path.exists(): |
| continue |
| try: |
| mbox = mailbox.mbox(str(path)) |
| for msg in mbox: |
| raw_subject = msg.get("Subject", "(no subject)") |
| # Decode MIME-encoded headers |
| decoded_parts = email.header.decode_header(raw_subject) |
| subject = "" |
| for part, charset in decoded_parts: |
| if isinstance(part, bytes): |
| subject += part.decode(charset or "utf-8", errors="replace") |
| else: |
| subject += part |
| # Collapse folded header whitespace and sanitize for markdown tables |
| subject = " ".join(subject.split()) |
| subject = subject.replace("|", "\\|") |
| # Normalize: strip Re:/Fwd: prefixes |
| s = subject |
| while True: |
| lower = s.lower().lstrip() |
| if lower.startswith("re:") or lower.startswith("fwd:"): |
| s = s.lstrip()[s.lstrip().index(":") + 1:].lstrip() |
| elif lower.startswith("[") and "]" in lower: |
| s = s[s.index("]") + 1:].lstrip() |
| else: |
| break |
| key = s.strip() |
| threads[key] += 1 |
| if key not in thread_msgid: |
| mid = msg.get("Message-ID", "") |
| if mid: |
| thread_msgid[key] = mid.strip("<>") |
| except Exception: |
| continue |
| |
| # Sort by count, top 5 |
| top = [(subj, count, thread_msgid.get(subj, "")) for subj, count in |
| sorted(threads.items(), key=lambda x: -x[1])[:5]] |
| total = sum(threads.values()) |
| return top, total |
| |
| |
| def analyze_repo_commits(project, repo_name, months): |
| """Count commits in the past m months.""" |
| path = DATA_DIR / "REPOSITORIES" / project / repo_name |
| if not path.exists(): |
| return 0 |
| since = (datetime.now() - timedelta(days=30 * months)).strftime("%Y-%m-%d") |
| try: |
| result = subprocess.run( |
| ["git", "rev-list", "--count", f"--since={since}", "HEAD"], |
| cwd=path, capture_output=True, text=True, check=True, |
| ) |
| return int(result.stdout.strip()) |
| except (subprocess.CalledProcessError, ValueError): |
| return 0 |
| |
| |
| def report(project, active_lists, repos, months): |
| """Generate a markdown activity report.""" |
| today = datetime.now().strftime("%Y-%m-%d") |
| report_dir = Path(__file__).parent / "REPORTS" / project |
| report_dir.mkdir(parents=True, exist_ok=True) |
| report_path = report_dir / f"{today}.md" |
| |
| lines = [] |
| lines.append(f"# Apache {project} — Activity Report") |
| lines.append(f"") |
| lines.append(f"Generated: {today} ") |
| lines.append(f"Period: {months} months ending {today}") |
| lines.append("") |
| |
| # Mailing lists |
| any_list_activity = False |
| list_sections = [] |
| for ln in active_lists: |
| top_threads, total = analyze_mbox_threads(project, ln, months) |
| if total == 0: |
| continue |
| any_list_activity = True |
| section = [] |
| section.append(f"### {ln}@ ({total} messages)") |
| section.append("") |
| list_id = f"{ln}.{project}.apache.org" |
| section.append("| Messages | Thread | Link |") |
| section.append("|-------:|--------|------|") |
| for subject, count, msgid in top_threads: |
| if msgid: |
| link = f"[thread](https://lists.apache.org/thread/<{msgid}>?<{list_id}>)" |
| else: |
| link = "" |
| section.append(f"| {count} | {subject} | {link} |") |
| section.append("") |
| list_sections.append("\n".join(section)) |
| |
| if any_list_activity: |
| lines.append("## Most active mailing list threads") |
| lines.append("") |
| lines.append("\n".join(list_sections)) |
| else: |
| lines.append("## Most active mailing list threads") |
| lines.append("") |
| lines.append("No mailing list activity found.") |
| lines.append("") |
| |
| # Repos |
| any_repo_activity = False |
| repo_rows = [] |
| for repo_name in repos: |
| count = analyze_repo_commits(project, repo_name, months) |
| if count == 0: |
| continue |
| any_repo_activity = True |
| repo_rows.append(f"| {repo_name} | {count} |") |
| |
| lines.append("## Repositories") |
| lines.append("") |
| if any_repo_activity: |
| lines.append("| Repository | Commits |") |
| lines.append("|------------|--------:|") |
| lines.extend(repo_rows) |
| else: |
| lines.append("No repository activity found.") |
| lines.append("") |
| |
| content = "\n".join(lines) |
| report_path.write_text(content) |
| print(f"\nReport written to {report_path}") |
| print(content) |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser(description="Apache project activity report") |
| parser.add_argument("-p", "--project", required=True, help="ASF project name") |
| parser.add_argument("-m", "--months", type=int, default=3, help="Number of months (default: 3)") |
| args = parser.parse_args() |
| |
| project = args.project.lower() |
| months = args.months |
| |
| print(f"Project: {project}") |
| print(f"Months: {months}\n") |
| |
| # 1. Mailing lists |
| active_lists = fetch_mailing_lists(project, months) |
| |
| # 2. Git repos |
| print("\nDiscovering GitHub repositories...") |
| repos = discover_repos(project) |
| if repos: |
| print(f" Found: {', '.join(repos)}") |
| fetch_repos(project, repos) |
| else: |
| print(" No repositories found.") |
| |
| # 3. Report |
| report(project, active_lists, repos, months) |
| |
| |
| if __name__ == "__main__": |
| main() |