project-activity/project_activity.py - comdev - Git at Google

 #!/usr/bin/env -S uv run --script
 # /// script
 # dependencies = [
 #   "requests",
 # ]
 # ///

 import argparse
 import os
 import sys
 import subprocess
 import mailbox
 import email
 import requests
 from datetime import datetime, timedelta
 from collections import defaultdict
 from pathlib import Path

 # Root of the on-disk cache, located next to this script; the functions below
 # store mbox archives under "mbox/" and git clones under "REPOSITORIES/".
 DATA_DIR = Path(__file__).parent / "DATA"
 # Local parts of the list addresses (<name>@<project>.apache.org) that are
 # probed for every project.
 COMMON_LISTS = ["dev", "user", "users", "commits", "issues", "reviews"]


 def check_list_exists(project, list_name):
     """Check whether an Apache mailing list has any recent archive.

     Probes the lists.apache.org mbox endpoint for the current calendar month
     and then for each of the 11 months before it, stopping at the first month
     that returns a non-trivial archive.

     Args:
         project: ASF project name; used as the ``<project>.apache.org`` domain.
         list_name: Local part of the list address, e.g. ``"dev"``.

     Returns:
         True if any probed month answered HTTP 200 with more than 100 bytes
         of content, otherwise False.
     """
     domain = f"{project}.apache.org"
     today = datetime.now()
     # Count months as one running index so that stepping back is exact and
     # crosses year boundaries correctly. Stepping by 30-day timedeltas can
     # hit one month twice and skip another (e.g. starting on 31 March).
     current = today.year * 12 + (today.month - 1)
     for offset in range(12):
         year, month0 = divmod(current - offset, 12)
         url = (
             "https://lists.apache.org/api/mbox.lua"
             f"?list={list_name}@{domain}&date={year:04d}-{month0 + 1:02d}"
         )
         try:
             r = requests.get(url, timeout=10)
         except requests.RequestException:
             # Best effort: a failed probe just moves on to the next month.
             continue
         # Very small bodies are treated as "no archive" rather than real mail.
         if r.status_code == 200 and len(r.content) > 100:
             return True
     return False


 def fetch_mbox(project, list_name, year_month):
     """Fetch the mbox archive for one list and month, using the on-disk cache.

     A cached file for a past month is reused without any request when its
     modification time is later than the end of that month. The current month
     is always refetched. If a refetch fails but an older cached copy exists,
     that copy is returned instead of discarding usable data.

     Args:
         project: ASF project name.
         list_name: Local part of the list address.
         year_month: Month to fetch, formatted ``YYYY-MM``.

     Returns:
         Path of the mbox file under ``DATA_DIR/mbox/<project>/<list_name>/``,
         or None when nothing could be fetched and no cached copy exists.
     """
     domain = f"{project}.apache.org"
     dest_dir = DATA_DIR / "mbox" / project / list_name
     dest_dir.mkdir(parents=True, exist_ok=True)
     dest = dest_dir / f"{year_month}.mbox"

     now = datetime.now()
     ym_parts = year_month.split("-")
     target_year, target_month = int(ym_parts[0]), int(ym_parts[1])

     have_cached = dest.exists()
     if have_cached and (target_year, target_month) != (now.year, now.month):
         # A past month is complete once it was downloaded after the month ended.
         if target_month < 12:
             end_of_month = datetime(target_year, target_month + 1, 1)
         else:
             end_of_month = datetime(target_year + 1, 1, 1)
         file_mtime = datetime.fromtimestamp(dest.stat().st_mtime)
         if file_mtime > end_of_month:
             return dest  # Already complete

     url = f"https://lists.apache.org/api/mbox.lua?list={list_name}@{domain}&date={year_month}"
     try:
         r = requests.get(url, timeout=60)
     except requests.RequestException as e:
         print(f"  Warning: Failed to fetch {list_name}/{year_month}: {e}")
     else:
         if r.status_code == 200 and len(r.content) > 100:
             # Write to a sibling file and rename, so an interrupted write can
             # never leave a truncated archive that later looks "complete".
             partial = dest.with_name(dest.name + ".part")
             partial.write_bytes(r.content)
             partial.replace(dest)
             return dest
     return dest if have_cached else None


 def fetch_mailing_lists(project, months):
     """Discover which common lists exist for a project and fetch their archives.

     Every name in ``COMMON_LISTS`` is probed with ``check_list_exists``. For
     each list found, the current calendar month and the ``months`` months
     before it are fetched with ``fetch_mbox``. Progress is printed.

     Args:
         project: ASF project name.
         months: Number of past months to fetch in addition to the current one.

     Returns:
         The list names for which ``check_list_exists`` returned True (possibly
         empty).
     """
     print("Checking mailing lists...")
     active_lists = []
     for ln in COMMON_LISTS:
         if check_list_exists(project, ln):
             print(f"  Found list: {ln}")
             active_lists.append(ln)

     if not active_lists:
         print("  No mailing lists found.")
         return active_lists

     now = datetime.now()
     # Walk back by real calendar months. Subtracting 30-day timedeltas can
     # repeat one month and skip another (e.g. starting on 31 March yields
     # March, March, January).
     current = now.year * 12 + (now.month - 1)
     month_list = []
     for i in range(months + 1):
         year, month0 = divmod(current - i, 12)
         month_list.append(f"{year:04d}-{month0 + 1:02d}")

     for ln in active_lists:
         print(f"  Fetching archives for {ln}...")
         for ym in month_list:
             if fetch_mbox(project, ln, ym):
                 print(f"    {ym}: OK")
             else:
                 print(f"    {ym}: no data")

     return active_lists


 def discover_repos(project):
     """Discover GitHub repos named apache/{project} and apache/{project}-*.

     Looks up the main repository directly, then uses the GitHub repository
     search (first page, up to 100 results) for names starting with
     ``{project}-``. Failures are reported as warnings instead of being
     silently ignored, so an empty result caused by an error (for example
     rate limiting) can be told apart from a project without repositories.

     Args:
         project: ASF project name.

     Returns:
         List of repository names, main repository first when it exists.
     """
     repos = []

     # Check main repo
     try:
         r = requests.get(f"https://api.github.com/repos/apache/{project}", timeout=10)
         if r.status_code == 200:
             repos.append(r.json()["name"])
         elif r.status_code != 404:
             # 404 simply means "no such repo"; anything else is worth showing.
             print(f"  Warning: GitHub returned HTTP {r.status_code} for apache/{project}")
     except (requests.RequestException, ValueError, KeyError) as e:
         print(f"  Warning: Failed to look up apache/{project}: {e}")

     # Search for project-* repos
     prefix = f"{project}-"
     try:
         url = f"https://api.github.com/search/repositories?q=org:apache+{project}-+in:name&per_page=100"
         r = requests.get(url, timeout=10)
         if r.status_code == 200:
             for repo in r.json().get("items", []):
                 name = repo["name"]
                 # The search matches anywhere in the name; keep true prefixes only.
                 if name.startswith(prefix) and name not in repos:
                     repos.append(name)
         else:
             print(f"  Warning: GitHub search returned HTTP {r.status_code}")
     except (requests.RequestException, ValueError, KeyError) as e:
         print(f"  Warning: GitHub search failed: {e}")

     return repos


 def fetch_repos(project, repos):
     """Clone missing repos (blobless, no checkout) and update existing ones.

     Repositories live under ``DATA_DIR/REPOSITORIES/<project>/<repo_name>``.
     Failures are reported as warnings, including git's own error output, and
     do not stop processing of the remaining repositories.

     Args:
         project: ASF project name; selects the local parent directory.
         repos: Iterable of repository names under the ``apache`` GitHub org.
     """
     repo_dir = DATA_DIR / "REPOSITORIES" / project
     repo_dir.mkdir(parents=True, exist_ok=True)

     for repo_name in repos:
         path = repo_dir / repo_name
         git_url = f"https://github.com/apache/{repo_name}.git"
         if path.exists():
             print(f"  Updating {repo_name}...")
             action = "pull"
             # NOTE(review): this pulls into a clone created with --no-checkout;
             # confirm the merge step behaves as intended there, otherwise a
             # plain `git fetch` may be the better update command.
             command = ["git", "pull"]
             workdir = path
         else:
             print(f"  Cloning {repo_name} (metadata only)...")
             action = "clone"
             command = ["git", "clone", "--filter=blob:none", "--no-checkout", git_url, str(path)]
             workdir = None
         try:
             subprocess.run(command, cwd=workdir, capture_output=True, check=True)
         except subprocess.CalledProcessError as e:
             print(f"    Warning: {action} failed for {repo_name}: {e}")
             # Output is captured, so surface git's message; the exception text
             # alone only carries the exit status.
             detail = e.stderr.decode("utf-8", errors="replace").strip() if e.stderr else ""
             if detail:
                 print(f"    {detail}")
         except OSError as e:
             # E.g. the git executable is not installed or not on PATH.
             print(f"    Warning: could not run git for {repo_name}: {e}")

 def _decode_subject(raw_subject):
     """Decode a possibly MIME-encoded Subject header into one markdown-safe line."""
     # Import the submodules explicitly; a bare `import email` does not
     # guarantee that they are loaded.
     import email.errors
     import email.header

     try:
         decoded_parts = email.header.decode_header(raw_subject)
     except email.errors.HeaderParseError:
         decoded_parts = [(str(raw_subject), None)]
     pieces = []
     for part, charset in decoded_parts:
         if isinstance(part, bytes):
             try:
                 pieces.append(part.decode(charset or "utf-8", errors="replace"))
             except LookupError:
                 # Unknown charset label: fall back instead of losing the message.
                 pieces.append(part.decode("utf-8", errors="replace"))
         else:
             pieces.append(part)
     # Collapse folded header whitespace and sanitize for markdown tables.
     return " ".join("".join(pieces).split()).replace("|", "\\|")


 def _strip_subject_prefixes(subject):
     """Remove leading Re:/Fwd: markers and [tag] prefixes to form a thread key."""
     s = subject
     while True:
         s = s.lstrip()
         if s.lower().startswith(("re:", "fwd:")):
             s = s[s.index(":") + 1:]
         elif s.startswith("[") and "]" in s:
             s = s[s.index("]") + 1:]
         else:
             return s.strip()


 def analyze_mbox_threads(project, list_name, months, top_n=5):
     """Count messages per normalized subject in the cached mbox files.

     Reads ``DATA_DIR/mbox/<project>/<list_name>/<YYYY-MM>.mbox`` for the
     current calendar month and the ``months`` months before it; missing files
     are skipped. Messages are grouped by subject after decoding and after
     stripping Re:/Fwd: markers and ``[tag]`` prefixes.

     Args:
         project: ASF project name.
         list_name: Local part of the list address.
         months: Number of past months to include besides the current one.
         top_n: How many of the busiest subjects to return (default 5).

     Returns:
         A tuple ``(top, total)``: ``top`` is a list of
         ``(subject, message_count, message_id)`` tuples sorted by descending
         count, where ``message_id`` is the first Message-ID seen for that
         subject without angle brackets (``""`` if none); ``total`` is the
         number of messages counted.
     """
     now = datetime.now()
     threads = defaultdict(int)  # normalized subject -> message count
     thread_msgid = {}  # normalized subject -> first Message-ID seen

     # Walk back by real calendar months; 30-day steps can repeat one month
     # and skip another.
     current = now.year * 12 + (now.month - 1)
     for i in range(months + 1):
         year, month0 = divmod(current - i, 12)
         path = DATA_DIR / "mbox" / project / list_name / f"{year:04d}-{month0 + 1:02d}.mbox"
         if not path.exists():
             continue
         try:
             mbox = mailbox.mbox(str(path))
         except Exception:
             continue
         try:
             for msg in mbox:
                 key = _strip_subject_prefixes(_decode_subject(msg.get("Subject", "(no subject)")))
                 threads[key] += 1
                 if key not in thread_msgid:
                     mid = str(msg.get("Message-ID", "")).strip().strip("<>")
                     if mid:
                         thread_msgid[key] = mid
         except Exception:
             # Best effort: an unreadable archive keeps what was counted so far.
             pass
         finally:
             mbox.close()  # release the file handle held by the mailbox

     # Sort by count, keep the busiest subjects
     ranked = sorted(threads.items(), key=lambda item: -item[1])[:top_n]
     top = [(subj, count, thread_msgid.get(subj, "")) for subj, count in ranked]
     total = sum(threads.values())
     return top, total


 def analyze_repo_commits(project, repo_name, months):
     """Return the number of commits reachable from HEAD in the recent window.

     The window starts ``30 * months`` days before now. The count is 0 when
     the local clone is missing, when git exits with an error, or when its
     output is not an integer.
     """
     checkout = DATA_DIR.joinpath("REPOSITORIES", project, repo_name)
     if checkout.exists():
         cutoff = datetime.now() - timedelta(days=30 * months)
         command = [
             "git",
             "rev-list",
             "--count",
             "--since=" + cutoff.strftime("%Y-%m-%d"),
             "HEAD",
         ]
         try:
             completed = subprocess.run(
                 command, cwd=checkout, capture_output=True, text=True, check=True
             )
             commit_count = int(completed.stdout.strip())
         except (subprocess.CalledProcessError, ValueError):
             commit_count = 0
         return commit_count
     return 0


 def report(project, active_lists, repos, months):
     """Generate a markdown activity report, write it to disk and print it.

     The report is written to ``REPORTS/<project>/<YYYY-MM-DD>.md`` next to
     this script, always encoded as UTF-8. It has one table of the busiest
     threads per mailing list with activity and one table of commit counts for
     repositories with at least one commit.

     Args:
         project: ASF project name.
         active_lists: List names to analyze with ``analyze_mbox_threads``.
         repos: Repository names to analyze with ``analyze_repo_commits``.
         months: Reporting window passed to both analysis functions.
     """
     today = datetime.now().strftime("%Y-%m-%d")
     report_dir = Path(__file__).parent / "REPORTS" / project
     report_dir.mkdir(parents=True, exist_ok=True)
     report_path = report_dir / f"{today}.md"

     lines = [
         f"# Apache {project} — Activity Report",
         "",
         f"Generated: {today}  ",  # two trailing spaces: markdown line break
         f"Period: {months} months ending {today}",
         "",
     ]

     # Mailing lists
     list_sections = []
     for ln in active_lists:
         top_threads, total = analyze_mbox_threads(project, ln, months)
         if total == 0:
             continue
         list_id = f"{ln}.{project}.apache.org"
         section = [
             f"### {ln}@ ({total} messages)",
             "",
             "| Messages | Thread | Link |",
             "|-------:|--------|------|",
         ]
         for subject, count, msgid in top_threads:
             if msgid:
                 link = f"[thread](https://lists.apache.org/thread/<{msgid}>?<{list_id}>)"
             else:
                 link = ""
             section.append(f"| {count} | {subject} | {link} |")
         section.append("")
         list_sections.append("\n".join(section))

     lines.append("## Most active mailing list threads")
     lines.append("")
     if list_sections:
         lines.append("\n".join(list_sections))
     else:
         lines.append("No mailing list activity found.")
         lines.append("")

     # Repos
     repo_rows = []
     for repo_name in repos:
         count = analyze_repo_commits(project, repo_name, months)
         if count == 0:
             continue
         repo_rows.append(f"| {repo_name} | {count} |")

     lines.append("## Repositories")
     lines.append("")
     if repo_rows:
         lines.append("| Repository | Commits |")
         lines.append("|------------|--------:|")
         lines.extend(repo_rows)
     else:
         lines.append("No repository activity found.")
     lines.append("")

     content = "\n".join(lines)
     # Subjects may contain any Unicode text and the title contains an em dash,
     # so do not depend on the platform's default encoding.
     report_path.write_text(content, encoding="utf-8")
     print(f"\nReport written to {report_path}")
     print(content)


 def main():
     """Command-line entry point: fetch list archives and repos, then report."""
     cli = argparse.ArgumentParser(description="Apache project activity report")
     cli.add_argument("-p", "--project", required=True, help="ASF project name")
     cli.add_argument("-m", "--months", type=int, default=3, help="Number of months (default: 3)")
     options = cli.parse_args()

     # Project names are handled in lower case throughout.
     project, months = options.project.lower(), options.months

     print(f"Project: {project}")
     print(f"Months: {months}\n")

     # Step 1: mailing list archives
     active_lists = fetch_mailing_lists(project, months)

     # Step 2: git repositories
     print("\nDiscovering GitHub repositories...")
     repos = discover_repos(project)
     if not repos:
         print("  No repositories found.")
     else:
         print(f"  Found: {', '.join(repos)}")
         fetch_repos(project, repos)

     # Step 3: the report itself
     report(project, active_lists, repos, months)


 # Run the command-line interface only when this file is executed as a script.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env -S uv run --script
	# /// script
	# dependencies = [
	# "requests",
	# ]
	# ///

	import argparse
	import os
	import sys
	import subprocess
	import mailbox
	import email
	import requests
	from datetime import datetime, timedelta
	from collections import defaultdict
	from pathlib import Path

	# Root of the on-disk cache, located next to this script; the functions below
	# store mbox archives under "mbox/" and git clones under "REPOSITORIES/".
	DATA_DIR = Path(__file__).parent / "DATA"
	# Local parts of the list addresses (<name>@<project>.apache.org) that are
	# probed for every project.
	COMMON_LISTS = ["dev", "user", "users", "commits", "issues", "reviews"]


	def check_list_exists(project, list_name):
	    """Check whether an Apache mailing list has any recent archive.

	    Probes the lists.apache.org mbox endpoint for the current calendar month
	    and then for each of the 11 months before it, stopping at the first month
	    that returns a non-trivial archive.

	    Args:
	        project: ASF project name; used as the ``<project>.apache.org`` domain.
	        list_name: Local part of the list address, e.g. ``"dev"``.

	    Returns:
	        True if any probed month answered HTTP 200 with more than 100 bytes
	        of content, otherwise False.
	    """
	    domain = f"{project}.apache.org"
	    today = datetime.now()
	    # Count months as one running index so that stepping back is exact and
	    # crosses year boundaries correctly. Stepping by 30-day timedeltas can
	    # hit one month twice and skip another (e.g. starting on 31 March).
	    current = today.year * 12 + (today.month - 1)
	    for offset in range(12):
	        year, month0 = divmod(current - offset, 12)
	        url = (
	            "https://lists.apache.org/api/mbox.lua"
	            f"?list={list_name}@{domain}&date={year:04d}-{month0 + 1:02d}"
	        )
	        try:
	            r = requests.get(url, timeout=10)
	        except requests.RequestException:
	            # Best effort: a failed probe just moves on to the next month.
	            continue
	        # Very small bodies are treated as "no archive" rather than real mail.
	        if r.status_code == 200 and len(r.content) > 100:
	            return True
	    return False


	def fetch_mbox(project, list_name, year_month):
	    """Fetch the mbox archive for one list and month, using the on-disk cache.

	    A cached file for a past month is reused without any request when its
	    modification time is later than the end of that month. The current month
	    is always refetched. If a refetch fails but an older cached copy exists,
	    that copy is returned instead of discarding usable data.

	    Args:
	        project: ASF project name.
	        list_name: Local part of the list address.
	        year_month: Month to fetch, formatted ``YYYY-MM``.

	    Returns:
	        Path of the mbox file under ``DATA_DIR/mbox/<project>/<list_name>/``,
	        or None when nothing could be fetched and no cached copy exists.
	    """
	    domain = f"{project}.apache.org"
	    dest_dir = DATA_DIR / "mbox" / project / list_name
	    dest_dir.mkdir(parents=True, exist_ok=True)
	    dest = dest_dir / f"{year_month}.mbox"

	    now = datetime.now()
	    ym_parts = year_month.split("-")
	    target_year, target_month = int(ym_parts[0]), int(ym_parts[1])

	    have_cached = dest.exists()
	    if have_cached and (target_year, target_month) != (now.year, now.month):
	        # A past month is complete once it was downloaded after the month ended.
	        if target_month < 12:
	            end_of_month = datetime(target_year, target_month + 1, 1)
	        else:
	            end_of_month = datetime(target_year + 1, 1, 1)
	        file_mtime = datetime.fromtimestamp(dest.stat().st_mtime)
	        if file_mtime > end_of_month:
	            return dest  # Already complete

	    url = f"https://lists.apache.org/api/mbox.lua?list={list_name}@{domain}&date={year_month}"
	    try:
	        r = requests.get(url, timeout=60)
	    except requests.RequestException as e:
	        print(f"  Warning: Failed to fetch {list_name}/{year_month}: {e}")
	    else:
	        if r.status_code == 200 and len(r.content) > 100:
	            # Write to a sibling file and rename, so an interrupted write can
	            # never leave a truncated archive that later looks "complete".
	            partial = dest.with_name(dest.name + ".part")
	            partial.write_bytes(r.content)
	            partial.replace(dest)
	            return dest
	    return dest if have_cached else None


	def fetch_mailing_lists(project, months):
	    """Discover which common lists exist for a project and fetch their archives.

	    Every name in ``COMMON_LISTS`` is probed with ``check_list_exists``. For
	    each list found, the current calendar month and the ``months`` months
	    before it are fetched with ``fetch_mbox``. Progress is printed.

	    Args:
	        project: ASF project name.
	        months: Number of past months to fetch in addition to the current one.

	    Returns:
	        The list names for which ``check_list_exists`` returned True (possibly
	        empty).
	    """
	    print("Checking mailing lists...")
	    active_lists = []
	    for ln in COMMON_LISTS:
	        if check_list_exists(project, ln):
	            print(f"  Found list: {ln}")
	            active_lists.append(ln)

	    if not active_lists:
	        print("  No mailing lists found.")
	        return active_lists

	    now = datetime.now()
	    # Walk back by real calendar months. Subtracting 30-day timedeltas can
	    # repeat one month and skip another (e.g. starting on 31 March yields
	    # March, March, January).
	    current = now.year * 12 + (now.month - 1)
	    month_list = []
	    for i in range(months + 1):
	        year, month0 = divmod(current - i, 12)
	        month_list.append(f"{year:04d}-{month0 + 1:02d}")

	    for ln in active_lists:
	        print(f"  Fetching archives for {ln}...")
	        for ym in month_list:
	            if fetch_mbox(project, ln, ym):
	                print(f"    {ym}: OK")
	            else:
	                print(f"    {ym}: no data")

	    return active_lists


	def discover_repos(project):
	    """Discover GitHub repos named apache/{project} and apache/{project}-*.

	    Looks up the main repository directly, then uses the GitHub repository
	    search (first page, up to 100 results) for names starting with
	    ``{project}-``. Failures are reported as warnings instead of being
	    silently ignored, so an empty result caused by an error (for example
	    rate limiting) can be told apart from a project without repositories.

	    Args:
	        project: ASF project name.

	    Returns:
	        List of repository names, main repository first when it exists.
	    """
	    repos = []

	    # Check main repo
	    try:
	        r = requests.get(f"https://api.github.com/repos/apache/{project}", timeout=10)
	        if r.status_code == 200:
	            repos.append(r.json()["name"])
	        elif r.status_code != 404:
	            # 404 simply means "no such repo"; anything else is worth showing.
	            print(f"  Warning: GitHub returned HTTP {r.status_code} for apache/{project}")
	    except (requests.RequestException, ValueError, KeyError) as e:
	        print(f"  Warning: Failed to look up apache/{project}: {e}")

	    # Search for project-* repos
	    prefix = f"{project}-"
	    try:
	        url = f"https://api.github.com/search/repositories?q=org:apache+{project}-+in:name&per_page=100"
	        r = requests.get(url, timeout=10)
	        if r.status_code == 200:
	            for repo in r.json().get("items", []):
	                name = repo["name"]
	                # The search matches anywhere in the name; keep true prefixes only.
	                if name.startswith(prefix) and name not in repos:
	                    repos.append(name)
	        else:
	            print(f"  Warning: GitHub search returned HTTP {r.status_code}")
	    except (requests.RequestException, ValueError, KeyError) as e:
	        print(f"  Warning: GitHub search failed: {e}")

	    return repos


	def fetch_repos(project, repos):
	    """Clone missing repos (blobless, no checkout) and update existing ones.

	    Repositories live under ``DATA_DIR/REPOSITORIES/<project>/<repo_name>``.
	    Failures are reported as warnings, including git's own error output, and
	    do not stop processing of the remaining repositories.

	    Args:
	        project: ASF project name; selects the local parent directory.
	        repos: Iterable of repository names under the ``apache`` GitHub org.
	    """
	    repo_dir = DATA_DIR / "REPOSITORIES" / project
	    repo_dir.mkdir(parents=True, exist_ok=True)

	    for repo_name in repos:
	        path = repo_dir / repo_name
	        git_url = f"https://github.com/apache/{repo_name}.git"
	        if path.exists():
	            print(f"  Updating {repo_name}...")
	            action = "pull"
	            # NOTE(review): this pulls into a clone created with --no-checkout;
	            # confirm the merge step behaves as intended there, otherwise a
	            # plain `git fetch` may be the better update command.
	            command = ["git", "pull"]
	            workdir = path
	        else:
	            print(f"  Cloning {repo_name} (metadata only)...")
	            action = "clone"
	            command = ["git", "clone", "--filter=blob:none", "--no-checkout", git_url, str(path)]
	            workdir = None
	        try:
	            subprocess.run(command, cwd=workdir, capture_output=True, check=True)
	        except subprocess.CalledProcessError as e:
	            print(f"    Warning: {action} failed for {repo_name}: {e}")
	            # Output is captured, so surface git's message; the exception text
	            # alone only carries the exit status.
	            detail = e.stderr.decode("utf-8", errors="replace").strip() if e.stderr else ""
	            if detail:
	                print(f"    {detail}")
	        except OSError as e:
	            # E.g. the git executable is not installed or not on PATH.
	            print(f"    Warning: could not run git for {repo_name}: {e}")


	def _decode_subject(raw_subject):
	    """Decode a possibly MIME-encoded Subject header into one markdown-safe line."""
	    # Import the submodules explicitly; a bare `import email` does not
	    # guarantee that they are loaded.
	    import email.errors
	    import email.header

	    try:
	        decoded_parts = email.header.decode_header(raw_subject)
	    except email.errors.HeaderParseError:
	        decoded_parts = [(str(raw_subject), None)]
	    pieces = []
	    for part, charset in decoded_parts:
	        if isinstance(part, bytes):
	            try:
	                pieces.append(part.decode(charset or "utf-8", errors="replace"))
	            except LookupError:
	                # Unknown charset label: fall back instead of losing the message.
	                pieces.append(part.decode("utf-8", errors="replace"))
	        else:
	            pieces.append(part)
	    # Collapse folded header whitespace and sanitize for markdown tables.
	    return " ".join("".join(pieces).split()).replace("|", "\\|")


	def _strip_subject_prefixes(subject):
	    """Remove leading Re:/Fwd: markers and [tag] prefixes to form a thread key."""
	    s = subject
	    while True:
	        s = s.lstrip()
	        if s.lower().startswith(("re:", "fwd:")):
	            s = s[s.index(":") + 1:]
	        elif s.startswith("[") and "]" in s:
	            s = s[s.index("]") + 1:]
	        else:
	            return s.strip()


	def analyze_mbox_threads(project, list_name, months, top_n=5):
	    """Count messages per normalized subject in the cached mbox files.

	    Reads ``DATA_DIR/mbox/<project>/<list_name>/<YYYY-MM>.mbox`` for the
	    current calendar month and the ``months`` months before it; missing files
	    are skipped. Messages are grouped by subject after decoding and after
	    stripping Re:/Fwd: markers and ``[tag]`` prefixes.

	    Args:
	        project: ASF project name.
	        list_name: Local part of the list address.
	        months: Number of past months to include besides the current one.
	        top_n: How many of the busiest subjects to return (default 5).

	    Returns:
	        A tuple ``(top, total)``: ``top`` is a list of
	        ``(subject, message_count, message_id)`` tuples sorted by descending
	        count, where ``message_id`` is the first Message-ID seen for that
	        subject without angle brackets (``""`` if none); ``total`` is the
	        number of messages counted.
	    """
	    now = datetime.now()
	    threads = defaultdict(int)  # normalized subject -> message count
	    thread_msgid = {}  # normalized subject -> first Message-ID seen

	    # Walk back by real calendar months; 30-day steps can repeat one month
	    # and skip another.
	    current = now.year * 12 + (now.month - 1)
	    for i in range(months + 1):
	        year, month0 = divmod(current - i, 12)
	        path = DATA_DIR / "mbox" / project / list_name / f"{year:04d}-{month0 + 1:02d}.mbox"
	        if not path.exists():
	            continue
	        try:
	            mbox = mailbox.mbox(str(path))
	        except Exception:
	            continue
	        try:
	            for msg in mbox:
	                key = _strip_subject_prefixes(_decode_subject(msg.get("Subject", "(no subject)")))
	                threads[key] += 1
	                if key not in thread_msgid:
	                    mid = str(msg.get("Message-ID", "")).strip().strip("<>")
	                    if mid:
	                        thread_msgid[key] = mid
	        except Exception:
	            # Best effort: an unreadable archive keeps what was counted so far.
	            pass
	        finally:
	            mbox.close()  # release the file handle held by the mailbox

	    # Sort by count, keep the busiest subjects
	    ranked = sorted(threads.items(), key=lambda item: -item[1])[:top_n]
	    top = [(subj, count, thread_msgid.get(subj, "")) for subj, count in ranked]
	    total = sum(threads.values())
	    return top, total


	def analyze_repo_commits(project, repo_name, months):
	    """Return the number of commits reachable from HEAD in the recent window.

	    The window starts ``30 * months`` days before now. The count is 0 when
	    the local clone is missing, when git exits with an error, or when its
	    output is not an integer.

	    Args:
	        project: ASF project name; selects the local parent directory.
	        repo_name: Name of the clone under ``DATA_DIR/REPOSITORIES/<project>``.
	        months: Length of the window in 30-day months.

	    Returns:
	        The commit count as an int.
	    """
	    path = DATA_DIR / "REPOSITORIES" / project / repo_name
	    if not path.exists():
	        return 0
	    since = (datetime.now() - timedelta(days=30 * months)).strftime("%Y-%m-%d")
	    try:
	        result = subprocess.run(
	            ["git", "rev-list", "--count", f"--since={since}", "HEAD"],
	            cwd=path, capture_output=True, text=True, check=True,
	        )
	        return int(result.stdout.strip())
	    except (subprocess.CalledProcessError, ValueError):
	        return 0


	def report(project, active_lists, repos, months):
	    """Generate a markdown activity report, write it to disk and print it.

	    The report is written to ``REPORTS/<project>/<YYYY-MM-DD>.md`` next to
	    this script, always encoded as UTF-8. It has one table of the busiest
	    threads per mailing list with activity and one table of commit counts for
	    repositories with at least one commit.

	    Args:
	        project: ASF project name.
	        active_lists: List names to analyze with ``analyze_mbox_threads``.
	        repos: Repository names to analyze with ``analyze_repo_commits``.
	        months: Reporting window passed to both analysis functions.
	    """
	    today = datetime.now().strftime("%Y-%m-%d")
	    report_dir = Path(__file__).parent / "REPORTS" / project
	    report_dir.mkdir(parents=True, exist_ok=True)
	    report_path = report_dir / f"{today}.md"

	    lines = [
	        f"# Apache {project} — Activity Report",
	        "",
	        f"Generated: {today}  ",  # two trailing spaces: markdown line break
	        f"Period: {months} months ending {today}",
	        "",
	    ]

	    # Mailing lists
	    list_sections = []
	    for ln in active_lists:
	        top_threads, total = analyze_mbox_threads(project, ln, months)
	        if total == 0:
	            continue
	        list_id = f"{ln}.{project}.apache.org"
	        # Plain pipes here: table delimiters must not be backslash-escaped.
	        section = [
	            f"### {ln}@ ({total} messages)",
	            "",
	            "| Messages | Thread | Link |",
	            "|-------:|--------|------|",
	        ]
	        for subject, count, msgid in top_threads:
	            if msgid:
	                link = f"[thread](https://lists.apache.org/thread/<{msgid}>?<{list_id}>)"
	            else:
	                link = ""
	            section.append(f"| {count} | {subject} | {link} |")
	        section.append("")
	        list_sections.append("\n".join(section))

	    lines.append("## Most active mailing list threads")
	    lines.append("")
	    if list_sections:
	        lines.append("\n".join(list_sections))
	    else:
	        lines.append("No mailing list activity found.")
	        lines.append("")

	    # Repos
	    repo_rows = []
	    for repo_name in repos:
	        count = analyze_repo_commits(project, repo_name, months)
	        if count == 0:
	            continue
	        repo_rows.append(f"| {repo_name} | {count} |")

	    lines.append("## Repositories")
	    lines.append("")
	    if repo_rows:
	        lines.append("| Repository | Commits |")
	        lines.append("|------------|--------:|")
	        lines.extend(repo_rows)
	    else:
	        lines.append("No repository activity found.")
	    lines.append("")

	    content = "\n".join(lines)
	    # Subjects may contain any Unicode text and the title contains an em dash,
	    # so do not depend on the platform's default encoding.
	    report_path.write_text(content, encoding="utf-8")
	    print(f"\nReport written to {report_path}")
	    print(content)


	def main():
	    """Command-line entry point: fetch list archives and repos, then report.

	    Parses ``-p/--project`` (required) and ``-m/--months`` (default 3), then
	    runs the mailing-list fetch, repository discovery/fetch and report steps
	    in that order.
	    """
	    parser = argparse.ArgumentParser(description="Apache project activity report")
	    parser.add_argument("-p", "--project", required=True, help="ASF project name")
	    parser.add_argument("-m", "--months", type=int, default=3, help="Number of months (default: 3)")
	    args = parser.parse_args()

	    # Project names are handled in lower case throughout.
	    project = args.project.lower()
	    months = args.months

	    print(f"Project: {project}")
	    print(f"Months: {months}\n")

	    # 1. Mailing lists
	    active_lists = fetch_mailing_lists(project, months)

	    # 2. Git repos
	    print("\nDiscovering GitHub repositories...")
	    repos = discover_repos(project)
	    if repos:
	        print(f"  Found: {', '.join(repos)}")
	        fetch_repos(project, repos)
	    else:
	        print("  No repositories found.")

	    # 3. Report
	    report(project, active_lists, repos, months)


	# Run the command-line interface only when this file is executed as a script.
	if __name__ == "__main__":
	    main()