#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""This script creates a MD file with a list of discussion documents
from dev@beam.apache.org.
Usage:
1. Download email archives: The script requires local copies of
the dev@beam.apache.org mbox files for the desired year.
You can download these manually or modify the script to
automate the download process.
2. Run the script:
```bash
python generate_doc_md.py <year>
```
3. Output: The script will create a Markdown file named <year>.md containing
a table of discussion documents with their authors,
subjects, and submission dates.
Note:
The script currently extracts links to Google Docs and
Apache short links (s.apache.org). Ensure you have the necessary libraries
installed (e.g., requests, bs4, mailbox).
"""
import datetime
import mailbox
import os
import re
import sys
from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup

LIST_NAME = "dev"
DOMAIN = "beam.apache.org"
OUTPUT_DIR = "generated"
def download_mbox(list_name, domain, year, month):
"""Downloads an mbox file from the Apache mailing list archive."""
# Construct the URL
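    # lists.apache.org serves monthly archives through its mbox.lua endpoint;
    # the 'd' parameter selects the YYYY-MM month to fetch.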
url = f"https://lists.apache.org/api/mbox.lua?list={list_name}&domain={domain}&d={year}-{month:02d}"
try:
response = requests.get(url, stream=True)
response.raise_for_status() # Raise an exception for bad status codes
# Create the directory for the archive if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Generate the output filename
output_filename = f"{OUTPUT_DIR}/{list_name}@{domain}_{year}-{month:02d}.mbox"
with open(output_filename, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"Downloaded {output_filename}")
except requests.exceptions.RequestException as e:
print(f"Error downloading archive: {e}")
def download_mbox_for_one_year(year):
"""Downloads mbox files for each month in a given year."""
for month in range(1, 13):
download_mbox(LIST_NAME, DOMAIN, year, month)
def get_google_doc_title(link):
"""Fetches the title of a Google Doc from its link."""
try:
response = requests.get(link)
response.raise_for_status() # Raise an exception for bad status codes
soup = BeautifulSoup(response.content, "html.parser")
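        # Public Google Docs typically title their pages "<doc name> - Google Docs";
        # that suffix is stripped later before the title is written to the table.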
title = soup.title.string.strip()
return title
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e} {link}")
return None
except Exception as e:
print(f"Error extracting title: {e} {link}")
return None
def extract_name_re(email_string):
"""Extracts the name from an email string using regular expressions."""
email_string = email_string.replace('"', "")
match = re.match(r"^(.+?) via .+ <.+@.+>$", email_string)
if match:
return match.group(1)
else:
match = re.match(r"^(.+?) <.+@.+>$", email_string)
if match:
return match.group(1)
return email_string
def convert_to_timestamp(date_string):
"""Converts a date string to a timestamp object."""
try:
date_format = "%a, %d %b %Y %H:%M:%S %z"
datetime_obj = datetime.datetime.strptime(date_string, date_format)
return datetime_obj.timestamp()
    except (TypeError, ValueError):
        return None
@dataclass
class EmailMessage:
"""A data class representing an email message."""
sender: str
doc_title: str
doc_url: str
body: str
    timestamp: float = None
def extract_google_doc_sheet_link(text):
"""Extracts Google Docs or Sheets link from text."""
pattern = r"https?:\/\/docs\.google\.com\/(document|spreadsheets)\/d\/([a-zA-Z0-9-_]+)\/.*"
match = re.search(pattern, text)
if match:
return match.group(0)
else:
return None
def extract_s_link(text):
"""Extracts Apache short link from text."""
pattern = r"https?://s\.apache\.org/.*"
match = re.search(pattern, text)
if match:
return match.group(0)
else:
return None
def extract_google_doc_id(url):
"""
Extracts the unique ID of a Google Doc or Google Sheet from a given URL.
Args:
url: The URL of the Google Doc or Google Sheet.
Returns:
The unique ID of the Google Doc or Google Sheet, or None if the ID could not be extracted.
"""
pattern = r"/(document|spreadsheets)/d/([a-zA-Z0-9-_]+)"
match = re.search(pattern, url)
if match:
return match.group(2)
else:
return None
def standardize_url_link(url):
    """Rewrites a Google Docs/Sheets URL into its canonical form; other URLs pass through unchanged."""
    doc_id = extract_google_doc_id(url)
    if doc_id:
        if "spreadsheets" in url:
            return f"https://docs.google.com/spreadsheets/d/{doc_id}"
        else:
            return f"https://docs.google.com/document/d/{doc_id}"
    else:
        return url
def add_message(messages: list[EmailMessage], new_message: EmailMessage):
    """Adds a message to the list, keeping only the oldest message for each document URL."""
    url = new_message.doc_url
    for i, message in enumerate(messages):
        if message.doc_url == url:
            if (new_message.timestamp and message.timestamp
                    and new_message.timestamp < message.timestamp):
                messages[i] = new_message
            return
    messages.append(new_message)
def remove_invalid_characters(string_url):
    """Strips trailing punctuation (".", ",", "*", "(", ")") that is not part of the URL."""
    return string_url.rstrip(".,*()")
def find_google_docs_links(mbox_file, doc_messages, doc_urls):
"""Filters email messages from an mbox file that contain Google Docs links."""
if not os.path.isfile(mbox_file):
print(f"Cannot find the file {mbox_file}")
mbox = mailbox.mbox(mbox_file)
for message in mbox:
c = message.get_payload()
# for multipart messages, only use the first part
while isinstance(c, list):
c = c[0].get_payload()
        # assume each message contains only one doc url
doc_url = None
gdoc_url = extract_google_doc_sheet_link(c)
if gdoc_url:
doc_url = gdoc_url.split()[0].split(">")[0]
else:
s_url = extract_s_link(c)
if s_url:
doc_url = s_url.split()[0].split(">")[0]
        if doc_url and doc_url not in doc_urls:
doc_url = remove_invalid_characters(doc_url)
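            # Normalizing to a canonical URL lets duplicate shares of the same doc collapse into a single entry.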
doc_url = standardize_url_link(doc_url)
doc_urls.append(doc_url)
title = get_google_doc_title(doc_url)
            try:
                sender = extract_name_re(str(message["From"]))
            except Exception as e:
                print(f"Could not parse sender {message['From']}: {e}")
                sender = None
            if not sender:
                print(f"No sender found for message from: {message['From']}")
doc_time = convert_to_timestamp(message["Date"])
if title:
title = title.replace("- Google Docs", "").strip()
new_msg = EmailMessage(
doc_title=title,
doc_url=doc_url,
body=c,
sender=sender,
timestamp=doc_time,
)
add_message(doc_messages, new_msg)
return doc_messages
def sort_emails_by_timestamp(emails: list[EmailMessage]) -> list[EmailMessage]:
"""Sorts a list of EmailMessage objects by timestamp from oldest to newest."""
return sorted(emails, key=lambda email: email.timestamp or 0)
def extract_docs_for_one_year(year):
"""Extracts Google Docs links from emails in a given year."""
doc_messages = []
doc_urls = []
for month in range(1, 13):
# Generate the output filename
output_filename = f"{OUTPUT_DIR}/{LIST_NAME}@{DOMAIN}_{year}-{month:02d}.mbox"
find_google_docs_links(output_filename, doc_messages, doc_urls)
return sort_emails_by_timestamp(doc_messages)
def convert_to_md_table(email_messages: list[EmailMessage], year: int):
"""Converts a list of EmailMessage objects to a Markdown file with a table."""
output_file = f"{year}.md"
with open(output_file, "w") as f:
f.write("""<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->\n\n""")
f.write(f"# List Of Documents Submitted To dev@beam.apache.org In {year}\n")
f.write("| No. | Author | Subject | Date (UTC) |\n")
f.write("|---|---|---|---|")
for eid, email in enumerate(email_messages):
if email.timestamp:
                # The header says "Date (UTC)", so convert the timestamp in UTC rather than local time.
                datetime_obj = datetime.datetime.fromtimestamp(
                    email.timestamp, tz=datetime.timezone.utc)
formatted_date = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")
else:
formatted_date = "Unknown"
            # Fall back to the URL when a title could not be fetched; escape "|" so it does not break the table.
            doc_title = (email.doc_title or email.doc_url).replace("|", ":")
row_no = f'{eid+1}'
f.write(
f"\n| {row_no} | {email.sender} | [{doc_title}]({email.doc_url}) | {formatted_date} |"
)
if __name__ == "__main__":
if len(sys.argv) > 1:
year = sys.argv[1]
download_mbox_for_one_year(year)
docs = extract_docs_for_one_year(year)
convert_to_md_table(docs, year)
else:
print("Please provide a year as an argument.")