blob: ab2b95df0d81b6c47a53e4ab9368457f18a0c7ed [file] [log] [blame]
import errtee
import sys
if sys.hexversion < 0x03000000:
raise ImportError("This script requires Python 3")
from xml.dom import minidom
import xml.etree.ElementTree as ET
import re, urllib.request
import urllib.error
import json
import os
from os.path import join
import traceback
import sendmail
"""
Reads:
../../data/projects.xml
parseprojects-failures.xml (if exists)
../../site/json/foundation/committees-retired.json
Writes:
../../site/json/foundation/projects.json
../../site/json/projects/%s.json
parseprojects-failures.xml (if failures occurred)
Deletes any obsolete files from:
../../site/json/projects/%s.json
"""
URL_TIMEOUT = 60.0 # timeout for URL requests (may need tweaking)
PROJECTS_DIR = '../../site/json/projects'
projectsList = "../../data/projects.xml";
PROJECTS_SVN = 'https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/data/projects.xml'
save = True;
if os.path.exists("parseprojects-failures.xml"):
# Only use restart data if requested (e.g. when running interactively)
if 'restart' in sys.argv:
projectsList = "parseprojects-failures.xml";
save = False;
else:
print("Previous run failed, ignoring restart data")
with open(projectsList, "r") as f:
data = f.read()
f.close()
xmldoc = minidom.parseString(data)
itemlist = xmldoc.getElementsByTagName('location')
siteMap = {
'hc': 'httpcomponents',
'ws':'webservices'
}
# Print to log and send an email (intended for WARN messages)
def printMail(msg, file=sys.stdout, body=''):
print(msg, file=file)
if body == None: # sendmail barfs if body is missing
body = ''
if body == '':
body=msg
try:
sendmail.sendMail(msg, body=body)
except ConnectionRefusedError:
print("*** Failed to send the email", file=file)
ATTIC = 'Attic <general@attic.apache.org>'
# Print to log and send a conditional email to Attic
def printAtticMail(msg, file=sys.stdout):
print(msg, file=file)
import datetime
# Only send the mail once a month
if datetime.datetime.now().day != 14:
print("Not sending the email to '" + str(ATTIC) +"'" , file=file)
return
try:
sendmail.sendMail(msg,recipients=ATTIC, replyTo=None)
except ConnectionRefusedError:
print("*** Failed to send the email to '" + str(ATTIC) +"'" , file=file)
def site2committee(s):
if s in siteMap:
return siteMap[s]
return s
with open("../../site/json/foundation/committees-retired.json", "r") as f:
committeesRetired = json.loads(f.read())
f.close()
retired = []
for r in committeesRetired:
retired.append(r['id'])
projects = {}
failures = []
# Convert project name to unique file name
def name2fileName(s, pmc):
retval = None
fn = s.strip().lower()
fn = fn.replace(" %s " % pmc," ") # drop PMC name
fn = fn.replace(' (incubating)','') # will be under the incubator PMC anyway
fn = re.sub('^apache ', '', fn) # Drop leading Apache
fn = re.sub(' library$', '', fn) # Drop trailing Library
fn = fn.replace('.net','dotnet')
fn = re.sub("[^a-z0-9+-]", "_", fn) # sanitise the name
if fn == pmc:
retval = pmc
else:
retval = "%s-%s" % (pmc, fn)
#print("=========== %s, %s => %s " % (s,pmc,retval))
return retval
# Process external PMC descriptor file to extract the PMC name
def getPMC(url):
print("Parsing PMC descriptor file %s" % url)
rdf = urllib.request.urlopen(url, timeout=URL_TIMEOUT).read()
md = minidom.parseString(rdf)
pmc = (md.getElementsByTagName('asfext:pmc') or md.getElementsByTagName('asfext:PMC'))[0]
t = pmc.tagName.lower()
a = pmc.getAttribute('rdf:about')
md.unlink()
if t == 'asfext:pmc':
print("Found pmc: %s" % a)
return a
printMail("WARN: could not find asfext:pmc in %s " % url)
return 'Unknown'
def handleChild(el):
retval = None
hasKids = False
for child in list(el):
hasKids = True
attribs = {}
for key in el.attrib:
xkey = re.sub(r"\{.+\}", "", key)
attribs[xkey] = el.attrib[key]
tag = re.sub(r"\{.+\}", "", el.tag)
value = attribs['resource'] if 'resource' in attribs else el.text
if not hasKids:
retval = value
else:
retval = {}
for child in list(el):
k, v = handleChild(child)
retval[k] = v
if k == "location":
retval = v
break
return tag, retval
files = []
unreportedError = False # any errors not yet mailed?
for s in itemlist :
url = s.childNodes[0].data
try:
rdf = urllib.request.urlopen(url, timeout=URL_TIMEOUT).read()
rdfxml = ET.fromstring(rdf)
project = rdfxml[0]
pjson = {
'doap': url
}
prname = None
committeeId = None
projectJsonFilename = None
for el in project:
k, v = handleChild(el)
if not save:
print("+ %s" % k);
if k in pjson and not k in ['name','homepage']:
if type(pjson[k]) is str:
pjson[k] = "%s, %s" % (pjson[k], v)
else:
for xk in v:
pjson[k].append(v[xk])
else:
# Deal with multiple entry tags first
if k in ['release', 'implements', 'repository', 'developer', 'maintainer', 'member', 'helper']:
pjson[k] = []
for xk in v:
pjson[k].append(v[xk])
else:
pjson[k] = v
if pjson['homepage']:
homepage = pjson['homepage']
m = re.match(r"https?://([^.]+)\.", homepage, re.IGNORECASE)
if m:
siteId = site2committee(m.group(1))
else:
printMail("WARN: no homepage defined in %s, pmc = %s" % (url, pjson['pmc']))
pmc = 'Unknown'
if not 'pmc' in pjson:
printMail("WARN: no asfext:pmc in %s" % url)
else:
pmcrdf = pjson['pmc']
pmcrdf = pmcrdf.replace('/anakia', '').replace('/texen', '') # temporary hack
# Extract the pmc name if it is a shortcut
m = re.match(r"https?://([^.]+)\.apache\.org/?$", pmcrdf, re.IGNORECASE)
if m:
pmc = m.group(1)
else:
# Not a shortcut, so read the descriptor file
try:
pmc = getPMC(pmcrdf)
except:
printMail("WARN: invalid asfext:pmc '%s' in %s" % (pmcrdf, url))
if pjson['name']:
projectJsonFilename = name2fileName(pjson['name'], pmc)
else:
printMail("WARN: no name defined in %s, pmc = %s" % (url, pjson['pmc']))
committeeId = pmc
if committeeId in retired:
printAtticMail("WARN: project from a retired committee but PMC not changed to Attic in %s" % url)
committeeId = 'attic'
pjson['pmc'] = committeeId
# replace category url with id, by removing https?://projects.apache.org/category/
# They are not usable as URLs, but some projects have converted them from http:
if 'category' in pjson:
pjson['category'] = re.sub(r"https?://projects\.apache\.org/category/", "", pjson['category'])
if committeeId == 'attic' and not 'retired' in pjson['category']:
printAtticMail("WARN: project in Attic but not in 'retired' category: %s" % url)
pjson['category'] = "%s, retired" % pjson['category']
elif committeeId == 'attic' and not 'retired' in pjson['category']:
printAtticMail("WARN: project in Attic but not in 'retired' category: %s" % url)
pjson['category'] = "retired"
if projectJsonFilename:
#add = {}
#for k in pjson:
# if pjson[k] != None and type(pjson[k]) is not str:
# for e in pjson[k]:
# add[e] = pjson[k][e]
# pjson[k] = None
projects[projectJsonFilename] = pjson
#for e in add:
# pjson[e] = add[e]
name = "%s.json" % projectJsonFilename
print("Writing projects/%s" % name)
files.append(name)
with open (join(PROJECTS_DIR, name), "w", encoding='utf-8') as f:
json.dump(pjson, f, sort_keys=True, indent=0, ensure_ascii=False)
f.close()
else:
printMail("WARN: project ignored since unable to extract project json filename from %s" % url)
except Exception as err:
if isinstance(err, OSError): # OSError is parent of HTTPError/URLError
# Only mail 404 errors individually
if isinstance(err, urllib.error.HTTPError) and err.code == 404:
printMail("Cannot find doap file: %s" % url, file=sys.stderr,
body=("URL: %s\n%s\nSource: %s" % (url,str(err),PROJECTS_SVN)))
else:
print("Error when processing doap file %s:" % url, file=sys.stderr)
unreportedError = True
else:
printMail("Error when processing doap file %s:" % url, file=sys.stderr,
body=("URL: %s\n%s\nSource: %s" % (url,str(err),PROJECTS_SVN)))
print("-"*60, file=sys.stderr)
traceback.print_exc()
if isinstance(err, OSError): # OSError is parent of HTTPError/URLError
print("URL: '%s'" % err.filename, file=sys.stderr)
print("-"*60, file=sys.stderr)
failures.append(url)
if rdf is not None:
urlname = url.split('/')[-1]
rem = re.search(r';f=([^;]+);.*',urlname) # better name for Git files
if rem:
urlname = rem.group(1)
print("Saving invalid data in %s " % urlname)
with open (urlname, "wb") as f:
f.write(rdf)
f.close()
if save:
print("Writing foundation/projects.json...")
with open ("../../site/json/foundation/projects.json", "w", encoding='utf-8') as f:
json.dump(projects, f, sort_keys=True, indent=0, ensure_ascii=False)
f.close()
# Drop any obsolete files
for f in os.listdir(PROJECTS_DIR):
if re.match(r'.*\.json$', f) and not f in files:
print("Deleting obsolete file projects/%s" %f)
os.remove(join(PROJECTS_DIR,f))
if len(failures) > 0:
with open ("parseprojects-failures.xml", "w") as f:
f.write("<doapFiles>\n")
for fail in failures:
f.write("<location>%s</location>\n" % fail)
f.write("</doapFiles>\n")
f.close()
if unreportedError:
s = "\n".join(failures)
printMail("ERROR: one or more errors detected - see also the parseprojects.py log file\nURLs:\n%s" % s)
else:
if os.path.exists("parseprojects-failures.xml"):
print("No failures detected, removing previous failure data")
try:
os.remove("parseprojects-failures.xml")
except FileNotFoundError: # should not happen
pass
print("Done!")