blob: 836e8b59fbdb0b9801692f03e61a29123b2b0a2f [file]
from bs4 import BeautifulSoup
import os
from pathlib import Path
from urllib import error, request
def get_index(url):
    """Fetch the page at ``url`` and return its anchor tags.

    The response body is decoded as UTF-8 and parsed with BeautifulSoup's
    'html.parser'. Only ``<a>`` tags that carry an ``href`` attribute whose
    value does not start with '.' are returned.
    """
    with request.urlopen(url) as resp:
        markup = resp.read().decode('utf-8')

    def link_to_subproject(tag):
        # Reject anything that is not an anchor with an href first.
        if tag.name != 'a' or not tag.has_attr('href'):
            return False
        # hrefs beginning with '.' are filtered out.
        return not tag.get('href').startswith('.')

    return BeautifulSoup(markup, 'html.parser').find_all(link_to_subproject)
def get_dirs(url):
    """Return the directory names linked from the index page at ``url``.

    A link counts as a directory when its ``href`` ends with '/'; the
    returned strings are those hrefs with the final character removed.
    """
    dir_names = []
    for anchor in get_index(url):
        if not anchor.has_attr('href'):
            continue
        target = anchor.get('href')
        if target.endswith('/'):
            # Drop the trailing slash.
            dir_names.append(target[:-1])
    return dir_names
def get_files(url):
    """Return the file entries linked from the index page at ``url``.

    A link counts as a file when its ``href`` does not end with '/'. Each
    entry is a dict holding the anchor's 'title' and 'href' attribute
    values as returned by ``tag.get``.
    """
    return [
        {
            'title': anchor.get('title'),
            'href': anchor.get('href'),
        }
        for anchor in get_index(url)
        if anchor.has_attr('href') and not anchor.get('href').endswith('/')
    ]
def get_url_cached(url):
    """Return the UTF-8 decoded body of ``url``, caching it under ``cache/``.

    The cache file name is ``cache/`` followed by the alphanumeric
    characters of ``url``. If that file already exists it is read and
    returned without touching the network.

    A URL that answers HTTP 404 is cached as an empty file, so both empty
    and not-found URLs yield an empty string (not ``None``).

    Raises:
        urllib.error.HTTPError: for any HTTP error other than 404.
        urllib.error.URLError: when the request itself fails.
        UnicodeDecodeError: when the body is not valid UTF-8.

    In all of these failure cases nothing is written to the cache, so a
    later call retries the request.
    """
    filename = "cache/" + "".join(x for x in url if x.isalnum())
    if not os.path.exists(filename):
        os.makedirs("cache", exist_ok=True)
        # Fetch and decode BEFORE creating the cache file: opening the file
        # first would leave an empty entry behind on a transient failure,
        # which later calls would mistake for a cached "not found".
        try:
            with request.urlopen(url) as resp:
                body = resp.read().decode('utf-8')
        except error.HTTPError as e:
            # ``code`` is the documented HTTPError status attribute.
            if e.code != 404:
                raise
            # Remember the 404 as an empty cache entry.
            body = ""
        # The payload was decoded as UTF-8, so store it as UTF-8 instead of
        # relying on the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as out:
            out.write(body)
    with open(filename, "r", encoding="utf-8") as cached:
        try:
            return cached.read()
        except Exception:
            print(f"Failed to parse {filename}")
            print(f"for {url}")
            raise
def get_sbom_cached(url, to):
    """Return the UTF-8 decoded body of ``url``, caching it at ``sboms/<to>``.

    If ``sboms/<to>`` already exists it is read and returned without any
    request. Otherwise the URL is downloaded, the parent directory of the
    target file is created if needed, and the decoded text is written
    there before being read back.

    Errors raised by the download or by UTF-8 decoding propagate to the
    caller, and in that case no cache file is created, so a later call
    retries the download.
    """
    filename = "sboms/" + to
    if not os.path.exists(filename):
        # Download and decode BEFORE opening the target file: opening it
        # first would leave an empty file behind on failure, which later
        # calls would return as if it were the cached SBOM.
        with request.urlopen(url) as sbomPayload:
            payload = sbomPayload.read().decode('utf-8')
        # Create the directory that will contain the cache file.
        os.makedirs(os.path.abspath(os.path.join(filename, os.pardir)), exist_ok=True)
        # The payload was decoded as UTF-8, so store it as UTF-8 instead of
        # relying on the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as out:
            out.write(payload)
    with open(filename, "r", encoding="utf-8") as cached:
        return cached.read()