| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| """ |
| A script to pull licenses/notices/source code for Java dependencies. |
| It generates a CSV file with [dependency_name, url_to_license, license_type, source_included] |
| """ |
| |
| import argparse |
| import csv |
| import json |
| import logging |
| import os |
| import shutil |
| import threading |
| import traceback |
| import yaml |
| |
| from bs4 import BeautifulSoup |
| from datetime import datetime |
| from multiprocessing.pool import ThreadPool |
| from queue import Queue |
| from tenacity import retry |
| from tenacity import stop_after_attempt |
| from tenacity import wait_fixed |
| from urllib.request import urlopen, Request, URLError, HTTPError |
| |
| SOURCE_CODE_REQUIRED_LICENSES = ['lgpl', 'gpl', 'cddl', 'mpl', 'gnu', 'mozilla public license'] |
| RETRY_NUM = 9 |
| THREADS = 16 |
| |
| @retry(reraise=True, |
| wait=wait_fixed(5), |
| stop=stop_after_attempt(RETRY_NUM)) |
| def pull_from_url(file_name, url, dep, no_list): |
| if url == 'skip': |
| return |
| |
| # Replace file path with absolute path to manual licenses |
| if url.startswith('file://{}'): |
| url = url.format(manual_license_path) |
| logging.info('Replaced local file URL with {url} for {dep}'.format(url=url, dep=dep)) |
| |
| try: |
| url_read = urlopen(Request(url, headers={ |
| 'User-Agent': 'Apache Beam', |
| # MPL license fails to resolve redirects without this header |
| # see https://github.com/apache/beam/issues/22394 |
| 'accept-language': 'en-US,en;q=0.9', |
| })) |
| with open(file_name, 'wb') as temp_write: |
| shutil.copyfileobj(url_read, temp_write) |
| logging.debug( |
| 'Successfully pulled {file_name} from {url} for {dep}'.format( |
| url=url, file_name=file_name, dep=dep)) |
| except URLError as e: |
| traceback.print_exc() |
| if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM: |
| logging.error('Invalid url for {dep}: {url}. Retrying...'.format( |
| url=url, dep=dep)) |
| raise |
| else: |
| logging.error( |
| 'Invalid url for {dep}: {url} after {n} retries.'.format( |
| url=url, dep=dep, n=RETRY_NUM)) |
| with thread_lock: |
| no_list.append(dep) |
| return |
| except HTTPError as e: |
| traceback.print_exc() |
| if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM: |
| logging.info( |
| 'Received {code} from {url} for {dep}. Retrying...'.format( |
| code=e.code, url=url, dep=dep)) |
| raise |
| else: |
| logging.error( |
| 'Received {code} from {url} for {dep} after {n} retries.'. |
| format(code=e.code, url=url, dep=dep, n=RETRY_NUM)) |
| with thread_lock: |
| no_list.append(dep) |
| return |
| except Exception as e: |
| traceback.print_exc() |
| if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM: |
| logging.error( |
| 'Error occurred when pull {file_name} from {url} for {dep}. Retrying...' |
| .format(url=url, file_name=file_name, dep=dep)) |
| raise |
| else: |
| logging.error( |
| 'Error occurred when pull {file_name} from {url} for {dep} after {n} retries.' |
| .format(url=url, file_name=file_name, dep=dep, n=RETRY_NUM)) |
| with thread_lock: |
| no_list.append(dep) |
| return |
| |
| |
| def pull_source_code(base_url, dir_name, dep): |
| # base_url example: https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/ |
| try: |
| soup = BeautifulSoup(urlopen(base_url).read(), "html.parser") |
| except: |
| logging.error('Error reading source base from {base_url}'.format(base_url=base_url)) |
| raise |
| source_count = 0 |
| for href in (a["href"] for a in soup.select("a[href]")): |
| if href.endswith( |
| '.jar') and 'sources.jar' in href: # download sources jar file only |
| file_name = dir_name + '/' + href |
| url = base_url + '/' + href |
| logging.debug('Pulling source from {url}'.format(url=url)) |
| pull_from_url(file_name, url, dep, incorrect_source_url) |
| source_count = source_count + 1 |
| if source_count == 0: |
| raise RuntimeError('No source found at {base_url}'.format(base_url=base_url)) |
| |
| |
| @retry(reraise=True, stop=stop_after_attempt(3)) |
| def write_to_csv(csv_list): |
| csv_columns = [ |
| 'dependency_name', 'url_to_license', 'license_type', 'source_included' |
| ] |
| csv_file = "{output_dir}/beam_java_dependency_list.csv".format( |
| output_dir=output_dir) |
| try: |
| with open(csv_file, 'w') as csvfile: |
| writer = csv.DictWriter(csvfile, fieldnames=csv_columns) |
| writer.writeheader() |
| for data in csv_list: |
| writer.writerow(data) |
| except: |
| traceback.print_exc() |
| raise |
| |
| |
| def execute(dep): |
| ''' |
| An example of dep. |
| { |
| "moduleName": "antlr:antlr", |
| "moduleUrl": "http://www.antlr.org/", |
| "moduleVersion": "2.7.7", |
| "moduleLicense": "BSD License", |
| "moduleLicenseUrl": "http://www.antlr.org/license.html" |
| } |
| ''' |
| |
| name = dep['moduleName'].split(':')[1] |
| version = dep['moduleVersion'] |
| name_version = name + '-' + version |
| # javac is not a runtime dependency |
| if name == 'javac': |
| logging.debug('Skipping', name_version) |
| return |
| # skip self dependencies |
| if dep['moduleName'].lower().startswith('beam'): |
| logging.debug('Skipping', name_version) |
| return |
| dir_name = '{output_dir}/{name_version}.jar'.format( |
| output_dir=output_dir, name_version=name_version) |
| |
| # if auto pulled, directory is existing at {output_dir} |
| if not os.path.isdir(dir_name): |
| os.mkdir(dir_name) |
| # pull license |
| try: |
| license_url = dep_config[name][version]['license'] |
| except: |
| try: |
| license_url = dep['moduleLicenseUrl'] |
| except: |
| # url cannot be found, add to no_licenses and skip to pull. |
| with thread_lock: |
| no_licenses.append(name_version) |
| license_url = 'skip' |
| pull_from_url(dir_name + '/LICENSE', license_url, name_version, |
| no_licenses) |
| # pull notice |
| try: |
| notice_url = dep_config[name][version]['notice'] |
| pull_from_url(dir_name + '/NOTICE', notice_url, name_version) |
| except: |
| pass |
| else: |
| try: |
| license_url = dep['moduleLicenseUrl'] |
| except: |
| license_url = '' |
| logging.debug( |
| 'License/notice for {name_version} were pulled automatically.'. |
| format(name_version=name_version)) |
| |
| # get license_type to decide if pull source code. |
| try: |
| license_type = dep['moduleLicense'] |
| except: |
| try: |
| license_type = dep_config[name][version]['type'] |
| except: |
| license_type = 'no_license_type' |
| with thread_lock: |
| no_license_type.append(name_version) |
| |
| # pull source code if license_type is one of SOURCE_CODE_REQUIRED_LICENSES. |
| if any(x in license_type.lower() for x in SOURCE_CODE_REQUIRED_LICENSES): |
| try: |
| base_url = dep_config[name][version]['source'] |
| except: |
| module = dep['moduleName'].split(':')[0].replace('.', '/') |
| base_url = maven_url_temp.format(module=module + '/' + name, |
| version=version) |
| pull_source_code(base_url, dir_name, name_version) |
| source_included = True |
| else: |
| source_included = False |
| |
| csv_dict = { |
| 'dependency_name': name_version, |
| 'url_to_license': license_url, |
| 'license_type': license_type, |
| 'source_included': source_included |
| } |
| with thread_lock: |
| csv_list.append(csv_dict) |
| |
| |
| if __name__ == "__main__": |
| start = datetime.now() |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--license_index', required=True) |
| parser.add_argument('--output_dir', required=True) |
| parser.add_argument('--dep_url_yaml', required=True) |
| parser.add_argument('--manual_license_path', required=True) |
| |
| args = parser.parse_args() |
| license_index = args.license_index |
| output_dir = args.output_dir |
| dep_url_yaml = args.dep_url_yaml |
| manual_license_path = args.manual_license_path |
| |
| logging.getLogger().setLevel(logging.INFO) |
| |
| # index.json is generated by Gradle plugin. |
| with open(license_index) as f: |
| dependencies = json.load(f) |
| |
| with open(dep_url_yaml) as file: |
| dep_config = yaml.full_load(file) |
| |
| maven_url_temp = 'https://repo1.maven.org/maven2/{module}/{version}' |
| |
| csv_list = [] |
| no_licenses = [] |
| no_license_type = [] |
| incorrect_source_url = [] |
| |
| logging.info( |
| 'Pulling license for {num_deps} dependencies using {num_threads} threads.' |
| .format(num_deps=len(dependencies['dependencies']), |
| num_threads=THREADS)) |
| thread_lock = threading.Lock() |
| pool = ThreadPool(THREADS) |
| pool.map(execute, dependencies['dependencies']) |
| |
| write_to_csv(csv_list) |
| |
| error_msg = [] |
| run_status = 'succeed' |
| if no_licenses: |
| logging.error(no_licenses) |
| how_to = '**************************************** ' \ |
| 'Licenses were not able to be pulled ' \ |
| 'automatically for some dependencies. Please search source ' \ |
| 'code of the dependencies on the internet and add "license" ' \ |
| 'and "notice" (if available) field to {yaml_file} for each ' \ |
| 'missing license. Dependency List: [{dep_list}]'.format( |
| dep_list=','.join(sorted(no_licenses)), yaml_file=dep_url_yaml) |
| logging.error(how_to) |
| error_msg.append(how_to) |
| run_status = 'failed' |
| |
| if no_license_type: |
| how_to = '**************************************** ' \ |
| 'License type of some dependencies were not ' \ |
| 'identified. The license type is used to decide whether the ' \ |
| 'source code of the dependency should be pulled or not. ' \ |
| 'Please add "type" field to {yaml_file} for each dependency. ' \ |
| 'Dependency List: [{dep_list}]'.format( |
| dep_list=','.join(sorted(no_license_type)), yaml_file=dep_url_yaml) |
| error_msg.append(how_to) |
| run_status = 'failed' |
| |
| if incorrect_source_url: |
| how_to = '**************************************** ' \ |
| 'Urls to maven repo for some dependencies ' \ |
| 'were not able to be generated automatically. Please add ' \ |
| '"source" field to {yaml_file} for each dependency. ' \ |
| 'Dependency List: [{dep_list}]'.format( |
| dep_list=','.join(sorted(incorrect_source_url)), |
| yaml_file=dep_url_yaml) |
| error_msg.append(how_to) |
| run_status = 'failed' |
| |
| end = datetime.now() |
| logging.info( |
| 'pull_licenses_java.py {status}. It took {sec} seconds with {threads} threads.' |
| .format(status=run_status, |
| sec=(end - start).total_seconds(), |
| threads=THREADS)) |
| |
| if error_msg: |
| raise RuntimeError('{n} error(s) occurred.'.format(n=len(error_msg)), |
| error_msg) |