blob: 5ed8ece77830336da51a10f97d498e0ce603fb28 [file] [log] [blame]
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import argparse
import concurrent.futures as cf
import functools
import hashlib
import json
import os
import subprocess
import urllib.request
# Base URL of the Bintray REST API; used to request file listings.
BINTRAY_API_ROOT = 'https://bintray.com/api/v1'

# Base URL the actual files are downloaded from.
BINTRAY_DL_ROOT = 'https://dl.bintray.com'

# Repository to read from. Can be overridden through the
# BINTRAY_REPOSITORY environment variable.
BINTRAY_REPO = os.environ.get('BINTRAY_REPOSITORY', 'apache/arrow')

# Number of simultaneous downloads used when the caller does not pick one.
DEFAULT_PARALLEL_DOWNLOADS = 8
class Bintray:
    """
    Small client for listing and downloading the files of a Bintray
    repository.

    Parameters
    ----------
    repo : str
        Repository path appended to the Bintray API and download roots.
        Defaults to the module-level ``BINTRAY_REPO``.
    """

    # Size of the pieces read while hashing a local file, so that large
    # artifacts are never loaded into memory in one go.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, repo=BINTRAY_REPO):
        self.repo = repo

    @staticmethod
    def _join_url(root, *parts):
        """Join URL components with '/' regardless of the host platform."""
        # os.path.join is meant for filesystem paths: it uses the platform
        # separator and drops earlier components when a later one starts
        # with '/'. URLs need plain '/'-joining instead.
        pieces = [root.rstrip('/')]
        pieces.extend(part.strip('/') for part in parts)
        return '/'.join(pieces)

    @classmethod
    def _sha256sum(cls, path):
        """Return the hex SHA-256 digest of the file at ``path``."""
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            read_chunk = functools.partial(f.read, cls._HASH_CHUNK_SIZE)
            for chunk in iter(read_chunk, b''):
                digest.update(chunk)
        return digest.hexdigest()

    def get_file_list(self, package, version):
        """
        Fetch the file listing of one package version from the Bintray API.

        Parameters
        ----------
        package : str
            Package name inside the repository
        version : str
            Version of the package

        Returns
        -------
        The decoded JSON response. ``download_files`` expects it to be a
        list of dicts carrying at least the 'path' and 'sha256' keys.
        """
        url = self._join_url(BINTRAY_API_ROOT, 'packages', self.repo,
                             package, 'versions', version, 'files')
        # Use the response as a context manager so the connection is
        # always closed, even if reading fails.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        return json.loads(payload)

    def download_files(self, files, dest=None, num_parallel=None,
                       re_match=None):
        """
        Download files from Bintray in parallel. If file already exists, will
        overwrite if the checksum does not match what Bintray says it should be

        Parameters
        ----------
        files : List[Dict]
            File listing from Bintray
        dest : str, default None
            Defaults to current working directory
        num_parallel : int, default 8
            Number of files to download in parallel. If set to None, uses
            default
        re_match : str, default None
            Regular expression applied with ``re.match`` (anchored at the
            start) to each file's 'path'; only matching files are
            downloaded. If None, every file is downloaded
        """
        if dest is None:
            dest = os.getcwd()
        if num_parallel is None:
            num_parallel = DEFAULT_PARALLEL_DOWNLOADS

        if re_match is not None:
            regex = re.compile(re_match)
            files = [x for x in files if regex.match(x['path'])]

        if num_parallel == 1:
            # Sequential path: avoids creating a process pool at all.
            for info in files:
                self._download_file(dest, info)
        else:
            parallel_map_terminate_early(
                functools.partial(self._download_file, dest),
                files,
                num_parallel
            )

    def _download_file(self, dest, info):
        """
        Download a single file described by ``info`` below ``dest``.

        The file is written to ``dest`` joined with ``info['path']``,
        creating directories as needed. If a local file already exists and
        its SHA-256 equals ``info['sha256']`` the download is skipped.

        Raises
        ------
        Exception
            If ``curl`` exits with a non-zero return code
        """
        relpath = info['path']
        base, filename = os.path.split(relpath)

        dest_dir = os.path.join(dest, base)
        os.makedirs(dest_dir, exist_ok=True)
        dest_path = os.path.join(dest_dir, filename)

        if os.path.exists(dest_path):
            if self._sha256sum(dest_path) == info['sha256']:
                print('Local file {} sha256 matches, skipping'
                      .format(dest_path))
                return
            print('Local file sha256 does not match, overwriting')

        print("Downloading {} to {}".format(relpath, dest_path))

        bintray_abspath = self._join_url(BINTRAY_DL_ROOT, self.repo, relpath)

        cmd = [
            'curl', '--fail', '--location', '--retry', '5',
            '--output', dest_path, bintray_abspath
        ]
        # Capture curl's output so it can be reported if the download fails.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        if proc.returncode != 0:
            raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}"
                            .format(relpath, proc.stdout, proc.stderr))
def parallel_map_terminate_early(f, iterable, num_parallel):
    """
    Call ``f`` on every item of ``iterable`` using a process pool and stop
    at the first failure.

    Parameters
    ----------
    f : callable
        Function called with one item; its return value is discarded
    iterable : iterable
        Items to process
    num_parallel : int
        Number of worker processes in the pool

    Raises
    ------
    The exception raised by the first task seen to fail. Before
    re-raising, cancellation is requested for every submitted task.
    """
    with cf.ProcessPoolExecutor(num_parallel) as executor:
        futures = [executor.submit(f, item) for item in iterable]

        for finished in cf.as_completed(futures):
            error = finished.exception()
            if error is None:
                continue
            # One task failed: ask every task to cancel (only those that
            # have not started yet can actually be cancelled), then
            # propagate the failure to the caller.
            for pending in futures:
                pending.cancel()
            raise error
# Package types downloaded when no specific type is requested. Each entry
# is looked up on Bintray as the package "<type>-rc".
ARROW_PACKAGE_TYPES = [
    'centos',
    'debian',
    'nuget',
    'python',
    'ubuntu',
]
def download_rc_binaries(version, rc_number, re_match=None, dest=None,
                         num_parallel=None, target_package_type=None):
    """
    Download the binaries of a release candidate from Bintray.

    Parameters
    ----------
    version : str
        The version number
    rc_number : int
        The release candidate number; combined with ``version`` into
        "<version>-rc<rc_number>"
    re_match : str, default None
        Regular expression forwarded to ``Bintray.download_files`` to
        restrict which files are downloaded
    dest : str, default None
        Output folder, forwarded to ``Bintray.download_files``
    num_parallel : int, default None
        Number of parallel downloads, forwarded to
        ``Bintray.download_files``
    target_package_type : str, default None
        If given, only this package type is downloaded; otherwise every
        entry of ``ARROW_PACKAGE_TYPES`` is
    """
    client = Bintray()
    rc_version = '{}-rc{}'.format(version, rc_number)

    wanted_types = ([target_package_type] if target_package_type
                    else ARROW_PACKAGE_TYPES)

    for kind in wanted_types:
        package_name = '{}-rc'.format(kind)
        listing = client.get_file_list(package_name, rc_version)
        client.download_files(listing, re_match=re_match, dest=dest,
                              num_parallel=num_parallel)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to
    # download_rc_binaries.
    parser = argparse.ArgumentParser(
        description='Download release candidate binaries'
    )
    parser.add_argument('version', type=str, help='The version number')
    parser.add_argument('rc_number', type=int,
                        help='The release candidate number, e.g. 0, 1, etc')
    parser.add_argument('-e', '--regexp', type=str, default=None,
                        help=('Regular expression to match on file names '
                              'to only download certain files'))
    parser.add_argument('--dest', type=str, default=os.getcwd(),
                        help='The output folder for the downloaded files')
    # Reuse the module-level default rather than repeating the literal, so
    # the command line and the library API cannot drift apart.
    parser.add_argument('--num_parallel', type=int,
                        default=DEFAULT_PARALLEL_DOWNLOADS,
                        help='The number of concurrent downloads to do')
    parser.add_argument('--package_type', type=str, default=None,
                        help='The package type to be downloaded')
    args = parser.parse_args()

    download_rc_binaries(args.version, args.rc_number, dest=args.dest,
                         re_match=args.regexp, num_parallel=args.num_parallel,
                         target_package_type=args.package_type)