#!/usr/bin/python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Implement the basic 'pip download' functionality in a way that gives us more control
# over which archive type is downloaded and what post-download steps are executed.
# This script requires Python 2.6+.
import json
import os
import os.path
import re
import sys
from functools import wraps
from hashlib import md5
from random import randint
from time import sleep
from urllib import urlopen, URLopener
# Upper bound on the number of times a single download is attempted.
NUM_DOWNLOAD_ATTEMPTS = 8

# Base URL of the package index. Can be overridden through the PYPI_MIRROR
# environment variable.
PYPI_MIRROR = os.getenv('PYPI_MIRROR', 'https://pypi.python.org')

# The requirement files that list all of the required packages and versions.
REQUIREMENTS_FILES = [
    'requirements.txt',
    'compiled-requirements.txt',
    'kudu-requirements.txt',
    'adls-requirements.txt',
]
def check_md5sum(filename, expected_md5):
    '''Returns True if the MD5 hex digest of the contents of 'filename' equals
    'expected_md5', False otherwise.

    Any error raised while opening or reading the file is propagated to the caller.
    '''
    digest = md5()
    # Archives are binary data: open in binary mode so that no newline translation or
    # text decoding alters the bytes being hashed. The 'with' block guarantees that the
    # file handle is released.
    with open(filename, 'rb') as archive:
        # Hash in fixed-size chunks to avoid holding the whole archive in memory.
        for chunk in iter(lambda: archive.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5
def retry(func):
    '''Retry decorator.

    The decorated function is called up to NUM_DOWNLOAD_ATTEMPTS times. The first truthy
    value it returns is passed back to the caller. An attempt fails if the function
    returns a falsy value or raises an Exception; the exception is printed and otherwise
    ignored. Before every attempt except the first, the wrapper sleeps for a random
    number of seconds between 5 and 10 * 2 ** try_num, so the upper bound of the wait
    doubles with each retry. If every attempt fails, an error is printed and the process
    exits with status 1.
    '''
    # Preserve the name and docstring of the decorated function.
    @wraps(func)
    def wrapper(*args, **kwargs):
        for try_num in range(NUM_DOWNLOAD_ATTEMPTS):
            if try_num > 0:
                sleep_len = randint(5, 10 * 2 ** try_num)
                print('Sleeping for {0} seconds before retrying'.format(sleep_len))
                sleep(sleep_len)
            try:
                result = func(*args, **kwargs)
                if result:
                    return result
            except Exception as e:
                # Deliberately best-effort: report the failure and move on to the next
                # attempt.
                print(e)
        print('Download failed after several attempts.')
        sys.exit(1)
    return wrapper
def _find_archive_candidates(pkg_info, pkg_version):
    '''Returns a list of (file name, path, md5 digest) tuples, one for each link in the
    index page 'pkg_info' whose file name is an archive of version 'pkg_version'.'''
    # We assume that the URL includes a hash and the hash function is md5. This is not
    # strictly required by PEP 503.
    regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
    # Only .tar.gz, .tar.bz2 and .zip archives are considered. Wheels are not supported.
    archive_suffixes = tuple('-{0}{1}'.format(pkg_version, extension)
                             for extension in ('.tar.gz', '.tar.bz2', '.zip'))
    candidates = []
    for match in re.finditer(regex, pkg_info):
        path, md5_digest, file_name = match.groups()
        if file_name.endswith(archive_suffixes):
            candidates.append((file_name, path, md5_digest))
    return candidates


def get_package_info(pkg_name, pkg_version):
    '''Returns the file name, path and md5 digest of the package.

    The information is read from the page '<PYPI_MIRROR>/simple/<pkg_name>/'. The result
    is a (file name, path, md5 digest) tuple, where the path is the part of the link that
    follows 'packages/'. If no matching archive is listed, a message is printed and
    (None, None, None) is returned. Errors raised while fetching the page are propagated.
    '''
    url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
    print('Getting package info from {0}'.format(url))
    # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
    # We parse the page with regex instead of an html parser because that requires
    # downloading an extra package before running this script. Since the HTML is guaranteed
    # to be formatted according to PEP 503, this is acceptable.
    response = urlopen(url)
    try:
        pkg_info = response.read()
    finally:
        # Release the connection even if reading the page fails.
        response.close()
    candidates = _find_archive_candidates(pkg_info, pkg_version)
    if not candidates:
        print('Could not find archive to download for {0} {1}'.format(pkg_name, pkg_version))
        return (None, None, None)
    # All matches are collected instead of returning the first one so that the smallest
    # tuple in sort order can be returned. This ensures that the same result is always
    # returned even if the ordering changed on the server.
    return min(candidates)
@retry
def download_package(pkg_name, pkg_version):
    '''Downloads the archive of the given package version and verifies its md5 digest.

    The archive is saved under the file name reported by get_package_info(). Returns True
    if a file with a matching md5 digest already exists or was downloaded, and False if no
    archive could be found or the downloaded file does not match the expected digest.
    Because of the @retry decorator, a False result or an exception triggers another
    attempt rather than being seen by the caller.
    '''
    archive_name, archive_path, md5_digest = get_package_info(pkg_name, pkg_version)
    if not archive_name:
        return False

    # Nothing to do if an earlier run already fetched an intact copy.
    already_present = os.path.isfile(archive_name) and check_md5sum(archive_name, md5_digest)
    if already_present:
        print('File with matching md5sum already exists, skipping {0}'.format(archive_name))
        return True

    opener = URLopener()
    source_url = '{0}/packages/{1}'.format(PYPI_MIRROR, archive_path)
    print('Downloading {0} from {1}'.format(archive_name, source_url))
    opener.retrieve(source_url, archive_name)

    if not check_md5sum(archive_name, md5_digest):
        print('MD5 mismatch in file {0}.'.format(archive_name))
        return False
    return True
def _parse_requirement(line):
    '''Returns the (package name, package version) pair pinned by one line of a
    requirements file, or None if the line holds nothing but whitespace and/or a comment.

    Raises ValueError if the line is not of the form 'name==version'.
    '''
    # A hash symbol ("#") represents a comment that should be ignored.
    line = line.split('#', 1)[0]
    # A semi colon (";") specifies some additional condition for when the package
    # should be installed (for example a specific OS). We can ignore this and download
    # the package anyways because the installation script(bootstrap_virtualenv.py) can
    # take it into account.
    line = line.split(';', 1)[0]
    requirement = line.strip()
    if not requirement:
        return None
    parts = requirement.split('==')
    if len(parts) != 2:
        raise ValueError(
            "Expected a requirement of the form 'name==version' but found: {0}".format(
                requirement))
    return parts[0].strip(), parts[1].strip()


def main():
    '''Downloads either the single package given on the command line or every package
    listed in REQUIREMENTS_FILES.

    With command line arguments, exactly a package name and a package version are
    expected; any other argument count prints a usage message and exits with status 1.
    Without arguments, each file in REQUIREMENTS_FILES is opened relative to the current
    working directory and every 'name==version' line in it is downloaded.
    '''
    if len(sys.argv) > 1:
        if len(sys.argv) != 3:
            # sys.exit() with a string prints it to stderr and exits with status 1.
            sys.exit('Usage: {0} [<package name> <package version>]'.format(sys.argv[0]))
        _, pkg_name, pkg_version = sys.argv
        download_package(pkg_name, pkg_version)
        return

    # If the package name and version are not specified in the command line arguments,
    # download the packages that are listed in the requirements files.
    for requirements_file in REQUIREMENTS_FILES:
        # The requirements files follow the standard pip grammar.
        with open(requirements_file, 'r') as f:
            for line in f:
                requirement = _parse_requirement(line)
                if requirement is not None:
                    pkg_name, pkg_version = requirement
                    download_package(pkg_name, pkg_version)


if __name__ == '__main__':
    main()