infra/python/deps/pip_download.py - impala - Git at Google

 #!/usr/bin/python
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # Implement the basic 'pip download' functionality in a way that gives us more control
 # over which archive type is downloaded and what post-download steps are executed.
 # This script requires Python 2.6+.

 import json
 import os
 import os.path
 import re
 import sys
 from hashlib import md5
 from random import randint
 from time import sleep
 from urllib import urlopen, URLopener

 # Maximum number of times the retry decorator calls the wrapped function before giving up.
 NUM_DOWNLOAD_ATTEMPTS = 8

 # Base URL of the package index. The PYPI_MIRROR environment variable overrides the
 # default. It is used to build both the index page URL and the archive download URL.
 PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')

 # The requirement files that list all of the required packages and versions.
 REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
                       'kudu-requirements.txt', 'adls-requirements.txt']

 def check_md5sum(filename, expected_md5):
   '''Returns True if the md5 digest of the file's contents equals 'expected_md5'.

   The file is opened in binary mode so that the digest is computed over the raw bytes
   regardless of platform newline handling, and it is hashed in chunks so that large
   archives are not read into memory at once. The file is closed before returning.
   '''
   digest = md5()
   with open(filename, 'rb') as archive:
     while True:
       chunk = archive.read(1024 * 1024)
       if not chunk:
         break
       digest.update(chunk)
   return digest.hexdigest() == expected_md5

 def retry(func):
   '''Decorator that retries 'func' until it returns a truthy value.

   'func' is called up to NUM_DOWNLOAD_ATTEMPTS times. An attempt counts as failed if it
   returns a falsy value or raises an Exception (the exception is printed and
   swallowed). Before every attempt except the first, the wrapper sleeps for a random
   number of seconds whose upper bound doubles with each attempt. The first truthy
   result is returned. If no attempt succeeds, the process exits with status 1.
   '''
   # Imported locally so that this decorator does not rely on the file's import block.
   from functools import wraps

   # wraps() keeps the name and docstring of the decorated function on the wrapper.
   @wraps(func)
   def wrapper(*args, **kwargs):
     for try_num in range(NUM_DOWNLOAD_ATTEMPTS):
       if try_num > 0:
         # Randomized backoff: between 5 and 10 * 2^try_num seconds.
         sleep_len = randint(5, 10 * 2 ** try_num)
         print('Sleeping for {0} seconds before retrying'.format(sleep_len))
         sleep(sleep_len)
       try:
         result = func(*args, **kwargs)
         if result:
           return result
       except Exception as e:
         # Report the failure and fall through to the next attempt.
         print(e)
     print('Download failed after several attempts.')
     sys.exit(1)

   return wrapper

 def get_package_info(pkg_name, pkg_version):
   '''Returns the file name, path and md5 digest of the package.

   Fetches the '{PYPI_MIRROR}/simple/{pkg_name}/' index page and collects every link
   whose text ends with '-{pkg_version}' followed by .tar.gz, .tar.bz2 or .zip. The
   result is a (file_name, path, md5_digest) tuple, where 'path' is the part of the link
   target after 'packages/' and 'md5_digest' is taken from its '#md5=' fragment. If no
   link matches, a message is printed and (None, None, None) is returned.
   '''
   url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
   print('Getting package info from {0}'.format(url))
   # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
   # We parse the page with regex instead of an html parser because that requires
   # downloading an extra package before running this script. Since the HTML is guaranteed
   # to be formatted according to PEP 503, this is acceptable.
   index_page = urlopen(url)
   try:
     pkg_info = index_page.read()
   finally:
     # Release the connection even if reading the page fails.
     index_page.close()
   # We assume that the URL includes a hash and the hash function is md5. This is not
   # strictly required by PEP 503.
   regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
   # Make sure that we consider only non Wheel archives, because those are not supported.
   archive_suffixes = tuple(
       '-{0}{1}'.format(pkg_version, ext) for ext in ('.tar.gz', '.tar.bz2', '.zip'))
   # We store the matching results in the candidates list instead of returning right away
   # so that the alphabetically first one can be returned. This ensures that the same
   # result is always returned even if the ordering changed on the server.
   candidates = []
   for match in re.finditer(regex, pkg_info):
     path, md5_digest, file_name = match.groups()
     if file_name.endswith(archive_suffixes):
       candidates.append((file_name, path, md5_digest))
   if not candidates:
     print('Could not find archive to download for {0} {1}'.format(
         pkg_name, pkg_version))
     return (None, None, None)
   return min(candidates)

 @retry
 def download_package(pkg_name, pkg_version):
   '''Downloads the archive of the given package version and verifies its md5 digest.

   The archive is saved under the file name reported by get_package_info(). Returns
   False if no archive could be found or if the downloaded file does not match the
   expected digest, and True if a matching file already exists or was downloaded.
   Falsy results and exceptions are handled by the retry decorator.
   '''
   archive_name, archive_path, md5_digest = get_package_info(pkg_name, pkg_version)
   if not archive_name:
     return False
   # Skip the download if an intact copy of the archive is already on disk.
   already_present = (os.path.isfile(archive_name) and
                      check_md5sum(archive_name, md5_digest))
   if already_present:
     print('File with matching md5sum already exists, skipping {0}'.format(archive_name))
     return True
   fetcher = URLopener()
   source_url = '{0}/packages/{1}'.format(PYPI_MIRROR, archive_path)
   print('Downloading {0} from {1}'.format(archive_name, source_url))
   fetcher.retrieve(source_url, archive_name)
   if not check_md5sum(archive_name, md5_digest):
     print('MD5 mismatch in file {0}.'.format(archive_name))
     return False
   return True

 def main():
   '''Downloads one package given on the command line, or every listed requirement.

   With command line arguments, exactly two are expected: the package name and the
   package version. Any other argument count prints a usage message and exits with
   status 1. Without arguments, every 'name==version' entry of each file in
   REQUIREMENTS_FILES is downloaded.

   Raises ValueError if a requirement line is not of the form 'name==version'.
   '''
   if len(sys.argv) > 1:
     if len(sys.argv) != 3:
       print('Usage: {0} [<pkg_name> <pkg_version>]'.format(sys.argv[0]))
       sys.exit(1)
     _, pkg_name, pkg_version = sys.argv
     download_package(pkg_name, pkg_version)
     return

   for requirements_file in REQUIREMENTS_FILES:
     # If the package name and version are not specified in the command line arguments,
     # download the packages that are listed in the requirements files.
     with open(requirements_file, 'r') as f:
       # requirements.txt follows the standard pip grammar.
       for line in f:
         # A hash symbol ("#") represents a comment that should be ignored.
         # A semi colon (";") specifies some additional condition for when the package
         # should be installed (for example a specific OS). We can ignore this and
         # download the package anyways because the installation script
         # (bootstrap_virtualenv.py) can take it into account.
         requirement = line.split('#', 1)[0].split(';', 1)[0].strip()
         if not requirement:
           continue
         name_and_version = requirement.split('==')
         if len(name_and_version) != 2:
           raise ValueError("Expected 'name==version' in {0}, got: {1}".format(
               requirements_file, requirement))
         pkg_name, pkg_version = name_and_version
         download_package(pkg_name.strip(), pkg_version.strip())

 # Only run the downloads when executed as a script, not when imported as a module.
 if __name__ == '__main__':
   main()
	#!/usr/bin/python
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# Implement the basic 'pip download' functionality in a way that gives us more control
	# over which archive type is downloaded and what post-download steps are executed.
	# This script requires Python 2.6+.

	import json
	import os
	import os.path
	import re
	import sys
	from hashlib import md5
	from random import randint
	from time import sleep
	from urllib import urlopen, URLopener

	# Maximum number of times the retry decorator calls the wrapped function before giving up.
	NUM_DOWNLOAD_ATTEMPTS = 8

	# Base URL of the package index. The PYPI_MIRROR environment variable overrides the
	# default. It is used to build both the index page URL and the archive download URL.
	PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')

	# The requirement files that list all of the required packages and versions.
	REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
	'kudu-requirements.txt', 'adls-requirements.txt']

	def check_md5sum(filename, expected_md5):
	  '''Returns True if the md5 digest of the file's contents equals 'expected_md5'.

	  The file is opened in binary mode so that the digest is computed over the raw bytes
	  regardless of platform newline handling, and it is hashed in chunks so that large
	  archives are not read into memory at once. The file is closed before returning.
	  '''
	  digest = md5()
	  with open(filename, 'rb') as archive:
	    while True:
	      chunk = archive.read(1024 * 1024)
	      if not chunk:
	        break
	      digest.update(chunk)
	  return digest.hexdigest() == expected_md5

	def retry(func):
	  '''Decorator that retries 'func' until it returns a truthy value.

	  'func' is called up to NUM_DOWNLOAD_ATTEMPTS times. An attempt counts as failed if it
	  returns a falsy value or raises an Exception (the exception is printed and
	  swallowed). Before every attempt except the first, the wrapper sleeps for a random
	  number of seconds whose upper bound doubles with each attempt. The first truthy
	  result is returned. If no attempt succeeds, the process exits with status 1.
	  '''
	  # Imported locally so that this decorator does not rely on the file's import block.
	  from functools import wraps

	  # wraps() keeps the name and docstring of the decorated function on the wrapper.
	  @wraps(func)
	  def wrapper(*args, **kwargs):
	    for try_num in range(NUM_DOWNLOAD_ATTEMPTS):
	      if try_num > 0:
	        # Randomized backoff: between 5 and 10 * 2^try_num seconds.
	        sleep_len = randint(5, 10 * 2 ** try_num)
	        print('Sleeping for {0} seconds before retrying'.format(sleep_len))
	        sleep(sleep_len)
	      try:
	        # Forward all positional and keyword arguments unchanged.
	        result = func(*args, **kwargs)
	        if result:
	          return result
	      except Exception as e:
	        # Report the failure and fall through to the next attempt.
	        print(e)
	    print('Download failed after several attempts.')
	    sys.exit(1)

	  return wrapper

	def get_package_info(pkg_name, pkg_version):
	  '''Returns the file name, path and md5 digest of the package.

	  Fetches the '{PYPI_MIRROR}/simple/{pkg_name}/' index page and collects every link
	  whose text ends with '-{pkg_version}' followed by .tar.gz, .tar.bz2 or .zip. The
	  result is a (file_name, path, md5_digest) tuple, where 'path' is the part of the link
	  target after 'packages/' and 'md5_digest' is taken from its '#md5=' fragment. If no
	  link matches, a message is printed and (None, None, None) is returned.
	  '''
	  url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
	  print('Getting package info from {0}'.format(url))
	  # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
	  # We parse the page with regex instead of an html parser because that requires
	  # downloading an extra package before running this script. Since the HTML is guaranteed
	  # to be formatted according to PEP 503, this is acceptable.
	  index_page = urlopen(url)
	  try:
	    pkg_info = index_page.read()
	  finally:
	    # Release the connection even if reading the page fails.
	    index_page.close()
	  # We assume that the URL includes a hash and the hash function is md5. This is not
	  # strictly required by PEP 503. The non-greedy '.*?' groups capture the path, the
	  # digest and the link text respectively.
	  regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
	  # Make sure that we consider only non Wheel archives, because those are not supported.
	  archive_suffixes = tuple(
	      '-{0}{1}'.format(pkg_version, ext) for ext in ('.tar.gz', '.tar.bz2', '.zip'))
	  # We store the matching results in the candidates list instead of returning right away
	  # so that the alphabetically first one can be returned. This ensures that the same
	  # result is always returned even if the ordering changed on the server.
	  candidates = []
	  for match in re.finditer(regex, pkg_info):
	    path, md5_digest, file_name = match.groups()
	    if file_name.endswith(archive_suffixes):
	      candidates.append((file_name, path, md5_digest))
	  if not candidates:
	    print('Could not find archive to download for {0} {1}'.format(
	        pkg_name, pkg_version))
	    return (None, None, None)
	  return min(candidates)

	@retry
	def download_package(pkg_name, pkg_version):
	  '''Downloads the archive of the given package version and verifies its md5 digest.

	  The archive is saved under the file name reported by get_package_info(). Returns
	  False if no archive could be found or if the downloaded file does not match the
	  expected digest, and True if a matching file already exists or was downloaded.
	  Falsy results and exceptions are handled by the retry decorator.
	  '''
	  file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
	  if not file_name:
	    return False
	  # Skip the download if an intact copy of the archive is already on disk.
	  if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
	    print('File with matching md5sum already exists, skipping {0}'.format(file_name))
	    return True
	  downloader = URLopener()
	  pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
	  print('Downloading {0} from {1}'.format(file_name, pkg_url))
	  downloader.retrieve(pkg_url, file_name)
	  if check_md5sum(file_name, expected_md5):
	    return True
	  print('MD5 mismatch in file {0}.'.format(file_name))
	  return False

	def main():
	  '''Downloads one package given on the command line, or every listed requirement.

	  With command line arguments, exactly two are expected: the package name and the
	  package version. Any other argument count prints a usage message and exits with
	  status 1. Without arguments, every 'name==version' entry of each file in
	  REQUIREMENTS_FILES is downloaded.

	  Raises ValueError if a requirement line is not of the form 'name==version'.
	  '''
	  if len(sys.argv) > 1:
	    if len(sys.argv) != 3:
	      print('Usage: {0} [<pkg_name> <pkg_version>]'.format(sys.argv[0]))
	      sys.exit(1)
	    _, pkg_name, pkg_version = sys.argv
	    download_package(pkg_name, pkg_version)
	    return

	  for requirements_file in REQUIREMENTS_FILES:
	    # If the package name and version are not specified in the command line arguments,
	    # download the packages that are listed in the requirements files.
	    with open(requirements_file, 'r') as f:
	      # requirements.txt follows the standard pip grammar.
	      for line in f:
	        # A hash symbol ("#") represents a comment that should be ignored.
	        # A semi colon (";") specifies some additional condition for when the package
	        # should be installed (for example a specific OS). We can ignore this and
	        # download the package anyways because the installation script
	        # (bootstrap_virtualenv.py) can take it into account.
	        requirement = line.split('#', 1)[0].split(';', 1)[0].strip()
	        if not requirement:
	          continue
	        name_and_version = requirement.split('==')
	        if len(name_and_version) != 2:
	          raise ValueError("Expected 'name==version' in {0}, got: {1}".format(
	              requirements_file, requirement))
	        pkg_name, pkg_version = name_and_version
	        download_package(pkg_name.strip(), pkg_version.strip())

	# Only run the downloads when executed as a script, not when imported as a module.
	if __name__ == '__main__':
	  main()