# blob: 27c527f7ac09589c09a07bca251a665d95f34eee [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This module will create a python virtual env and install external dependencies. If the
# virtualenv already exists and it contains all the expected packages, nothing is done.
#
# A multi-step bootstrapping process is required to build and install all of the
# dependencies:
# 1. install basic non-C/C++ packages into the virtualenv
# 1b. install packages that depend on step 1 but cannot be installed together with their
# dependencies
# 2. use the virtualenv Python to bootstrap the toolchain
# 3. use toolchain gcc to build C/C++ packages
# 4. build the kudu-python package with toolchain gcc and Cython
#
# Every time this script is run, it completes as many of the bootstrapping steps as
# possible with the available dependencies.
#
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
# instead.
from __future__ import print_function
import glob
import logging
import optparse
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile
import textwrap
import urllib
# Logger named after this script's file name, without the extension.
LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])

# Directory that holds this script; "deps" and "env" live directly beneath it.
_SCRIPT_DIR = os.path.dirname(__file__)
DEPS_DIR = os.path.join(_SCRIPT_DIR, "deps")
ENV_DIR = os.path.join(_SCRIPT_DIR, "env")

# Packages needed for our build and tests.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Second stage: packages that cannot be installed together with their dependencies
# listed in requirements.txt.
REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")

# Packages for the next bootstrapping step, which builds compiled requirements with
# toolchain gcc.
COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")

# Packages for the Kudu bootstrapping step, which depends on Cython being installed by
# the compiled requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")

# Packages for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the compiled requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
def delete_virtualenv_if_exist():
    '''Removes the whole virtualenv directory tree (ENV_DIR) when it is present.'''
    if not os.path.exists(ENV_DIR):
        return
    shutil.rmtree(ENV_DIR)
def create_virtualenv():
    '''Creates the python virtualenv at ENV_DIR from the virtualenv tarball in DEPS_DIR.

    The tarball is unpacked into a temporary build directory and the virtualenv.py found
    in it is run with the interpreter chosen by detect_python_cmd(). The temporary
    directory is removed and the tarball is closed even if a step fails.

    Raises an exception if the tarball or virtualenv.py cannot be found, or if the
    virtualenv command returns a non-zero status.
    '''
    LOG.info("Creating python virtualenv")
    build_dir = tempfile.mkdtemp()
    try:
        archive = tarfile.open(find_file(DEPS_DIR, "virtualenv*.tar.gz"), "r:gz")
        try:
            for member in archive.getmembers():
                archive.extract(member, build_dir)
        finally:
            archive.close()
        python_cmd = detect_python_cmd()
        # The same interpreter both runs virtualenv.py and becomes the virtualenv python.
        exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"),
                  "--quiet", "--python", python_cmd, ENV_DIR])
    finally:
        # Never leave the unpacked tarball behind, even on failure.
        shutil.rmtree(build_dir)
def exec_cmd(args, **kwargs):
    '''Runs a command to completion and returns what it wrote to stdout and stderr
    (stderr is merged into stdout).
    An exception containing the command and its output is raised when the exit status is
    not zero.
    'args' and 'kwargs' use the same format as subprocess.Popen().
    '''
    child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                             **kwargs)
    combined_output, _ = child.communicate()
    if child.returncode == 0:
        return combined_output
    raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
                    % (args, combined_output))
def use_ccache():
    '''Returns true if ccache is available and should be used.

    ccache is never used when DISABLE_CCACHE is present in the environment (whatever its
    value). Otherwise availability is probed by running "ccache -V"; any failure to run
    it means ccache is not available.'''
    if 'DISABLE_CCACHE' in os.environ:
        return False
    try:
        exec_cmd(['ccache', '-V'])
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
        return False
    return True
def select_cc():
    '''Return the C compiler command to use, as a string, or None when the compiler is
    not available. The command is prefixed with "ccache" when use_ccache() allows it.'''
    # Toolchain gcc is used for ABI compatibility with other toolchain packages, e.g.
    # Kudu/kudu-python.
    if not have_toolchain():
        return None
    gcc_path = os.path.join(toolchain_pkg_dir("gcc"), "bin/gcc")
    if not os.path.exists(gcc_path):
        return None
    if use_ccache():
        return "ccache %s" % gcc_path
    return gcc_path
def exec_pip_install(args, cc="no-cc-available", env=None):
    '''Executes "pip install" with the provided command line arguments, then installs the
    internal python lib found at $IMPALA_HOME/lib/python in editable mode.

    If 'cc' is set, it is used as the C compiler. Otherwise compilation of C/C++ code is
    disabled by setting the CC environment variable to a bogus value.
    Other environment vars can optionally be set with the 'env' argument. By default the
    current process's environment variables are inherited. The dict passed as 'env' is
    never modified; pip is run with a copy.

    Raises an exception if any of the executed commands fails.'''
    # Copy so that neither os.environ nor the caller's dict picks up CC and friends.
    if env is None:
        env = os.environ
    env = dict(env)
    env["CC"] = cc

    # Parallelize the slow numpy build.
    # Use getconf instead of nproc because it is supported more widely, e.g. on older
    # linux distributions.
    env["NPY_NUM_BUILD_JOBS"] = exec_cmd(["getconf", "_NPROCESSORS_ONLN"]).strip()

    # Don't call the virtualenv pip directly, it uses a hashbang to call the python
    # virtualenv using an absolute path. If the path to the virtualenv is very long, the
    # hashbang won't work.
    impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
                           os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]

    # Passes --no-binary for IMPALA-3767: without this, Cython (and
    # several other packages) fail download.
    #
    # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
    # with different compilers or settings.
    third_party_pkg_install_cmd = \
        impala_pip_base_cmd[:] + ["--no-binary", ":all:", "--no-cache-dir"]

    # When using a custom mirror, we also must use the index of that mirror.
    if "PYPI_MIRROR" in os.environ:
        third_party_pkg_install_cmd.extend(["--index-url",
                                            "%s/simple" % os.environ["PYPI_MIRROR"]])
    else:
        # Prevent fetching additional packages from the index. If we forget to add a
        # package to one of the requirements.txt files, this should trigger an error.
        # However, we will still access the index for version/dependency resolution,
        # hence we need to change it when using a private mirror.
        third_party_pkg_install_cmd.append("--no-index")

    # Packages bundled in DEPS_DIR are always offered to pip as a local source.
    third_party_pkg_install_cmd.extend(["--find-links",
        "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))])
    third_party_pkg_install_cmd.extend(args)
    exec_cmd(third_party_pkg_install_cmd, env=env)

    # Finally, we want to install the packages from our own internal python lib. This
    # command runs with the unmodified environment of the current process.
    local_package_install_cmd = impala_pip_base_cmd + \
        ['-e', os.path.join(os.getenv('IMPALA_HOME'), 'lib', 'python')]
    exec_cmd(local_package_install_cmd)
def find_file(*paths):
    '''Joins 'paths' into a single glob pattern and returns the one path that matches it.
    An exception is raised when nothing matches or when more than one path matches.
    Ex: find_file('/etc', 'h*sts') --> /etc/hosts
    '''
    pattern = os.path.join(*paths)
    matches = glob.glob(pattern)
    if not matches:
        raise Exception("No file found at %s" % pattern)
    if len(matches) > 1:
        raise Exception("Found too many files at %s: %s" % (pattern, matches))
    return matches[0]
def detect_python_cmd():
    '''Returns the path of a system command that provides python 2.6 or greater.

    Each candidate command name is looked up in every directory of PATH (os.defpath is
    used when PATH is not set), in order of preference. A candidate is accepted when it
    is an executable regular file and a small version-check script run with it exits
    with status 0.

    Raises an exception if no suitable python is found.'''
    search_dirs = os.environ.get("PATH", os.defpath).split(os.path.pathsep)
    # Exits with 0 only on python >= 2.6. Kept to syntax that very old pythons accept.
    version_check = "import sys\nsys.exit(int(sys.version_info[:2] < (2, 6)))"
    for cmd in ("python", "python27", "python2.7", "python-27", "python-2.7", "python26",
                "python2.6", "python-26", "python-2.6"):
        for search_dir in search_dirs:
            cmd_path = os.path.join(search_dir, cmd)
            # isfile() rather than exists(): a directory with a matching name also
            # passes the X_OK check but cannot be executed.
            if not os.path.isfile(cmd_path) or not os.access(cmd_path, os.X_OK):
                continue
            if subprocess.call([cmd_path, "-c", version_check]) == 0:
                return cmd_path
    raise Exception("Could not find minimum required python version 2.6")
def install_deps():
    '''Installs the first and second stage requirements into the virtualenv, marking
    each stage as installed right after its pip install succeeds.'''
    stages = (
        ("Installing packages into the virtualenv", REQS_PATH),
        ("Installing stage 2 packages into the virtualenv", REQS2_PATH),
    )
    for announcement, stage_reqs_path in stages:
        LOG.info(announcement)
        exec_pip_install(["-r", stage_reqs_path])
        mark_reqs_installed(stage_reqs_path)
def have_toolchain():
    '''Return true if the Impala toolchain is available, i.e. IMPALA_TOOLCHAIN is set in
    the environment.'''
    return os.environ.get("IMPALA_TOOLCHAIN") is not None
def toolchain_pkg_dir(pkg_name):
    '''Return the path to the toolchain package: $IMPALA_TOOLCHAIN/<pkg_name>-<version>,
    where the version is read from the IMPALA_<PKG_NAME>_VERSION environment variable.'''
    version_var = "IMPALA_%s_VERSION" % pkg_name.upper()
    versioned_name = "%s-%s" % (pkg_name, os.environ[version_var])
    return os.path.join(os.environ["IMPALA_TOOLCHAIN"], versioned_name)
def install_compiled_deps_if_possible():
    '''Install dependencies that require compilation with toolchain GCC, if the toolchain
    is available. Returns true if the deps are installed (either already present or
    installed by this call) and false if no compiler is available yet.'''
    if reqs_are_installed(COMPILED_REQS_PATH):
        LOG.debug("Skipping compiled deps: matching compiled-installed-requirements.txt found")
        return True
    cc = select_cc()
    if cc is None:
        LOG.debug("Skipping compiled deps: cc not available yet")
        return False

    env = dict(os.environ)

    # Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
    # problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
    # workaround.
    distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
    # Report through the logger rather than printing to stdout.
    LOG.debug("Detected distro version: %s", distro_version)
    if distro_version.startswith("centos5."):
        env["CFLAGS"] = "-fgnu89-inline"

    LOG.info("Installing compiled requirements into the virtualenv")
    exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
    mark_reqs_installed(COMPILED_REQS_PATH)
    return True
def install_adls_deps():
    '''Installs the ADLS requirements when TARGET_FILESYSTEM is "adls"; otherwise does
    nothing.

    Returns True if the ADLS requirements were already installed, None in all other
    cases. Raises an exception if the requirements must be installed but no C compiler
    is available.'''
    # The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
    # which is why we break this into a separate step. If the target filesystem is
    # ADLS, the expectation is that the dev environment is running at least CentOS 6.7.
    if os.environ.get('TARGET_FILESYSTEM') != "adls":
        return
    if reqs_are_installed(ADLS_REQS_PATH):
        LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
        return True
    cc = select_cc()
    # Raise explicitly instead of asserting: asserts are stripped when python runs
    # with -O.
    if cc is None:
        raise Exception("Cannot install ADLS packages: no C compiler is available")
    LOG.info("Installing ADLS packages into the virtualenv")
    exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
    mark_reqs_installed(ADLS_REQS_PATH)
def install_kudu_client_if_possible():
    '''Installs the Kudu python module if possible, which depends on the toolchain and
    the compiled requirements in compiled-requirements.txt. If the toolchain isn't
    available, nothing will be done. Also nothing will be done if the Kudu client lib
    required by the module isn't available (as determined by KUDU_IS_SUPPORTED).

    Raises an exception if the module should be installed but no C compiler is
    available, if the Kudu client cannot be found, or if the pip install fails.'''
    if reqs_are_installed(KUDU_REQS_PATH):
        LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
        return
    if os.environ["KUDU_IS_SUPPORTED"] != "true":
        LOG.debug("Skipping Kudu: Kudu is not supported")
        return
    kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
    if not os.path.exists(kudu_base_dir):
        LOG.debug("Skipping Kudu: %s doesn't exist", kudu_base_dir)
        return

    LOG.info("Installing Kudu into the virtualenv")
    # The installation requires that KUDU_HOME/build/latest exists. An empty directory
    # structure will be made to satisfy that. The Kudu client headers and lib will be made
    # available through GCC environment variables.
    # A fresh, uniquely named directory is used so that concurrent runs (or a leftover
    # directory owned by another user) cannot interfere with each other.
    fake_kudu_build_dir = tempfile.mkdtemp(prefix="virtualenv-kudu")
    try:
        artifact_dir = os.path.join(fake_kudu_build_dir, "build", "latest")
        if not os.path.exists(artifact_dir):
            os.makedirs(artifact_dir)
        cc = select_cc()
        # Raise explicitly instead of asserting: asserts are stripped when python runs
        # with -O.
        if cc is None:
            raise Exception("Cannot install Kudu: no C compiler is available")
        env = dict(os.environ)
        env["KUDU_HOME"] = fake_kudu_build_dir
        kudu_client_dir = find_kudu_client_install_dir()
        env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
        env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
                                                    os.path.join(kudu_client_dir, 'lib64')])
        exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
        mark_reqs_installed(KUDU_REQS_PATH)
    finally:
        # Cleanup is best effort; a failure here must not hide the install result.
        try:
            shutil.rmtree(fake_kudu_build_dir)
        except Exception:
            LOG.debug("Error removing temp Kudu build dir", exc_info=True)
def find_kudu_client_install_dir():
    '''Returns the directory that is expected to contain the Kudu client "include" and
    "lib"/"lib64" directories.

    If KUDU_CLIENT_DIR is set to a non-empty value, the result is
    $KUDU_CLIENT_DIR/usr/local and the client must exist there. If it is unset or empty,
    the result is $IMPALA_KUDU_HOME/debug, and the client is only required to exist
    there when $IMPALA_KUDU_HOME itself exists.

    Raises an exception if the client is required but its header or library is missing,
    and KeyError if IMPALA_KUDU_HOME is needed but not set.'''
    # An unset KUDU_CLIENT_DIR is treated the same as an empty one.
    custom_client_dir = os.environ.get("KUDU_CLIENT_DIR")
    if custom_client_dir:
        install_dir = os.path.join(custom_client_dir, "usr", "local")
        error_if_kudu_client_not_found(install_dir)
    else:
        # If the toolchain appears to have been setup already, then the Kudu client is
        # required to exist. It's possible that the toolchain won't be setup yet though
        # since the toolchain bootstrap script depends on the virtualenv.
        kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
        install_dir = os.path.join(kudu_base_dir, "debug")
        if os.path.exists(kudu_base_dir):
            error_if_kudu_client_not_found(install_dir)
    return install_dir
def error_if_kudu_client_not_found(install_dir):
    '''Raises an exception unless 'install_dir' contains the Kudu client header
    (include/kudu/client/client.h) and libkudu_client.so. The library is searched for
    anywhere below lib64 or, when lib64 does not exist, below lib.'''
    client_header = os.path.join(install_dir, "include", "kudu", "client", "client.h")
    if not os.path.exists(client_header):
        raise Exception("Kudu client header not found at %s" % client_header)

    kudu_client_lib = "libkudu_client.so"
    lib_dir = os.path.join(install_dir, "lib64")
    if not os.path.exists(lib_dir):
        lib_dir = os.path.join(install_dir, "lib")
    if any(kudu_client_lib in names for _, _, names in os.walk(lib_dir)):
        return
    raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
def mark_reqs_installed(reqs_path):
    '''Records that the requirements in the given file are installed: the file is copied,
    under the same file name, into the root directory of the virtualenv.'''
    marker_name = os.path.basename(reqs_path)
    shutil.copyfile(reqs_path, os.path.join(ENV_DIR, marker_name))
def reqs_are_installed(reqs_path):
    '''Tells whether the requirements from the given file are installed in the
    virtualenv. They count as installed when the root directory of the virtualenv holds
    a file of the same name whose content equals the content of the given file.'''
    marker_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
    if not os.path.exists(marker_path):
        return False
    with open(marker_path) as marker_file:
        with open(reqs_path) as current_file:
            up_to_date = current_file.read() == marker_file.read()
    if not up_to_date:
        LOG.debug("Virtualenv upgrade needed")
    return up_to_date
def setup_virtualenv_if_not_exists():
    '''Rebuilds the virtualenv from scratch unless both the first and the second stage
    requirements are already marked as installed in it.'''
    if reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH):
        return
    delete_virtualenv_if_exist()
    create_virtualenv()
    install_deps()
    LOG.debug("Virtualenv setup complete")
if __name__ == "__main__":
    # Command line interface.
    rebuild_help = ("Force a rebuild of"
                    " the virtualenv even if it exists and appears to be completely"
                    " up-to-date.")
    ld_path_help = ("Print the"
                    " LD_LIBRARY_PATH that should be used when running python from the"
                    " virtualenv.")
    parser = optparse.OptionParser()
    parser.add_option("-l", "--log-level", default="INFO",
                      choices=("DEBUG", "INFO", "WARN", "ERROR"))
    parser.add_option("-r", "--rebuild", action="store_true", help=rebuild_help)
    parser.add_option("--print-ld-library-path", action="store_true", help=ld_path_help)
    options, args = parser.parse_args()

    # Query mode: only print the library path of the Kudu client, then stop.
    if options.print_ld_library_path:
        kudu_client_dir = find_kudu_client_install_dir()
        lib_dirs = [os.path.join(kudu_client_dir, 'lib'),
                    os.path.join(kudu_client_dir, 'lib64')]
        print(os.path.pathsep.join(lib_dirs))
        sys.exit()

    logging.basicConfig(level=getattr(logging, options.log_level))
    if options.rebuild:
        delete_virtualenv_if_exist()

    # Complete as many bootstrap steps as possible (see file comment for the steps).
    setup_virtualenv_if_not_exists()
    # The Kudu and ADLS steps are only attempted once the compiled deps are installed.
    if install_compiled_deps_if_possible():
        install_kudu_client_if_possible()
        install_adls_deps()