#!/usr/bin/env impala-python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# The purpose of this script is to download prebuilt binaries and jar files to satisfy the
# third-party dependencies for Impala. The script checks for the presence of IMPALA_HOME
# and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is correctly setup and
# that we can deduce the version settings of the dependencies from the environment.
# IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should be extracted
# to. If DOWNLOAD_CDH_COMPONENTS is set to true, this script will also download and extract
# the CDH components (i.e. Hadoop, Hive, HBase and Sentry) into $CDH_COMPONENTS_HOME.
#
# The script is called as follows without any additional parameters:
#
#     python <path to this script>
import os
import re
import sh
import shutil
import subprocess
import sys
import tempfile
# Base URL that every download path built in this file is appended to.
# NOTE(review): the value is empty here, so HOST + path is not a usable URL. The
# original base URL appears to have been lost -- restore it before running the script.
HOST = ""

# Maps an OS release key to the label used in the names of the prebuilt toolchain
# packages. Keys are compared against the lower-cased, space-free output of
# `lsb_release -irs` (distribution id followed by release number).
# Ubuntu 15.04 and 15.10 reuse the 14.04 packages; "suse12.2" reuses the SLES 12 ones.
OS_MAPPING = {
    "centos6": "ec2-package-centos-6",
    "centos5": "ec2-package-centos-5",
    "centos7": "ec2-package-centos-7",
    "debian6": "ec2-package-debian-6",
    "debian7": "ec2-package-debian-7",
    "debian8": "ec2-package-debian-8",
    "suselinux11": "ec2-package-sles-11",
    "suselinux12": "ec2-package-sles-12",
    "suse12.2": "ec2-package-sles-12",
    "ubuntu12.04": "ec2-package-ubuntu-12-04",
    "ubuntu14.04": "ec2-package-ubuntu-14-04",
    "ubuntu15.04": "ec2-package-ubuntu-14-04",
    "ubuntu15.10": "ec2-package-ubuntu-14-04",
    "ubuntu16.04": "ec2-package-ubuntu-16-04",
}
def try_get_platform_release_label():
    """Gets the right package label from the OS version. Return None if not found.

    Non-raising wrapper around get_platform_release_label(): any exception raised
    while determining the label is swallowed and reported as None.
    """
    try:
        return get_platform_release_label()
    except Exception:
        # Deliberate best-effort: callers treat None as "no prebuilt packages".
        return None
def get_platform_release_label(release=None):
    """Gets the right package label from the OS version. Raise exception if not found.

    'release' can be provided to override the underlying OS version. When it is not
    given, the release is taken from `lsb_release -irs`, lower-cased and with all
    whitespace removed (e.g. "ubuntu14.04").

    Returns the OS_MAPPING value of the first key that matches the release string.
    Raises Exception if no key matches.
    """
    if not release:
        release = "".join(part.lower() for part in sh.lsb_release("-irs").split())
    # Keys are used as regular expressions and searched for anywhere in the release
    # string, so a longer release string still hits its key.
    for pattern, label in OS_MAPPING.items():
        if re.search(pattern, release):
            return label
    raise Exception("Could not find package label for OS version: {0}.".format(release))
def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
    """Download a gzipped tarball with wget, extract it, then delete the tarball.

    download_path: full URL of the archive.
    file_name: name of the archive as it is saved inside `destination`.
    destination: directory that receives both the download and the extracted files.
    wget_no_clobber: passed to wget as --no-clobber.
    """
    archive_path = os.path.join(destination, file_name)
    print("URL {0}".format(download_path))
    print("Downloading {0} to {1}".format(file_name, destination))
    # --no-clobber avoids downloading the file if a file with the name already exists
    sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
    print("Extracting {0}".format(file_name))
    sh.tar(z=True, x=True, f=archive_path, directory=destination)
    # The archive is no longer needed once its contents have been extracted.
    sh.rm(archive_path)
def download_package(destination, product, version, compiler, platform_release=None):
    """Download one prebuilt toolchain package and unpack it into `destination`.

    Any existing directory for the package is removed first. The archive name and URL
    are built from the product, version, compiler, the platform label and the
    IMPALA_TOOLCHAIN_BUILD_ID environment variable.

    platform_release: optional OS release that overrides the detected one when the
        platform label is looked up.
    Raises KeyError if IMPALA_TOOLCHAIN_BUILD_ID is not set.
    """
    remove_existing_package(destination, product, version)

    toolchain_build_id = os.environ["IMPALA_TOOLCHAIN_BUILD_ID"]
    label = get_platform_release_label(release=platform_release)
    format_params = {'product': product, 'version': version, 'compiler': compiler,
                     'label': label, 'toolchain_build_id': toolchain_build_id}
    file_name = "{product}-{version}-{compiler}-{label}.tar.gz".format(**format_params)
    format_params['file_name'] = file_name
    url_path = "/{toolchain_build_id}/{product}/{version}-{compiler}/{file_name}".format(
        **format_params)
    download_path = HOST + url_path

    # no_clobber=True: do not fetch again if the archive is already present.
    wget_and_unpack_package(download_path, file_name, destination, True)
def bootstrap(toolchain_root, packages):
    """Downloads and unpacks each package in the list `packages` into `toolchain_root` if it
    doesn't exist already.

    When no package label can be determined for this platform, nothing is downloaded;
    instead check_custom_toolchain() verifies that the packages are already present.
    Reads IMPALA_GCC_VERSION and, for the "kudu" package, KUDU_IS_SUPPORTED from the
    environment.
    """
    label = try_get_platform_release_label()
    if not label:
        check_custom_toolchain(toolchain_root, packages)
        return

    # Detect the compiler
    compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])

    for p in packages:
        pkg_name, pkg_version = unpack_name_and_version(p)
        if check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler):
            continue
        if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
            download_package(toolchain_root, pkg_name, pkg_version, compiler)
        else:
            # Kudu is unsupported here: install a stub client library instead.
            build_kudu_stub(toolchain_root, pkg_version, compiler)
        # Record what was installed so the next run can skip this package.
        write_version_file(toolchain_root, pkg_name, pkg_version, compiler, label)
def check_output(cmd_args):
    """Run the command and return the output. Raise an exception if the command returns
    a non-zero return code. Similar to subprocess.check_output() which is only provided
    in python 2.7.

    cmd_args: argument list passed to subprocess.Popen.
    Returns the combined stdout and stderr of the command (stderr is redirected into
    stdout).
    """
    process = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = process.communicate()
    # communicate() has already waited for the process, so returncode is set.
    if process.returncode != 0:
        raise Exception("Command with args '%s' failed with exit code %s:\n%s"
                        % (cmd_args, process.returncode, stdout))
    return stdout
def package_directory(toolchain_root, pkg_name, pkg_version):
    """Return the directory of a package: `<toolchain_root>/<pkg_name>-<pkg_version>`."""
    dir_name = "{0}-{1}".format(pkg_name, pkg_version)
    return os.path.join(toolchain_root, dir_name)
def version_file_path(toolchain_root, pkg_name, pkg_version):
    """Return the path of the version marker file inside a package's directory."""
    # NOTE(review): the file name argument was missing from this call; the name below
    # is restored from the upstream script -- confirm it matches existing toolchains.
    return os.path.join(package_directory(toolchain_root, pkg_name, pkg_version),
                        "toolchain_package_version.txt")
def check_custom_toolchain(toolchain_root, packages):
    """Verify that every package directory already exists under `toolchain_root`.

    Prints the missing packages with their expected directories and raises Exception
    if at least one package directory is absent. Does nothing otherwise.
    """
    missing = []
    for p in packages:
        pkg_name, pkg_version = unpack_name_and_version(p)
        pkg_dir = package_directory(toolchain_root, pkg_name, pkg_version)
        if not os.path.isdir(pkg_dir):
            missing.append((p, pkg_dir))

    if missing:
        print("The following packages are not in their expected locations.")
        for p, pkg_dir in missing:
            print(" %s (expected directory %s to exist)" % (p, pkg_dir))
        print("Pre-built toolchain archives not available for your platform.")
        print("Clone and build native toolchain from source using this repository:")
        # NOTE(review): the line naming the repository was missing; URL restored from
        # the upstream script -- confirm it is still the right location.
        print("    https://github.com/cloudera/native-toolchain")
        raise Exception("Toolchain bootstrap failed: required packages were missing")
def check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler):
    """Return true if toolchain_root already contains the package with the correct
    version and compiler.

    The check compares the content of the package's version file with the string
    "<name>-<version>-<compiler>-<platform label>"; a missing file counts as absent.
    """
    version_file = version_file_path(toolchain_root, pkg_name, pkg_version)
    if not os.path.exists(version_file):
        return False

    label = get_platform_release_label()
    pkg_version_string = "{0}-{1}-{2}-{3}".format(pkg_name, pkg_version, compiler, label)
    with open(version_file) as f:
        # strip() tolerates a trailing newline in the version file.
        return f.read().strip() == pkg_version_string
def write_version_file(toolchain_root, pkg_name, pkg_version, compiler, label):
    """Write "<name>-<version>-<compiler>-<label>" into the package's version file.

    The file is overwritten if it already exists.
    """
    version_string = "{0}-{1}-{2}-{3}".format(pkg_name, pkg_version, compiler, label)
    with open(version_file_path(toolchain_root, pkg_name, pkg_version), 'w') as f:
        f.write(version_string)
def remove_existing_package(toolchain_root, pkg_name, pkg_version):
    """Delete the package's directory tree under `toolchain_root` if it exists."""
    dir_path = package_directory(toolchain_root, pkg_name, pkg_version)
    if os.path.exists(dir_path):
        print("Removing existing package directory {0}".format(dir_path))
        shutil.rmtree(dir_path)
def unpack_name_and_version(package):
    """A package definition is either a string where the version is fetched from the
    environment or a tuple where the package name and the package version are fully
    specified.

    For a string, the version is read from the environment variable
    IMPALA_<NAME>_VERSION, where <NAME> is the upper-cased package name with "-"
    replaced by "_".

    Returns a (name, version) pair.
    Raises Exception if the environment variable for a string package is not set.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode.
    except NameError:
        string_types = str  # Python 3 has no basestring.

    if isinstance(package, string_types):
        env_var = "IMPALA_{0}_VERSION".format(package).replace("-", "_").upper()
        try:
            return package, os.environ[env_var]
        except KeyError:
            raise Exception("Could not find version for {0} in environment var {1}".format(
                package, env_var))
    return package[0], package[1]
def build_kudu_stub(toolchain_root, kudu_version, compiler):
    """Install a stubbed Kudu client library for platforms where Kudu isn't supported.

    The CentOS 7 Kudu package is downloaded, the exported symbols of its client
    library are listed with nm, and a C++ source defining each of them as a function
    that crashes is generated and compiled with g++. Every real client library file
    found in the package is then overwritten with the stub.

    Raises Exception if no client library is found, if a symbol unrelated to Kudu is
    exported, if the version symbol is missing, or if the soname cannot be read.
    """
    # NOTE(review): the client library file name was missing from this block; it is
    # restored as the Kudu C++ client library name -- confirm against the package.
    client_lib_name = "libkudu_client.so"

    # When Kudu isn't supported, the CentOS 7 package will be downloaded and the client
    # lib will be replaced with a stubbed client.
    download_package(toolchain_root, "kudu", kudu_version, compiler,
                     platform_release="centos7")

    # Find the client lib files in the extracted dir. There may be several files with
    # various extensions. Also there will be a debug version.
    kudu_dir = package_directory(toolchain_root, "kudu", kudu_version)
    client_lib_paths = []
    for dir_path, _, file_names in os.walk(kudu_dir):
        for file_name in file_names:
            if not file_name.startswith(client_lib_name):
                continue
            file_path = os.path.join(dir_path, file_name)
            # Symlinks are skipped; only the files they point at get replaced.
            if os.path.islink(file_path):
                continue
            client_lib_paths.append(file_path)
    if not client_lib_paths:
        raise Exception("Unable to find Kudu client lib under '%s'" % kudu_dir)

    # The client stub will be created by inspecting a real client and extracting the
    # symbols. The choice of which client file to use shouldn't matter.
    client_lib_path = client_lib_paths[0]

    # Use a newer version of binutils because on older systems the default binutils may
    # not be able to read the newer binary.
    binutils_dir = package_directory(
        toolchain_root, "binutils", os.environ["IMPALA_BINUTILS_VERSION"])
    nm_path = os.path.join(binutils_dir, "bin", "nm")
    objdump_path = os.path.join(binutils_dir, "bin", "objdump")
    cpp_filt_path = os.path.join(binutils_dir, "bin", "c++filt")

    # Extract the symbols and write the stubbed client source. There is a special method
    # kudu::client::GetShortVersionString() that is overridden so that the stub can be
    # identified by the caller.
    get_short_version_sig = "kudu::client::GetShortVersionString()"
    nm_out = check_output([nm_path, "--defined-only", "-D", client_lib_path])

    stub_build_dir = tempfile.mkdtemp()
    try:
        # The source file name is arbitrary; it only lives inside the temp dir.
        stub_client_src_path = os.path.join(stub_build_dir, "kudu_client.cc")
        found_start_version_symbol = False
        with open(stub_client_src_path, "w") as stub_client_src_file:
            stub_client_src_file.write("""
#include <string>

static const std::string kFakeKuduVersion = "__IMPALA_KUDU_STUB__";

static void KuduNotSupported() {
    *((char*)0) = 0;
}

namespace kudu { namespace client {
std::string GetShortVersionString() { return kFakeKuduVersion; }
}}
""")
            for line in nm_out.splitlines():
                addr, sym_type, mangled_name = line.split(" ")
                # Skip special functions and anything that isn't a strong symbol. Any
                # symbols that get past this check must be related to Kudu. If a symbol
                # unrelated to Kudu (ex: a boost symbol) gets defined in the stub,
                # there's a chance the symbol could get used and crash Impala.
                if mangled_name in ["_init", "_fini"] or sym_type not in "Tt":
                    continue
                demangled_name = check_output([cpp_filt_path, mangled_name]).strip()
                if "kudu" not in demangled_name:
                    # Raised explicitly so the check also runs under `python -O`.
                    raise Exception(
                        "Symbol doesn't appear to be related to Kudu: " + demangled_name)
                if demangled_name == get_short_version_sig:
                    # Already defined by the preamble written above.
                    found_start_version_symbol = True
                    continue
                stub_client_src_file.write("""
extern "C" void %s() {
    KuduNotSupported();
}
""" % mangled_name)

        if not found_start_version_symbol:
            raise Exception("Expected to find symbol a corresponding to"
                            " %s but it was not found." % get_short_version_sig)

        # The soname is needed to avoid problem in packaging builds. Without the soname,
        # the library dependency as listed in the impalad binary will be a full path
        # instead of a short name. Debian in particular has problems with packaging
        # when that happens.
        objdump_out = check_output([objdump_path, "-p", client_lib_path])
        for line in objdump_out.splitlines():
            if "SONAME" not in line:
                continue
            # The line that needs to be parsed should be something like:
            # "  SONAME               <library soname>"
            so_name = line.split()[1]
            break
        else:
            raise Exception("Unable to extract soname from %s" % client_lib_path)

        # Compile the library.
        stub_client_lib_path = os.path.join(stub_build_dir, client_lib_name)
        subprocess.check_call(["g++", stub_client_src_path, "-shared", "-fPIC",
                               "-Wl,-soname,%s" % so_name, "-o", stub_client_lib_path])

        # Replace the real libs with the stub.
        for client_lib_path in client_lib_paths:
            shutil.copyfile(stub_client_lib_path, client_lib_path)
    finally:
        # Always remove the temporary build directory, even when a step above fails.
        shutil.rmtree(stub_build_dir)
def download_cdh_components(toolchain_root, cdh_components):
    """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found.

    toolchain_root: not used by this function.
    cdh_components: package definitions as accepted by unpack_name_and_version().

    Prints a message and returns without downloading when $CDH_COMPONENTS_HOME is not
    set. Components whose directory already exists are skipped.
    """
    cdh_components_home = os.getenv("CDH_COMPONENTS_HOME")
    if not cdh_components_home:
        print("Impala environment not set up correctly, make sure "
              "$CDH_COMPONENTS_HOME is present.")
        return

    # Create the directory where CDH components live if necessary.
    if not os.path.exists(cdh_components_home):
        os.makedirs(cdh_components_home)

    # The URL prefix of where CDH components live in S3.
    download_path_prefix = HOST + "/cdh_components/"

    for component in cdh_components:
        pkg_name, pkg_version = unpack_name_and_version(component)
        pkg_directory = package_directory(cdh_components_home, pkg_name, pkg_version)
        if os.path.isdir(pkg_directory):
            continue

        # Download the package if it doesn't exist
        file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version)
        download_path = download_path_prefix + file_name
        wget_and_unpack_package(download_path, file_name, cdh_components_home, False)
if __name__ == "__main__":
    # Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment.
    # By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be
    # present as well. Will create the directory specified by $IMPALA_TOOLCHAIN if it
    # doesn't exist yet. Each of the packages specified in `packages` is downloaded and
    # extracted into $IMPALA_TOOLCHAIN. If $DOWNLOAD_CDH_COMPONENTS is true, the CDH
    # components listed in `cdh_components` are also downloaded into the directory
    # specified by $CDH_COMPONENTS_HOME.
    if not os.getenv("IMPALA_HOME"):
        # NOTE(review): the script name was missing from this message; restored as
        # impala-config.sh -- confirm.
        print("Impala environment not set up correctly, make sure "
              "impala-config.sh is sourced.")
        sys.exit(1)

    # Create the destination directory if necessary
    toolchain_root = os.getenv("IMPALA_TOOLCHAIN")
    if not toolchain_root:
        print("Impala environment not set up correctly, make sure "
              "$IMPALA_TOOLCHAIN is present.")
        sys.exit(1)

    if not os.path.exists(toolchain_root):
        os.makedirs(toolchain_root)

    # A plain string takes its version from IMPALA_<NAME>_VERSION; a tuple pins the
    # version explicitly (llvm is installed in two versions).
    packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil",
                "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "kudu",
                "libev", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4", "openldap",
                "openssl", "protobuf", "rapidjson", "re2", "snappy", "thrift", "tpc-h",
                "tpc-ds", "zlib"]
    bootstrap(toolchain_root, packages)

    # Download the CDH components if necessary.
    if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
        cdh_components = ["hadoop", "hbase", "hive", "llama-minikdc", "sentry"]
        download_cdh_components(toolchain_root, cdh_components)