| #!/usr/bin/env impala-python |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # The purpose of this script is to download prebuilt binaries and jar files to satisfy the |
| # third-party dependencies for Impala. The script checks for the presence of IMPALA_HOME |
| # and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is correctly setup and |
| # that we can deduce the version settings of the dependencies from the environment. |
| # IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should be extracted |
| # to. If DOWNLOAD_CDH_COMPONENTS is set to true, this script will also download and extract |
# the CDH components (i.e. Hadoop, Hive, HBase, Llama-minikdc and Sentry) into
# CDH_COMPONENTS_HOME.
| # |
| # By default, packages are downloaded from an S3 bucket named native-toolchain. |
| # The exact URL is based on IMPALA_<PACKAGE>_VERSION environment variables |
| # (configured in impala-config.sh) as well as the OS version being built on. |
| # The URL can be overridden with an IMPALA_<PACKAGE>_URL environment variable |
| # set in impala-config-{local,branch}.sh. |
| # |
| # The script is called as follows without any additional parameters: |
| # |
| # python bootstrap_toolchain.py |
| import logging |
| import os |
| import random |
| import re |
| import sh |
| import shutil |
| import subprocess |
| import sys |
| import tempfile |
| import time |
| |
# Base URL of the S3 bucket that hosts all prebuilt toolchain artifacts.
HOST = "https://native-toolchain.s3.amazonaws.com/build"

# Maps a normalized "lsb_release -irs" string (lowercased, whitespace removed; the
# keys are applied as regexes in get_platform_release_label()) to the label under
# which packages for that OS were published. Several OS releases share one
# package build (e.g. Ubuntu 15.x reuses the 14.04 packages).
OS_MAPPING = {
  "centos6" : "ec2-package-centos-6",
  "centos5" : "ec2-package-centos-5",
  "centos7" : "ec2-package-centos-7",
  "redhatenterpriseserver5" : "ec2-package-centos-5",
  "redhatenterpriseserver6" : "ec2-package-centos-6",
  "redhatenterpriseserver7" : "ec2-package-centos-7",
  "debian6" : "ec2-package-debian-6",
  "debian7" : "ec2-package-debian-7",
  "debian8" : "ec2-package-debian-8",
  "suselinux11": "ec2-package-sles-11",
  "suselinux12": "ec2-package-sles-12",
  "suse12.2": "ec2-package-sles-12",
  "ubuntu12.04" : "ec2-package-ubuntu-12-04",
  "ubuntu14.04" : "ec2-package-ubuntu-14-04",
  "ubuntu15.04" : "ec2-package-ubuntu-14-04",
  "ubuntu15.10" : "ec2-package-ubuntu-14-04",
  "ubuntu16.04" : "ec2-package-ubuntu-16-04",
}
| |
class Package(object):
  """A single downloadable toolchain package.

  The version, when not passed explicitly, is read from the environment
  variable IMPALA_<NAME>_VERSION and is required. The download URL, when not
  passed, is read from IMPALA_<NAME>_URL and is optional (a None url means
  the default S3 location is used).
  """
  def __init__(self, name, version=None, url=None):
    self.name = name
    self.version = version
    self.url = url
    # Environment variables use underscores and upper case, e.g. "tpc-h" -> "TPC_H".
    env_name = name.replace("-", "_").upper()
    if self.version is None:
      version_var = "IMPALA_{0}_VERSION".format(env_name)
      self.version = os.environ.get(version_var)
      if not self.version:
        raise Exception("Could not find version for {0} in environment var {1}".format(
            name, version_var))
    if self.url is None:
      self.url = os.environ.get("IMPALA_{0}_URL".format(env_name))
| |
def try_get_platform_release_label():
  """Gets the right package label from the OS version. Return None if not found."""
  try:
    return get_platform_release_label()
  except Exception:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate; any lookup failure means "unsupported platform".
    return None
| |
# Cache "lsb_release -irs" to avoid excessive logging from sh, and
# to shave a little bit of time. None until the first probe; populated
# lazily by get_platform_release_label().
lsb_release_cache = None
| |
def get_platform_release_label(release=None):
  """Gets the right package label from the OS version. Raise exception if not found.
  'release' can be provided to override the underlying OS version.

  Returns the matching ec2-package-* label from OS_MAPPING.
  """
  global lsb_release_cache
  if not release:
    if lsb_release_cache:
      release = lsb_release_cache
    else:
      # Normalize e.g. "Ubuntu\n14.04" -> "ubuntu14.04".
      release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split()))
      # Only need to check against the major release if RHEL or CentOS
      for platform in ['centos', 'redhatenterpriseserver']:
        if platform in release:
          release = release.split('.')[0]
          break
      lsb_release_cache = release
  # Use items() instead of the Python-2-only iteritems() so this also runs on
  # Python 3.
  for k, v in OS_MAPPING.items():
    if re.search(k, release):
      return v

  raise Exception("Could not find package label for OS version: {0}.".format(release))
| |
def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
  """Downloads `download_path` into `destination` and unpacks the tarball there.

  `download_path` must end with "/<file_name>". Retries the download up to 3
  times with a randomized sleep between attempts, then extracts the archive
  and deletes the downloaded tarball.
  """
  if not download_path.endswith("/" + file_name):
    raise Exception("URL {0} does not match with expected file_name {1}"
        .format(download_path, file_name))
  NUM_ATTEMPTS = 3
  for attempt in range(1, NUM_ATTEMPTS + 1):
    logging.info("Downloading {0} to {1}/{2} (attempt {3})".format(
        download_path, destination, file_name, attempt))
    # --no-clobber avoids downloading the file if a file with the name already exists
    try:
      sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
      break
    # "except Exception as e" (not "except Exception, e") is required for
    # Python 3 compatibility and also works on Python 2.6+.
    except Exception as e:
      if attempt == NUM_ATTEMPTS:
        raise
      logging.error("Download failed; retrying after sleep: " + str(e))
      time.sleep(10 + random.random() * 5)  # Sleep between 10 and 15 seconds.
  logging.info("Extracting {0}".format(file_name))
  sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination)
  sh.rm(os.path.join(destination, file_name))
| |
def download_package(destination, package, compiler, platform_release=None):
  """Fetches and unpacks one toolchain package into `destination`.

  Any previously unpacked copy of the package is removed first. The download
  URL is package.url when set; otherwise it is derived from the toolchain
  build id, the package name/version, the compiler and the platform label.
  `platform_release` optionally overrides the detected OS release.
  """
  remove_existing_package(destination, package.name, package.version)

  build_id = os.environ["IMPALA_TOOLCHAIN_BUILD_ID"]
  label = get_platform_release_label(release=platform_release)
  file_name = "{0}-{1}-{2}-{3}.tar.gz".format(
      package.name, package.version, compiler, label)
  if package.url is not None:
    download_path = package.url
  else:
    download_path = "{0}/{1}/{2}/{3}-{4}/{5}".format(
        HOST, build_id, package.name, package.version, compiler, file_name)

  wget_and_unpack_package(download_path, file_name, destination, True)
| |
def bootstrap(toolchain_root, packages):
  """Downloads and unpacks each package in `packages` into `toolchain_root`,
  skipping packages that are already present with the right version.
  """
  # On platforms without prebuilt artifacts a manually-built toolchain must
  # already be in place; verify that and stop.
  if not try_get_platform_release_label():
    check_custom_toolchain(toolchain_root, packages)
    return

  # Detect the compiler
  compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])

  def handle_package(pkg):
    if check_for_existing_package(toolchain_root, pkg.name, pkg.version, compiler):
      return
    # Kudu gets a stubbed client library instead of a download when the
    # platform doesn't support it.
    needs_kudu_stub = (
        pkg.name == "kudu" and os.environ["KUDU_IS_SUPPORTED"] != "true")
    if needs_kudu_stub:
      build_kudu_stub(toolchain_root, pkg.version, compiler)
    else:
      download_package(toolchain_root, pkg, compiler)
    write_version_file(toolchain_root, pkg.name, pkg.version, compiler,
        get_platform_release_label())
  execute_many(handle_package, packages)
| |
def check_output(cmd_args):
  """Run the command and return the output. Raise an exception if the command returns
  a non-zero return code. Similar to subprocess.check_output() which is only provided
  in python 2.7.
  """
  proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  out, _ = proc.communicate()
  exit_code = proc.wait()
  if exit_code != 0:
    raise Exception("Command with args '%s' failed with exit code %s:\n%s"
        % (cmd_args, exit_code, out))
  return out
| |
def package_directory(toolchain_root, pkg_name, pkg_version):
  """Returns the path under `toolchain_root` where <name>-<version> is unpacked."""
  return os.path.join(toolchain_root, "{0}-{1}".format(pkg_name, pkg_version))
| |
def version_file_path(toolchain_root, pkg_name, pkg_version):
  """Returns the path of the marker file that records which exact build of the
  package is currently unpacked."""
  pkg_dir = package_directory(toolchain_root, pkg_name, pkg_version)
  return os.path.join(pkg_dir, "toolchain_package_version.txt")
| |
def check_custom_toolchain(toolchain_root, packages):
  """Verifies a manually-built toolchain provides every required package.

  Logs the full list of missing package directories and raises an exception
  if any are absent; returns silently when everything is present.
  """
  missing = []
  for pkg in packages:
    expected_dir = package_directory(toolchain_root, pkg.name, pkg.version)
    if not os.path.isdir(expected_dir):
      missing.append((pkg, expected_dir))

  if not missing:
    return

  msg = "The following packages are not in their expected locations.\n"
  for pkg, expected_dir in missing:
    msg += " %s (expected directory %s to exist)\n" % (pkg, expected_dir)
  msg += "Pre-built toolchain archives not available for your platform.\n"
  msg += "Clone and build native toolchain from source using this repository:\n"
  msg += " https://github.com/cloudera/native-toolchain\n"
  logging.error(msg)
  raise Exception("Toolchain bootstrap failed: required packages were missing")
| |
def check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler):
  """Return true if toolchain_root already contains the package with the correct
  version and compiler.
  """
  version_file = version_file_path(toolchain_root, pkg_name, pkg_version)
  if not os.path.exists(version_file):
    return False

  # The marker file must match the full name-version-compiler-label string.
  expected = "{0}-{1}-{2}-{3}".format(
      pkg_name, pkg_version, compiler, get_platform_release_label())
  with open(version_file) as f:
    return f.read().strip() == expected
| |
def write_version_file(toolchain_root, pkg_name, pkg_version, compiler, label):
  """Records the exact package build (name-version-compiler-label) that was
  unpacked, so later runs can detect it and skip the download."""
  contents = "{0}-{1}-{2}-{3}".format(pkg_name, pkg_version, compiler, label)
  with open(version_file_path(toolchain_root, pkg_name, pkg_version), 'w') as f:
    f.write(contents)
| |
def remove_existing_package(toolchain_root, pkg_name, pkg_version):
  """Deletes the unpacked directory for the package, if one exists."""
  pkg_dir = package_directory(toolchain_root, pkg_name, pkg_version)
  if os.path.exists(pkg_dir):
    logging.info("Removing existing package directory {0}".format(pkg_dir))
    shutil.rmtree(pkg_dir)
| |
def build_kudu_stub(toolchain_root, kudu_version, compiler):
  """Downloads the prebuilt Kudu package and replaces its client library with a
  stub, for platforms where Kudu isn't supported.

  The stub exports the same dynamic symbols as the real client so linking
  still succeeds. Every stubbed symbol deliberately crashes via a null write,
  except kudu::client::GetShortVersionString(), which instead returns a marker
  string so callers can detect that they are running against the stub.
  """
  # When Kudu isn't supported, the CentOS 7 package will be downloaded and the client
  # lib will be replaced with a stubbed client.
  download_package(toolchain_root, Package("kudu", kudu_version), compiler,
      platform_release="centos7")

  # Find the client lib files in the extracted dir. There may be several files with
  # various extensions. Also there will be a debug version.
  kudu_dir = package_directory(toolchain_root, "kudu", kudu_version)
  client_lib_paths = []
  for path, _, files in os.walk(kudu_dir):
    for file in files:
      if not file.startswith("libkudu_client.so"):
        continue
      file_path = os.path.join(path, file)
      # Skip symlinks (e.g. libkudu_client.so -> libkudu_client.so.N); only the
      # real files are replaced at the end.
      if os.path.islink(file_path):
        continue
      client_lib_paths.append(file_path)
  if not client_lib_paths:
    raise Exception("Unable to find Kudu client lib under '%s'" % kudu_dir)

  # The client stub will be created by inspecting a real client and extracting the
  # symbols. The choice of which client file to use shouldn't matter.
  client_lib_path = client_lib_paths[0]

  # Use a newer version of binutils because on older systems the default binutils may
  # not be able to read the newer binary.
  binutils_dir = package_directory(
      toolchain_root, "binutils", os.environ["IMPALA_BINUTILS_VERSION"])
  nm_path = os.path.join(binutils_dir, "bin", "nm")
  objdump_path = os.path.join(binutils_dir, "bin", "objdump")

  # Extract the symbols and write the stubbed client source. There is a special method
  # kudu::client::GetShortVersionString() that is overridden so that the stub can be
  # identified by the caller.
  get_short_version_sig = "kudu::client::GetShortVersionString()"
  nm_out = check_output([nm_path, "--defined-only", "-D", client_lib_path])
  stub_build_dir = tempfile.mkdtemp()
  stub_client_src_file = open(os.path.join(stub_build_dir, "kudu_client.cc"), "w")
  try:
    # The null write in KuduNotSupported() deliberately crashes the process if
    # any stubbed Kudu function is ever reached at runtime.
    stub_client_src_file.write("""
#include <string>

static const std::string kFakeKuduVersion = "__IMPALA_KUDU_STUB__";

static void KuduNotSupported() {
  *((char*)0) = 0;
}

namespace kudu { namespace client {
std::string GetShortVersionString() { return kFakeKuduVersion; }
}}
""")
    found_start_version_symbol = False
    cpp_filt_path = os.path.join(binutils_dir, "bin", "c++filt")
    for line in nm_out.splitlines():
      # nm output format: "<address> <symbol type> <mangled name>".
      addr, sym_type, mangled_name = line.split(" ")
      # Skip special functions and anything that isn't a strong symbol. Any symbols that
      # get past this check must be related to Kudu. If a symbol unrelated to Kudu
      # (ex: a boost symbol) gets defined in the stub, there's a chance the symbol could
      # get used and crash Impala.
      if mangled_name in ["_init", "_fini"] or sym_type not in "Tt":
        continue
      demangled_name = check_output([cpp_filt_path, mangled_name]).strip()
      assert "kudu" in demangled_name, \
          "Symbol doesn't appear to be related to Kudu: " + demangled_name
      if demangled_name == get_short_version_sig:
        found_start_version_symbol = True
        continue
      # Each remaining symbol becomes a C-linkage function that crashes.
      stub_client_src_file.write("""
extern "C" void %s() {
  KuduNotSupported();
}
""" % mangled_name)

    if not found_start_version_symbol:
      raise Exception("Expected to find symbol a corresponding to"
          " %s but it was not found." % get_short_version_sig)
    stub_client_src_file.flush()

    # The soname is needed to avoid problem in packaging builds. Without the soname,
    # the library dependency as listed in the impalad binary will be a full path instead
    # of a short name. Debian in particular has problems with packaging when that happens.
    objdump_out = check_output([objdump_path, "-p", client_lib_path])
    for line in objdump_out.splitlines():
      if "SONAME" not in line:
        continue
      # The line that needs to be parsed should be something like:
      # "  SONAME    libkudu_client.so.0"
      so_name = line.split()[1]
      break
    else:
      # for/else: reached only if no SONAME line was found above.
      raise Exception("Unable to extract soname from %s" % client_lib_path)

    # Compile the library.
    stub_client_lib_path = os.path.join(stub_build_dir, "libkudu_client.so")
    subprocess.check_call(["g++", stub_client_src_file.name, "-shared", "-fPIC",
        "-Wl,-soname,%s" % so_name, "-o", stub_client_lib_path])

    # Replace the real libs with the stub.
    for client_lib_path in client_lib_paths:
      shutil.copyfile(stub_client_lib_path, client_lib_path)
  finally:
    # Always clean up the temporary build directory, even on failure.
    shutil.rmtree(stub_build_dir)
| |
def execute_many(f, args):
  """
  Executes f(a) for a in args and returns the results as a list. If possible,
  uses a thread pool (capped at 4 threads) to execute in parallel; on older
  Pythons without multiprocessing (CentOS 5), degrades to single-threaded
  execution.
  """
  try:
    import multiprocessing.pool
  except ImportError:
    # multiprocessing was introduced in Python 2.6.
    # For older Pythons (CentOS 5), degrade to single-threaded execution.
    # Note: only the import is guarded, so an ImportError raised from f()
    # propagates instead of silently re-running everything serially.
    return [f(a) for a in args]
  pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
  try:
    return pool.map(f, args, 1)
  finally:
    # Release the worker threads instead of leaking them.
    pool.close()
| |
def download_cdh_components(toolchain_root, cdh_components):
  """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found."""
  cdh_home = os.environ.get("CDH_COMPONENTS_HOME")
  if not cdh_home:
    logging.error("Impala environment not set up correctly, make sure "
        "$CDH_COMPONENTS_HOME is present.")
    sys.exit(1)

  # Create the directory where CDH components live if necessary.
  if not os.path.exists(cdh_home):
    os.makedirs(cdh_home)

  # The URL prefix of where CDH components live in S3.
  url_prefix = HOST + "/cdh_components/"

  def fetch_component(component):
    # Already unpacked from an earlier run; nothing to do.
    if os.path.isdir(package_directory(cdh_home, component.name, component.version)):
      return

    # Download the package if it doesn't exist
    file_name = "{0}-{1}.tar.gz".format(component.name, component.version)
    if component.url is not None:
      download_path = component.url
    else:
      download_path = url_prefix + file_name
    wget_and_unpack_package(download_path, file_name, cdh_home, False)

  execute_many(fetch_component, cdh_components)
| |
if __name__ == "__main__":
  """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment.
  By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present
  as well. Will create the directory specified by $IMPALA_TOOLCHAIN if it doesn't exist
  yet. Each of the packages specified in `packages` is downloaded and extracted into
  $IMPALA_TOOLCHAIN. If $DOWNLOAD_CDH_COMPONENTS is true, this script will also download
  the CDH components (i.e. hadoop, hbase, hive, llama-minikdc and sentry) into the
  directory specified by $CDH_COMPONENTS_HOME.
  """
  logging.basicConfig(level=logging.INFO,
      format='%(asctime)s %(threadName)s %(levelname)s: %(message)s')
  # 'sh' module logs at every execution, which is too noisy
  logging.getLogger("sh").setLevel(logging.WARNING)

  if not os.environ.get("IMPALA_HOME"):
    logging.error("Impala environment not set up correctly, make sure "
        "impala-config.sh is sourced.")
    sys.exit(1)

  # Create the destination directory if necessary
  toolchain_root = os.environ.get("IMPALA_TOOLCHAIN")
  if not toolchain_root:
    logging.error("Impala environment not set up correctly, make sure "
        "$IMPALA_TOOLCHAIN is present.")
    sys.exit(1)

  if not os.path.exists(toolchain_root):
    os.makedirs(toolchain_root)

  # LLVM and Kudu are the largest packages. Sort them first so that
  # their download starts as soon as possible.
  # Use a list comprehension instead of map() so this also works on Python 3,
  # where map() returns an iterator that has no insert() method.
  packages = [Package(p) for p in ["llvm", "kudu",
      "avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil",
      "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "libev",
      "lz4", "openldap", "openssl", "orc", "protobuf",
      "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"]]
  # A second LLVM build with assertions enabled is needed for debug builds.
  packages.insert(0, Package("llvm", "5.0.1-asserts"))
  bootstrap(toolchain_root, packages)

  # Download the CDH components if necessary.
  if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
    cdh_components = [Package(p)
        for p in ["hadoop", "hbase", "hive", "llama-minikdc", "sentry"]]
    download_cdh_components(toolchain_root, cdh_components)