| #!/usr/bin/env impala-python |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # This script can be used to dump symbols using the 'dump_syms' binary, which is contained |
| # in Google Breakpad. It supports collecting binary files from different sources: |
| # |
| # - Scan an Impala build dir for ELF files |
| # - Read files from stdin |
| # - Process a list of one or multiple explicitly specified files |
| # - Extract an Impala rpm/deb and corresponding debuginfo rpm/deb file, scan for ELF |
| # files, and process them together with their respective .debug file. |
| # |
| # Dependencies: |
| # - dpkg (sudo apt-get -y install dpkg) |
| # - rpm2cpio (sudo apt-get -y install rpm2cpio) |
| # - cpio (sudo apt-get -y install cpio) |
| # - Google Breakpad, either installed via the Impala toolchain or separately |
| # |
| # Usage: dump_breakpad_symbols.py -h |
| # |
| # Typical usage patterns: |
| # ----------------------- |
| # |
| # * Extract symbols from an rpm file and its debuginfo counterpart: |
| # ./dump_breakpad_symbols.py -d /tmp/syms \ |
| # -r tmp/impala-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm \ |
| # -s tmp/impala-debuginfo-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm |
| # |
| # Note that this will process all ELF binaries in the rpm, including both debug and |
| # release builds. Files are identified by hashes, so you don't need to worry about |
| # collisions and you can expect it to 'just work'. |
| # |
| # * Scan an impalad build directory and extract Breakpad symbols from all binaries: |
| # ./dump_breakpad_symbols.py -d /tmp/syms -b be/build/debug |
| # |
| # * Use the 'minidump_stackwalk' tool after symbol extraction to process a minidump file: |
| # $IMPALA_TOOLCHAIN_PACKAGES_HOME/breakpad-*/bin/minidump_stackwalk \ |
| # /tmp/impala-minidumps/impalad/03c0ee26-bfd1-cf3e-43fa49ca-1a6aae25.dmp /tmp/syms |
| |
| from __future__ import absolute_import, division, print_function |
| import errno |
| import logging |
| import magic |
| import multiprocessing |
| import os |
| import shutil |
| import subprocess |
| import sys |
| import tempfile |
| |
| from argparse import ArgumentParser |
| from collections import namedtuple |
| from multiprocessing.pool import ThreadPool |
| |
# Pairs the path of a binary with the location handed to dump_syms for its separate
# debug info; 'debug_path' is None when there is no separate symbol package.
BinarySymbolInfo = namedtuple('BinarySymbolInfo', ['path', 'debug_path'])
| |
| |
def die(msg=''):
    """Log 'msg' as an error and terminate the process with exit status 1."""
    message = 'ERROR: %s\n' % msg
    logging.error(message)
    raise SystemExit(1)
| |
| |
def find_dump_syms_binary():
    """Return the path of Breakpad's 'dump_syms' inside the Impala toolchain.

    The toolchain location is read from IMPALA_TOOLCHAIN_PACKAGES_HOME and the Breakpad
    version from IMPALA_BREAKPAD_VERSION. An empty string is returned when the toolchain
    variable is not set; any other lookup problem ends the process via die().
    TODO: Lookup the binary in the system path. Not urgent, since the user can specify the
    path as a command line switch.
    """
    packages_home = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not packages_home:
        return ''
    if not os.path.isdir(packages_home):
        die('Could not find toolchain packages directory')
    version = os.environ.get('IMPALA_BREAKPAD_VERSION')
    if not version:
        die('Could not determine breakpad version from toolchain')
    candidate = os.path.join(packages_home, 'breakpad-%s' % version, 'bin', 'dump_syms')
    if not os.path.isfile(candidate):
        die('Could not find dump_syms executable at %s' % candidate)
    return candidate
| |
| |
def find_objcopy_binary():
    """Return the path of Binutils' 'objcopy' inside the Impala toolchain.

    The toolchain location is read from IMPALA_TOOLCHAIN_PACKAGES_HOME and the Binutils
    version from IMPALA_BINUTILS_VERSION. An empty string is returned when the toolchain
    variable is not set; any other lookup problem ends the process via die().
    TODO: Fall back to finding objcopy in the system path.
    """
    packages_home = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not packages_home:
        return ''
    if not os.path.isdir(packages_home):
        die('Could not find toolchain packages directory')
    version = os.environ.get('IMPALA_BINUTILS_VERSION')
    if not version:
        die('Could not determine binutils version from toolchain')
    candidate = os.path.join(packages_home, 'binutils-%s' % version, 'bin', 'objcopy')
    if not os.path.isfile(candidate):
        die('Could not find objcopy executable at %s' % candidate)
    return candidate
| |
| |
def parse_args():
    """Build the command line parser, parse sys.argv and validate the result.

    Ends the process via die() when -r/-s are not used together (unless --no_symbol_pkg
    is given) or when the number of selected input methods is not exactly one.
    Returns the parsed argparse namespace otherwise.
    """
    arg_parser = ArgumentParser()
    add = arg_parser.add_argument
    add('-d', '--dest_dir', required=True, help="""The target directory,
        below which to place extracted symbol files""")
    add('--dump_syms', help='Path to the dump_syms binary from Breakpad')
    # Options controlling how to find input files.
    add('-b', '--build_dir', help="""Path to a directory containing results
        from an Impala build, e.g. be/build/debug""")
    add('-f', '--binary_files', nargs='+', metavar="FILE",
        help='List of binary files to process')
    add('-i', '--stdin_files', action='store_true', help="""Read the list
        of files to process from stdin""")
    add('-r', '--pkg', '--rpm', help="""RPM/DEB file containing the binaries
        to process, use with -s""")
    add('-s', '--symbol_pkg', '--debuginfo_rpm', help="""RPM/DEB file
        containing the debug symbols matching the binaries in -r""")
    add('--no_symbol_pkg', '--no_debuginfo_rpm', action='store_true',
        help="""Don't require a symbol pkg when processing a RPM/DEB package with -r""")
    add('--objcopy', help='Path to the objcopy binary from Binutils')
    add('--num_processes', type=int, default=multiprocessing.cpu_count(),
        help="Number of parallel processes to use.")
    parsed = arg_parser.parse_args()

    # The binary package and the symbol package must be given together, unless the
    # user explicitly opted out of the symbol package.
    pkg_given = bool(parsed.pkg)
    symbols_given = bool(parsed.symbol_pkg)
    if pkg_given != symbols_given and not parsed.no_symbol_pkg:
        arg_parser.print_usage()
        die("The -r option requires a corresponding -s unless --no_symbol_pkg is specified")

    # Exactly one of the input sources may be selected.
    input_sources = (parsed.build_dir, parsed.binary_files, parsed.stdin_files, parsed.pkg)
    selected = [source for source in input_sources if source]
    if len(selected) != 1:
        die('You need to specify exactly one way to locate input files (-b/-f/-i/-r,-s)')

    return parsed
| |
| |
def ensure_dir_exists(path):
    """Create directory 'path' (including parents) unless it already exists.

    Creation is attempted unconditionally and an "already exists" error is tolerated
    only if 'path' is a directory, so concurrent callers cannot race each other.
    Any other OSError is propagated.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
| |
| |
def walk_path(path):
    """Yield the full path of every file found by os.walk() below 'path'."""
    for parent, _subdirs, files in os.walk(path):
        joined = (os.path.join(parent, filename) for filename in files)
        for file_path in joined:
            yield file_path
| |
| |
def is_regular_file(path):
    """Return True if 'path' is a file and is not a symlink, False otherwise."""
    if os.path.islink(path):
        return False
    return os.path.isfile(path)
| |
| |
def is_elf_file(path):
    """Return True if 'path' is a regular file whose libmagic description mentions ELF."""
    if not is_regular_file(path):
        return False
    file_description = magic.from_file(path)
    return 'ELF' in file_description
| |
| |
def find_elf_files(path):
    """Lazily yield every file below 'path' that is_elf_file() accepts."""
    for candidate in walk_path(path):
        if is_elf_file(candidate):
            yield candidate
| |
| |
def extract_rpm(rpm, out_dir):
    """Extract 'rpm' into 'out_dir' by piping 'rpm2cpio' into 'cpio -id'.

    Both tools are started with argument lists instead of a shell command line, so
    package paths containing spaces or shell metacharacters are passed through verbatim.
    Both processes run with 'out_dir' as their working directory, so a relative 'rpm'
    path is resolved against 'out_dir'.

    Raises subprocess.CalledProcessError if cpio exits with a non-zero status or if
    rpm2cpio itself reports an error.
    """
    assert os.path.isdir(out_dir)
    rpm2cpio_cmd = ['rpm2cpio', rpm]
    rpm2cpio = subprocess.Popen(rpm2cpio_cmd, stdout=subprocess.PIPE, cwd=out_dir)
    try:
        subprocess.check_call(['cpio', '-id'], stdin=rpm2cpio.stdout, cwd=out_dir)
    finally:
        # Closing our copy of the read end lets rpm2cpio terminate (via SIGPIPE) if
        # cpio stopped reading early, so that wait() cannot block forever.
        rpm2cpio.stdout.close()
        rpm2cpio_retcode = rpm2cpio.wait()
    # A negative code means rpm2cpio was ended by a signal, which only happens here
    # when cpio finished before consuming all output; only real error exits count.
    if rpm2cpio_retcode > 0:
        raise subprocess.CalledProcessError(rpm2cpio_retcode, rpm2cpio_cmd)
| |
| |
def extract_deb(deb, out_dir):
    """Extract 'deb' into 'out_dir' using 'dpkg -x'.

    dpkg is invoked with an argument list instead of a shell command line, so paths
    containing spaces or shell metacharacters are passed through verbatim.

    Raises subprocess.CalledProcessError if dpkg exits with a non-zero status.
    """
    assert os.path.isdir(out_dir)
    subprocess.check_call(['dpkg', '-x', deb, out_dir])
| |
| |
def extract_pkg(pkg, out_dir):
    """Extract 'pkg' into 'out_dir', choosing the extractor from its libmagic description.

    Descriptions mentioning 'RPM' are handled by extract_rpm(), those mentioning
    'Debian' by extract_deb(); anything else ends the process via die().
    """
    detected_type = magic.from_file(pkg)
    # Checked in order, so an RPM match takes precedence over a Debian match.
    extractors = (('RPM', extract_rpm), ('Debian', extract_deb))
    for marker, extractor in extractors:
        if marker in detected_type:
            return extractor(pkg, out_dir)
    die('Unsupported package type: %s' % detected_type)
| |
| |
def assert_file_exists(path):
    """End the process via die() unless 'path' refers to an existing file."""
    if os.path.isfile(path):
        return
    die('File does not exists: %s' % path)
| |
| |
def enumerate_pkg_files(pkg, symbol_pkg):
    """Yield a BinarySymbolInfo tuple for every ELF file found in 'pkg'.

    'pkg' and, if given, 'symbol_pkg' are extracted into one temporary directory. The
    usr/lib/impala tree of the binary package is then scanned for ELF files. Without a
    symbol package each binary is yielded with a debug path of None. With a symbol
    package, the debug path is the directory below usr/lib/debug/usr/lib/impala that
    mirrors the directory of the binary.
    This is a generator on purpose: the temporary directory is only removed once the
    consumer has finished (or abandoned) the iteration.
    """
    binary_subdir = os.path.join('usr', 'lib', 'impala')
    symbol_subdir = os.path.join('usr', 'lib', 'debug', binary_subdir)
    assert_file_exists(pkg)
    if symbol_pkg:
        assert_file_exists(symbol_pkg)
    scratch_dir = tempfile.mkdtemp()
    try:
        # Unpack the binary package.
        logging.info('Extracting to %s: %s' % (scratch_dir, pkg))
        extract_pkg(os.path.abspath(pkg), scratch_dir)
        binary_root = os.path.join(scratch_dir, binary_subdir)

        # Unpack the symbol package next to it, if there is one.
        symbol_root = None
        if symbol_pkg:
            logging.info('Extracting to %s: %s' % (scratch_dir, symbol_pkg))
            extract_pkg(os.path.abspath(symbol_pkg), scratch_dir)
            symbol_root = os.path.join(scratch_dir, symbol_subdir)

        for elf_path in find_elf_files(binary_root):
            if symbol_root is None:
                yield BinarySymbolInfo(elf_path, None)
                continue
            # The symbol package mirrors the directory layout of the binary package.
            mirrored_dir = os.path.relpath(os.path.dirname(elf_path), binary_root)
            yield BinarySymbolInfo(elf_path, os.path.join(symbol_root, mirrored_dir))
    finally:
        shutil.rmtree(scratch_dir)
| |
| |
def enumerate_binaries(args):
    """Return an iterable of BinarySymbolInfo tuples for the selected input method.

    The input methods are checked in the order: explicit file list, stdin, package,
    build directory. Only the package method provides debug paths; all others yield
    tuples with a debug path of None. The process is ended via die() if no input
    method is set on 'args'.
    """
    def without_debug_info(paths):
        return (BinarySymbolInfo(binary_path, None) for binary_path in paths)

    if args.binary_files:
        return without_debug_info(args.binary_files)
    if args.stdin_files:
        # stdin is consumed right away, one path per line.
        return without_debug_info(sys.stdin.read().splitlines())
    if args.pkg:
        return enumerate_pkg_files(args.pkg, args.symbol_pkg)
    if args.build_dir:
        return without_debug_info(find_elf_files(args.build_dir))
    die('No input method provided')
| |
| |
def process_binary(dump_syms, objcopy, binary, out_dir):
    """Dump the Breakpad symbols of a single binary into 'out_dir'.

    The symbols are first written to a temporary file inside 'out_dir'. The MODULE
    header of that file is then parsed and the file is moved to
    <out_dir>/<module name>/<module id>/<module name>.sym, creating directories as
    needed.

    Args:
      dump_syms: path to the Breakpad 'dump_syms' executable.
      objcopy: path to the Binutils 'objcopy' executable.
      binary: BinarySymbolInfo; 'path' is the binary to process, 'debug_path' is
        appended to the dump_syms command line when set.
      out_dir: directory below which the symbol file is placed.

    Returns:
      True on success, False if dump_syms exited with a non-zero status.

    Raises:
      Any exception raised while processing is propagated after the temporary symbol
      file has been removed.
    """
    logging.info('Processing binary file: %s' % binary.path)
    ensure_dir_exists(out_dir)
    tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym')
    # Stays None until the directory exists, so the cleanup below is safe even if
    # creating it fails.
    tempdir = None
    try:
        # Wrap the descriptor immediately so that it is closed deterministically on
        # every path, including failures before dump_syms is started.
        with os.fdopen(tmp_fd, 'wb') as sym_out:
            # Create a temporary directory used for decompressing debug info
            tempdir = tempfile.mkdtemp()

            # Binaries can contain compressed debug symbols. Breakpad currently
            # does not support dumping symbols for binaries with compressed debug
            # symbols.
            #
            # As a workaround, this uses objcopy to create a copy of the binary with
            # the debug symbols decompressed. If the debug symbols are not compressed
            # in the original binary, objcopy simply makes a copy of the binary.
            # Breakpad is able to read symbols from the decompressed binary, and
            # those symbols work correctly in resolving a minidump from the original
            # compressed binary.
            # TODO: In theory, this could work with the binary.debug_path.
            decompressed_binary = os.path.join(tempdir, os.path.basename(binary.path))
            objcopy_retcode = subprocess.call(
                [objcopy, '--decompress-debug-sections', binary.path, decompressed_binary])

            # If objcopy failed for some reason, fall back to running dump_syms
            # directly on the original binary. This is unlikely to work, but it is a
            # way of guaranteeing that objcopy is not the problem.
            if objcopy_retcode == 0:
                args = [dump_syms, decompressed_binary]
            else:
                sys.stderr.write('objcopy failed. Trying to run dump_sym directly.\n')
                args = [dump_syms, binary.path]
            if binary.debug_path:
                args.append(binary.debug_path)

            proc = subprocess.Popen(args, stdout=sym_out, stderr=subprocess.PIPE)
            _, stderr = proc.communicate()

        if proc.returncode != 0:
            sys.stderr.write('dump_syms: Failed to dump symbols from %s, return code %s\n' %
                             (binary.path, proc.returncode))
            # communicate() returns bytes on Python 3, which sys.stderr cannot write.
            if not isinstance(stderr, str):
                stderr = stderr.decode('utf-8', 'replace')
            sys.stderr.write(stderr)
            os.remove(tmp_file)
            return False

        # Parse the temporary file to determine the full target path.
        with open(tmp_file, 'r') as f:
            header = f.readline().strip()
        # Format of header is: MODULE os arch binary_id binary
        # The module name is the last field and may itself contain spaces, so split
        # off only the first four fields.
        _, _, _, binary_id, binary_name = header.split(' ', 4)
        out_path = os.path.join(out_dir, binary_name, binary_id)
        ensure_dir_exists(out_path)
        # Move the temporary file to its final destination.
        shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % binary_name))
    except Exception:
        # Only need to clean up in case of errors.
        try:
            os.remove(tmp_file)
        except EnvironmentError:
            pass
        # Bare raise keeps the original traceback.
        raise
    finally:
        # Cleanup temporary directory
        if tempdir is not None:
            shutil.rmtree(tempdir)
    return True
| |
| |
def main():
    """Entry point: parse arguments, dump symbols for all binaries, then exit.

    The process exits with status 0 if every binary was processed successfully and
    with status 1 as soon as one binary fails. It also exits with status 1 (via die())
    if the dump_syms or objcopy executable cannot be determined.
    """
    logging.basicConfig(level=logging.INFO)
    args = parse_args()
    # Missing tools are a configuration problem of the user, so report them with a
    # proper error message instead of an assertion (which is skipped under -O).
    dump_syms = args.dump_syms or find_dump_syms_binary()
    if not dump_syms:
        die('Could not locate dump_syms: pass --dump_syms or set '
            'IMPALA_TOOLCHAIN_PACKAGES_HOME')
    objcopy = args.objcopy or find_objcopy_binary()
    if not objcopy:
        die('Could not locate objcopy: pass --objcopy or set '
            'IMPALA_TOOLCHAIN_PACKAGES_HOME')
    status = 0
    ensure_dir_exists(args.dest_dir)
    # The logic for handling DEB/RPM packages does not currently work with
    # parallelism, so disable parallelism if using the -r/--pkg option.
    if args.num_processes > 1 and not args.pkg:
        # Use a thread pool to go parallel
        thread_pool = ThreadPool(processes=args.num_processes)

        def processing_fn(binary):
            return process_binary(dump_syms, objcopy, binary, args.dest_dir)

        all_results_seen = False
        try:
            for result in thread_pool.imap_unordered(processing_fn,
                                                     enumerate_binaries(args)):
                if not result:
                    status = 1
                    break
            else:
                all_results_seen = True
        finally:
            # Shut down gracefully after a complete run; after a failure or an
            # exception, drop the work that is still queued.
            if all_results_seen:
                thread_pool.close()
            else:
                thread_pool.terminate()
            thread_pool.join()
    else:
        # For serial cases, simply avoid the ThreadPool altogether, as that makes it
        # easy to reason about.
        for binary in enumerate_binaries(args):
            if not process_binary(dump_syms, objcopy, binary, args.dest_dir):
                status = 1
                break
    sys.exit(status)
| |
| |
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()