blob: 3c64d9d55944a02385db091201c1418cc6ff3c2f [file] [log] [blame]
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This script automates symbol resolution for Breakpad minidumps
# under ideal circumstances. Specifically, it expects all the
# binaries to be in the same locations as when the minidump
# was taken. This is often true for minidumps on a developer
# workstation or at the end of an Impala test job. It finds Breakpad
# using environment variables from the Impala dev environment,
# so it must run inside the Impala dev environment.
# TODO: It may be possible to extend this to Docker images.
#
# Within this simple context, this script aims for complete
# symbol resolution. It uses Breakpad's minidump_dump utility
# to dump the minidump, then it parses the list of libraries
# that were used by the binary. It gets the symbols for all
# those libraries and resolves the minidump.
#
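# Usage: resolve_minidumps.py --minidump_file [file] --output_file [file]
# (optional -v or --verbose for more output; optional --safe_library_list [list]
# to set the comma-separated library name prefixes used by the fallback resolution)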
import errno
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
import traceback
from argparse import ArgumentParser
class ModuleInfo:
    """Identifying fields for one module listed in a minidump.

    Attributes:
      code_file: path of the module's binary (used later as the file to dump symbols from)
      code_id: the module's code identifier string
      debug_file: the module's debug file string
      debug_id: the module's debug identifier string
    """

    def __init__(self, code_file, code_id, debug_file, debug_id):
        self.code_file = code_file
        self.code_id = code_id
        self.debug_file = debug_file
        self.debug_id = debug_id

    def __repr__(self):
        # A readable representation makes log messages and debugging sessions useful.
        return ("ModuleInfo(code_file={0!r}, code_id={1!r}, debug_file={2!r}, "
                "debug_id={3!r})".format(self.code_file, self.code_id, self.debug_file,
                                         self.debug_id))
def read_module_info(minidump_dump_contents):
    """Read the module information out of the minidump_dump raw contents.

    'minidump_dump_contents' is expected to be the minidump_dump output for the
    minidump, split into a list of lines.

    Returns a list of ModuleInfo objects (possibly empty), or None if the
    contents could not be parsed (no usable module_count, a mismatch between
    module_count and the number of MDRawModule sections, or a module section
    that is missing one of the expected fields).
    """
    # Find the module_count. It stays None if the dump never mentions it, which
    # is reported as a parse failure rather than blowing up on an unbound name.
    module_count = None
    for line in minidump_dump_contents:
        if line.strip().startswith("module_count"):
            try:
                module_count = int(line.split("=", 1)[1].strip())
            except (IndexError, ValueError):
                logging.error("Failed to parse module count from line: "
                              "{0}".format(line.strip()))
                return None
            break
    if module_count is None:
        logging.error("Failed to parse modules, could not find module_count")
        return None

    # The minidump has a MDRawModule per module and it will have
    # the same number of MDRawModule dumps as module_count.
    module_boundaries = [idx for idx, line in enumerate(minidump_dump_contents)
                         if line.startswith("MDRawModule")]

    if len(module_boundaries) != module_count:
        logging.error("Failed to parse modules, mismatch in module count "
                      "({0} != {1})".format(len(module_boundaries), module_count))
        return None

    # Add one more entry to module_boundaries that is the end of the file.
    # That makes this more of a list of boundaries than the list of
    # start locations.
    module_boundaries.append(len(minidump_dump_contents))

    # Checked in this order against the key (the text left of the first '=').
    field_names = ("code_file", "code_identifier", "debug_file", "debug_identifier")

    modules = []
    for module_idx in range(module_count):
        module_start = module_boundaries[module_idx]
        module_end = module_boundaries[module_idx + 1]

        fields = {}
        for line in minidump_dump_contents[module_start:module_end]:
            # Split on the first '=' only, so values that themselves contain '='
            # (e.g. unusual file paths) are kept intact. Only the key is matched
            # against the field names, so a value that happens to contain
            # "code_file" cannot be mistaken for the field.
            key, separator, value = line.partition("=")
            if not separator:
                continue
            for field_name in field_names:
                if field_name in key:
                    fields[field_name] = value.strip().strip('"')
                    break

        # Important: it is ok for the fields to be the zero-length string.
        # We just care that they were encountered and parsed at all.
        if any(field_name not in fields for field_name in field_names):
            logging.error("Failed to parse dump output, missing fields for MDRawModule "
                          "{0}".format(module_idx))
            return None

        code_file = fields["code_file"]
        code_identifier = fields["code_identifier"]
        debug_file = fields["debug_file"]
        debug_identifier = fields["debug_identifier"]

        # Jars and other files show up in this list, but they have
        # code identifiers or debug identifiers as all zeros. Skip those,
        # as there are no symbols to find.
        if re.fullmatch("[0]+", code_identifier) or re.fullmatch("[0]+", debug_identifier):
            continue

        # Skip cases where the code identifier or debug identifier are empty
        if len(code_identifier) == 0 or len(debug_identifier) == 0:
            continue

        # linux-gate.so is a special case, and it is not an actual file on disk.
        if code_file.startswith("linux-gate.so"):
            continue

        modules.append(ModuleInfo(code_file, code_identifier, debug_file, debug_identifier))

    return modules
def filter_shared_library_modules(module_list, lib_allow_list):
    """Return the modules from 'module_list' that pass the shared library allow list.

    A module whose file basename does not contain ".so" (such as the main binary)
    is always kept. A module whose basename contains ".so" is kept only when the
    basename starts with one of the prefixes in 'lib_allow_list'. The relative
    order of the kept modules is unchanged.
    """
    def passes_filter(module):
        basename = os.path.basename(module.code_file)
        if ".so" not in basename:
            # Not a shared library (e.g. the main binary): always keep it.
            return True
        # Shared library: keep it only if it matches an allow list prefix.
        return any(basename.startswith(prefix) for prefix in lib_allow_list)

    return [module for module in module_list if passes_filter(module)]
def find_breakpad_home():
    """Return the Breakpad directory inside the Impala toolchain, or None.

    The location is built from the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BREAKPAD_VERSION environment variables. An error is logged and None
    is returned when either variable is unset/empty or a directory is missing.
    """
    packages_root = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    version = os.environ.get('IMPALA_BREAKPAD_VERSION')

    candidate = None
    if not packages_root:
        problem = "IMPALA_TOOLCHAIN_PACKAGES_HOME is not set"
    elif not os.path.isdir(packages_root):
        problem = "Could not find toolchain packages directory"
    elif not version:
        problem = "Could not determine breakpad version from toolchain"
    else:
        candidate = '{0}/breakpad-{1}'.format(packages_root, version)
        problem = None if os.path.isdir(candidate) else "Could not find breakpad directory"

    if problem is not None:
        logging.error(problem)
        return None
    return candidate
def find_breakpad_binary(binary_name):
    """Return the path of the Breakpad executable 'binary_name', or None.

    The executable is looked up in the 'bin' directory under the Breakpad home.
    None is returned if the Breakpad home cannot be located or if the file does
    not exist (the latter is logged as an error here).
    """
    home = find_breakpad_home()
    if home:
        candidate = os.path.join(home, 'bin', binary_name)
        if os.path.isfile(candidate):
            return candidate
        logging.error("Could not find {0} executable at {1}".format(binary_name, candidate))
    return None
def find_objcopy_binary():
    """Return the path of the Binutils 'objcopy' in the Impala toolchain, or None.

    The location is built from the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BINUTILS_VERSION environment variables. An error is logged and None
    is returned when either variable is unset/empty, the toolchain directory is
    missing, or the objcopy file does not exist.
    TODO: Fall back to finding objcopy in the system path.
    """
    packages_root = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    version = os.environ.get('IMPALA_BINUTILS_VERSION')

    candidate = None
    if not packages_root:
        problem = "IMPALA_TOOLCHAIN_PACKAGES_HOME is not set"
    elif not os.path.isdir(packages_root):
        problem = "Could not find toolchain packages directory"
    elif not version:
        problem = "Could not determine binutils version from toolchain"
    else:
        candidate = os.path.join(packages_root, "binutils-{0}".format(version), 'bin',
                                 'objcopy')
        if os.path.isfile(candidate):
            problem = None
        else:
            problem = "Could not find objcopy executable at {0}".format(candidate)

    if problem is not None:
        logging.error(problem)
        return None
    return candidate
def ensure_dir_exists(path):
    """Make sure the directory 'path' exists, creating parent directories as needed.

    It is not an error for the directory to already exist, so concurrent callers
    creating the same directory do not fail each other. An OSError is raised if
    the directory cannot be created, including when 'path' exists but is not a
    directory.
    """
    # exist_ok=True is the standard library form of "ignore EEXIST if it is a directory".
    os.makedirs(path, exist_ok=True)
def dump_symbols_for_binary(dump_syms, objcopy, binary, out_dir):
    """Dump symbols of a single binary file and move the result into 'out_dir'.

    Symbols are written to a temporary file in 'out_dir' and then moved to
    <out_dir>/<module name>/<module id>/<module name>.sym, where the name and id
    come from the MODULE header line that dump_syms emits. Required directories
    are created if necessary.

    Args:
      dump_syms: path of the Breakpad dump_syms executable
      objcopy: path of the objcopy executable
      binary: path of the binary to dump symbols for
      out_dir: root directory of the symbol tree

    Returns True on success and False if dump_syms exits with a non-zero code.
    Any other failure (e.g. an unparsable header) propagates as an exception
    after the temporary symbol file has been removed.
    """
    logging.info("Processing binary file: {0}".format(binary))
    ensure_dir_exists(out_dir)
    tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym')
    # Assigned before the try block so the cleanup in 'finally' never sees an
    # unbound name, even if creating the temporary directory fails.
    tempdir = None
    try:
        # Wrapping the descriptor in a 'with' block closes it deterministically,
        # including when objcopy or dump_syms cannot be launched.
        with os.fdopen(tmp_fd, 'wb') as symbol_output:
            # Create a temporary directory used for decompressing debug info
            tempdir = tempfile.mkdtemp()

            # Binaries can contain compressed debug symbols. Breakpad currently
            # does not support dumping symbols for binaries with compressed debug
            # symbols.
            #
            # As a workaround, this uses objcopy to create a copy of the binary with
            # the debug symbols decompressed. If the debug symbols are not compressed
            # in the original binary, objcopy simply makes a copy of the binary.
            # Breakpad is able to read symbols from the decompressed binary, and
            # those symbols work correctly in resolving a minidump from the original
            # compressed binary.
            # TODO: In theory, this could work with the binary.debug_path.
            decompressed_binary = os.path.join(tempdir, os.path.basename(binary))
            objcopy_retcode = subprocess.call([objcopy, "--decompress-debug-sections",
                                               binary, decompressed_binary])

            # If objcopy failed for some reason, fall back to running dump_syms
            # directly on the original binary. This is unlikely to work, but it is a
            # way of guaranteeing that objcopy is not the problem.
            args = [dump_syms, decompressed_binary]
            if objcopy_retcode != 0:
                sys.stderr.write('objcopy failed. Trying to run dump_sym directly.\n')
                args = [dump_syms, binary]

            # Run dump_syms on the binary.
            proc = subprocess.Popen(args, stdout=symbol_output, stderr=subprocess.PIPE)
            _, stderr = proc.communicate()

        if proc.returncode != 0:
            sys.stderr.write('Failed to dump symbols from %s, return code %s\n' %
                             (binary, proc.returncode))
            # The tool's stderr is only diagnostic output; never let undecodable
            # bytes turn a reported failure into a crash.
            sys.stderr.write(stderr.decode('utf-8', errors='replace'))
            os.remove(tmp_file)
            return False

        # Parse the temporary file to determine the full target path.
        with open(tmp_file, 'r') as f:
            header = f.readline().strip()
        # Format of header is: MODULE os arch binary_id binary
        # The module name is the last field and may contain spaces, so split at
        # most four times and keep the remainder as the name.
        _, _, _, binary_id, module_name = header.split(' ', 4)
        out_path = os.path.join(out_dir, module_name, binary_id)
        ensure_dir_exists(out_path)
        # Move the temporary file to its final destination.
        shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % module_name))
    except Exception:
        # Only need to clean up the symbol file in case of errors.
        try:
            os.remove(tmp_file)
        except EnvironmentError:
            pass
        raise
    finally:
        # Cleanup temporary directory. This is best effort: a cleanup failure
        # should not mask the real outcome of the symbol dump.
        if tempdir is not None:
            shutil.rmtree(tempdir, ignore_errors=True)
    return True
def dump_symbols_for_all_modules(dump_syms, objcopy, module_list, out_dir):
    """Dump symbols into 'out_dir' for every module (ModuleInfo) in 'module_list'.

    A module whose symbol dump reports failure is logged as a warning and
    skipped; processing continues with the remaining modules.
    """
    for module in module_list:
        binary_path = module.code_file
        if dump_symbols_for_binary(dump_syms, objcopy, binary_path, out_dir):
            continue
        logging.warning("Failed to dump symbols for {0}".format(binary_path))
def resolve_minidump(minidump_stackwalk, minidump_path, symbol_dir, verbose, out_file):
    """Run minidump_stackwalk on a minidump and write the resolved output to a file.

    Args:
      minidump_stackwalk: path of the minidump_stackwalk executable
      minidump_path: path of the minidump to resolve
      symbol_dir: directory containing the dumped symbols
      verbose: if true, the tool's stderr is passed through; otherwise it is discarded
      out_file: path of the file that receives the tool's stdout

    Raises subprocess.CalledProcessError if minidump_stackwalk exits with a
    non-zero code.
    """
    minidump_stackwalk_cmd = [minidump_stackwalk, minidump_path, symbol_dir]

    # There are circumstances where the minidump_stackwalk can go wrong and become
    # a runaway process capable of using all system memory. If the prlimit utility
    # is present, we use it to apply a limit on the memory consumption.
    #
    # See if we have the prlimit utility. When the executable does not exist,
    # subprocess.run() raises an OSError (FileNotFoundError) instead of returning
    # a non-zero code, so that case must be handled explicitly.
    try:
        check_prlimit = subprocess.run(["prlimit", "-V"], stdout=subprocess.DEVNULL,
                                       stderr=subprocess.DEVNULL)
        have_prlimit = check_prlimit.returncode == 0
    except OSError:
        have_prlimit = False

    if have_prlimit:
        # The prlimit utility is available, so wrap the minidump_stackwalk command
        # to apply a 4GB limit on virtual memory. In normal operations, 4G is plenty.
        prlimit_wrapper = ["prlimit", "--as={0}".format(4 * 1024 * 1024 * 1024)]
        minidump_stackwalk_cmd = prlimit_wrapper + minidump_stackwalk_cmd

    stderr_output = None if verbose else subprocess.DEVNULL
    with open(out_file, "w") as out_f:
        subprocess.run(minidump_stackwalk_cmd, stdout=out_f,
                       stderr=stderr_output, check=True)
def raw_dump_for_minidump(minidump_dump, minidump_path):
    """Run minidump_dump on the specified minidump and split the output into lines.

    Returns the tool's stdout split on newlines, or an empty list if the tool
    produced no output at all.
    """
    # minidump_dump sometimes returns an error code even though it produced usable output.
    # So, this doesn't check the error code, and it relies on read_module_info() doing
    # validation.
    #
    # Python 3.6 adjustments:
    # 'capture_output=True' not supported: set stdout/stderr to subprocess.PIPE instead
    # 'text=True' not supported: set 'universal_newlines=True' (the two are the same thing)
    output = subprocess.run([minidump_dump, minidump_path], stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, universal_newlines=True)
    if not output.stdout:
        # "".split('\n') would give [''], a non-empty (truthy) list, which hides
        # the "no output" case from callers that test the result for emptiness.
        return []
    return output.stdout.split('\n')
def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    --minidump_file and --output_file are required; -v/--verbose and
    --safe_library_list are optional.
    """
    # TODO:
    # - Add ability to specify Breakpad home
    # - Add ability to specify the symbol directory location (for reuse)
    # - Add ability to specify Binutils home
    cli = ArgumentParser()
    for required_option in ('--minidump_file', '--output_file'):
        cli.add_argument(required_option, required=True)
    cli.add_argument('-v', '--verbose', action='store_true')
    cli.add_argument('--safe_library_list',
                     default="libstdc++.so,libc.so,libjvm.so",
                     help="Comma-separate list of prefixes for allowed system libraries")
    return cli.parse_args()
def dump_syms_and_resolve_stack(modules, minidump_file, output_file, verbose):
    """Dump the symbols for the listed modules and use them to resolve the minidump.

    Args:
      modules: list of ModuleInfo objects to dump symbols for
      minidump_file: path of the minidump to resolve
      output_file: path of the file that receives the resolved output
      verbose: passed through to the minidump resolution step

    Exits the process with status 1 if any of the required tools (dump_syms,
    objcopy, minidump_stackwalk) cannot be found.
    """
    # Locate every required tool up front, so a missing tool is reported before
    # any time is spent dumping symbols.
    dump_syms_bin = find_breakpad_binary("dump_syms")
    if not dump_syms_bin:
        logging.error("Could not find Breakpad dump_syms binary")
        sys.exit(1)
    objcopy_bin = find_objcopy_binary()
    if not objcopy_bin:
        logging.error("Could not find Binutils objcopy binary")
        sys.exit(1)
    minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
    if not minidump_stackwalk_bin:
        logging.error("Could not find Breakpad minidump_stackwalk binary")
        sys.exit(1)

    # Create a temporary directory to store the symbols.
    # This automatically gets cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Dump symbols for all the modules into this temporary directory.
        dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)

        # Resolve the minidump with the temporary symbol directory, reusing the
        # binary path that was already located and validated above.
        resolve_minidump(minidump_stackwalk_bin, minidump_file, tmp_dir, verbose,
                         output_file)
def main():
    """Resolve the minidump named on the command line and write the result.

    Steps: dump the raw minidump contents, parse the list of modules from it,
    then dump symbols for those modules and resolve the minidump. If resolution
    with all modules fails, it is retried with shared libraries limited to the
    --safe_library_list prefixes. Exits with status 1 if the minidump contents
    or module list cannot be obtained.
    """
    args = parse_args()

    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING)

    # Step 1: Get the raw dump for the specified minidump
    minidump_dump_bin = find_breakpad_binary("minidump_dump")
    if not minidump_dump_bin:
        logging.error("Could not find Breakpad minidump_dump binary")
        sys.exit(1)

    contents = raw_dump_for_minidump(minidump_dump_bin, args.minidump_file)
    # Splitting empty output on newlines yields [''], which is a non-empty list,
    # so look for at least one non-blank line rather than testing the list itself.
    if not contents or not any(line.strip() for line in contents):
        logging.error(
            "minidump_dump could not get the contents of {0}".format(args.minidump_file))
        sys.exit(1)

    # Step 2: Parse the raw dump to get the list of code modules
    # This is the list of things that have symbols we need to dump.
    modules = read_module_info(contents)
    if not modules:
        logging.error("Failed to read modules for {0}".format(args.minidump_file))
        sys.exit(1)

    # Step 3: Dump the symbols and use them to resolve the minidump
    # Sometimes there are libraries with corrupt/problematic symbols
    # that can cause minidump_stackwalk to go haywire and use excessive
    # memory. First, we try using symbols from all of the shared libraries.
    # If that fails, we fallback to using a "safe" list of shared libraries.
    try:
        # Dump the symbols and use them to resolve the minidump
        dump_syms_and_resolve_stack(modules, args.minidump_file, args.output_file,
                                    args.verbose)
        return
    except Exception:
        logging.warning("Encountered error: {0}".format(traceback.format_exc()))
        logging.warning("Falling back to resolution using the safe library list")
        logging.warning("Safe library list: {0}".format(args.safe_library_list))

    # Limit the shared libraries to the "safe" list of shared libraries and
    # try again. Blank entries (e.g. from "a,,b" or a trailing comma) are
    # dropped: an empty prefix would match every shared library and silently
    # turn the safe list into "allow everything".
    safe_library_list = [prefix.strip() for prefix in args.safe_library_list.split(",")
                         if prefix.strip()]
    safe_modules = filter_shared_library_modules(modules, safe_library_list)
    dump_syms_and_resolve_stack(safe_modules, args.minidump_file, args.output_file,
                                args.verbose)


if __name__ == "__main__":
    main()