devops/tools/elf_ubuntu_dependency_analyzer.py - cloudberry - Git at Google

 #!/usr/bin/env python3
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 ELF Dependency Analyzer for Ubuntu

 This script analyzes ELF (Executable and Linkable Format) binaries to determine their runtime
 package dependencies on Ubuntu systems. It can process individual files or recursively analyze directories.

 The script provides information about:
 - Required packages and their versions
 - Custom or non-APT libraries
 - Core system libraries
 - Missing libraries
 - Other special cases

 Usage:
     python3 elf_ubuntu_dependency_analyzer.py [file_or_directory] [file_or_directory] ...

 Requirements:
 - Python 3.6+
 - prettytable (pip install prettytable)
 - ldd (usually pre-installed on Linux systems)
 - file (usually pre-installed on Linux systems)
 - dpkg (pre-installed on Ubuntu)
 """

 import os
 import subprocess
 import re
 import sys
 import argparse
 from collections import defaultdict
 from prettytable import PrettyTable

 def run_command(command):
     """
     Run an external command and return its combined stdout/stderr as text.

     Args:
         command (list): Program and arguments as a list of strings. The list
             is handed to ``subprocess`` directly; no shell is involved.

     Returns:
         str | None: The decoded output on success. ``None`` when the command
         exits with a non-zero status or cannot be started at all (for example
         the executable is not installed); a diagnostic is printed either way.
     """
     try:
         raw_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as e:
         # stderr is merged into stdout above, so e.output carries the tool's message.
         details = (e.output or b'').decode('utf-8', errors='replace').strip()
         print(f"Error running command {' '.join(command)}: {details}")
         return None
     except OSError as e:
         # The program could not be launched (missing binary, permission denied, ...).
         # Without this branch the documented "None on error" contract was broken.
         print(f"Error running command {' '.join(command)}: {e}")
         return None
     # Tool output may echo file names that are not valid UTF-8; never crash on them.
     return raw_output.decode('utf-8', errors='replace')

 def get_package_info(lib_path):
     """
     Work out which package (or fallback category) provides a library.

     Lookups are tried in this order: the Cloudberry install prefix,
     ``dpkg -S``, a built-in table of core system libraries, and finally a
     generic description obtained from ``file``.

     Args:
         lib_path (str): Path of the library to look up.

     Returns:
         tuple | None: ``(package_name, details)`` from the first lookup that
         succeeds, or ``None`` when every lookup fails.
     """
     # Anything under the Cloudberry prefix is reported as custom without asking dpkg.
     if lib_path.startswith('/usr/local/cloudberry-db'):
         return "cloudberry-custom", f"Cloudberry custom library: {lib_path}"

     owner_output = run_command(['dpkg', '-S', lib_path])
     if owner_output:
         # The text before the first colon is used as the package name.
         return owner_output.split(':')[0], owner_output.strip()

     # Fallback table (library name prefix -> package) for core libraries
     # that dpkg did not resolve for this path.
     known_core_packages = {
         'libc.so': 'libc6',
         'libm.so': 'libc6',
         'libdl.so': 'libc6',
         'libpthread.so': 'libc6',
         'libresolv.so': 'libc6',
         'librt.so': 'libc6',
         'libgcc_s.so': 'libgcc-s1',
         'libstdc++.so': 'libstdc++6',
         'libz.so': 'zlib1g',
         'libbz2.so': 'libbz2-1.0',
         'libpam.so': 'libpam0g',
         'libaudit.so': 'libaudit1',
         'libcap-ng.so': 'libcap-ng0',
         'libkeyutils.so': 'libkeyutils1',
         'liblzma.so': 'liblzma5',
         'libcom_err.so': 'libcomerr2'
     }

     base_name = os.path.basename(lib_path)
     matched_package = next(
         (pkg for prefix, pkg in known_core_packages.items() if base_name.startswith(prefix)),
         None,
     )
     if matched_package is not None:
         return matched_package, f"Core system library: {lib_path}"

     # Last resort: describe the file itself and label it a generic system library.
     description = run_command(['file', lib_path])
     if not description:
         return None
     return "system-library", f"System library: {lib_path} - {description.strip()}"

 def print_summary(packages, special_cases, missing_libraries, binary_path):
     """
     Print the dependency table for one binary, then any missing/special lists.

     Args:
         packages (list): ``(package_name, details)`` tuples. Duplicates are
             collapsed and the rows are printed in sorted order.
         special_cases (list): Special-case messages, printed one per line.
         missing_libraries (list): Names of libraries that could not be resolved.
         binary_path (str): Path of the analyzed binary. Accepted as part of
             the interface; the function body does not use it.
     """
     print("\nSummary of runtime dependencies:")

     columns = ['Category', 'Package/Library', 'Details']
     table = PrettyTable(columns)
     for column in columns:
         table.align[column] = 'l'

     # Pseudo-packages get their own label; every other name is a real package.
     category_labels = {
         'cloudberry-custom': 'Cloudberry Custom',
         'system-library': 'System Library',
     }

     unique_rows = sorted(set(packages))
     for name, details in unique_rows:
         label = category_labels.get(name, 'System Package')
         table.add_row([label, name, details])

     print(table)

     # Both optional sections share the same layout: heading, then bullet list.
     optional_sections = (
         ("\nMISSING LIBRARIES:", missing_libraries),
         ("\nSPECIAL CASES:", special_cases),
     )
     for heading, entries in optional_sections:
         if entries:
             print(heading)
             for entry in entries:
                 print(f"  - {entry}")

 def process_binary(binary_path):
     """
     Resolve each shared-library dependency of one binary to a package.

     Runs ``ldd`` on the binary, classifies every ``name => target`` line as a
     packaged library, a special case, or a missing library, echoes the result
     line by line, and finishes by calling ``print_summary``.

     Args:
         binary_path (str): Path to the binary file.

     Returns:
         tuple: ``(packages, special_cases, missing_libraries)``. ``packages``
         is a list of ``(package_name, details)`` tuples, ``special_cases`` a
         list of message strings, ``missing_libraries`` a list of library
         names. All three are empty (and no summary is printed) if ``ldd`` fails.
     """
     print(f"Binary: {binary_path}\n")
     print("Libraries and their corresponding packages:")
     packages, special_cases, missing_libraries = [], [], []

     ldd_output = run_command(['ldd', binary_path])
     if ldd_output is None:
         return packages, special_cases, missing_libraries

     for line in ldd_output.splitlines():
         if "=>" not in line:
             continue

         lib_name, _, resolution = line.partition('=>')
         lib_name = lib_name.strip()
         tokens = resolution.split()

         # Nothing path-like after "=>" (empty, or only a "(0x...)" load
         # address): there is no file to look up, and indexing an empty token
         # list would raise IndexError.
         if not tokens or tokens[0].startswith('('):
             continue

         # Unresolved dependency ("libfoo.so => not found"). This must be
         # checked on the raw text: realpath("not") turns the marker into an
         # absolute path, which is why the old post-realpath comparison with
         # "not" could never match and missing libraries went unreported.
         if tokens[:2] == ['not', 'found']:
             missing_libraries.append(lib_name)
             print(f"MISSING: {line.strip()}")
             continue

         # Follow symlinks so the lookup uses the real file on disk.
         lib_path = os.path.realpath(tokens[0])
         package_info = get_package_info(lib_path)
         if package_info:
             print(f"{lib_path} => {package_info[1]}")
             packages.append(package_info)
         else:
             special_case = f"{lib_path} is not found and might be a special case"
             special_cases.append(special_case)
             print(f"{lib_path} => Not found, might be a special case")

     print_summary(packages, special_cases, missing_libraries, binary_path)
     print("-------------------------------------------")
     return packages, special_cases, missing_libraries

 def is_elf_binary(file_path):
     """
     Report whether ``file`` classifies a path as an ELF executable or shared object.

     Args:
         file_path (str): Path to the file.

     Returns:
         bool: True if the ``file`` description mentions ``ELF`` together with
         ``executable`` or ``shared object``; False otherwise, including when
         ``file`` could not be run for this path.
     """
     # -b (brief) omits the leading "<path>:" so words that merely appear in
     # the file NAME cannot satisfy the substring checks below.
     file_output = run_command(['file', '-b', file_path])
     if not file_output:
         # run_command returns None on failure; "'ELF' in None" would raise
         # TypeError and abort the whole directory walk.
         return False
     return 'ELF' in file_output and ('executable' in file_output or 'shared object' in file_output)

 def print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries):
     """
     Print the combined report for every binary that was analyzed.

     Nothing at all is printed when all three collections are empty.

     Args:
         grand_summary (dict): Package name -> collection of detail strings.
         grand_special_cases (list): ``(case_message, binary_path)`` tuples.
         grand_missing_libraries (dict): Missing library name -> collection of
             binaries that reference it.
     """
     if not (grand_summary or grand_special_cases or grand_missing_libraries):
         return

     def make_table(columns):
         # Every table in the report is left-aligned in all of its columns.
         table = PrettyTable(columns)
         for column in columns:
             table.align[column] = 'l'
         return table

     print("\nGrand Summary of runtime packages required across all binaries:")
     package_table = make_table(['Package', 'Included Packages'])
     for name, detail_set in sorted(grand_summary.items()):
         package_table.add_row([name, '\n'.join(sorted(detail_set))])
     print(package_table)

     if grand_missing_libraries:
         print("\nGrand Summary of MISSING LIBRARIES across all binaries:")
         missing_table = make_table(['Missing Library', 'Referenced By'])
         for library, referencing in sorted(grand_missing_libraries.items()):
             missing_table.add_row([library, '\n'.join(sorted(referencing))])
         print(missing_table)

     print("\nGrand Summary of special cases across all binaries:")
     if not grand_special_cases:
         print("No special cases found.")
         return

     special_table = make_table(['Library/Case', 'Referenced By', 'Category'])
     for case_text, referencing_binary in sorted(set(grand_special_cases)):
         if "system library" in case_text:
             category = "System Library"
         else:
             category = "Other"
         # Keep only the part before the first " is " (the whole message if absent).
         library = case_text.partition(" is ")[0]
         special_table.add_row([library, referencing_binary, category])
     print(special_table)

 def analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries):
     """
     Analyze one file, or every file below one directory, and record the results.

     Each candidate accepted by ``is_elf_binary`` is passed to
     ``process_binary`` and its results are merged into the three accumulators,
     which are modified in place.

     Args:
         path (str): File or directory to analyze.
         grand_summary (dict): Package name -> set of detail strings.
         grand_special_cases (list): Receives ``(case_message, binary_path)`` tuples.
         grand_missing_libraries (dict): Library name -> set of referencing binaries.
     """
     if os.path.isfile(path):
         candidates = [path]
     elif os.path.isdir(path):
         # Lazy walk: files are examined as the directory tree is traversed.
         candidates = (
             os.path.join(root, name)
             for root, _subdirs, names in os.walk(path)
             for name in names
         )
     else:
         print(f"Error: {path} is neither a valid file nor a directory.")
         return

     for candidate in candidates:
         if not is_elf_binary(candidate):
             continue
         found_packages, found_special, found_missing = process_binary(candidate)
         for package_name, details in found_packages:
             grand_summary[package_name].add(details)
         grand_special_cases.extend([(case, candidate) for case in found_special])
         for library in found_missing:
             grand_missing_libraries[library].add(candidate)

 def main():
     """
     Command-line entry point: parse the paths, analyze each, print the grand summary.
     """
     arg_parser = argparse.ArgumentParser(description="ELF Dependency Analyzer for Ubuntu")
     arg_parser.add_argument('paths', nargs='+', help="Paths to files or directories to analyze")
     requested_paths = arg_parser.parse_args().paths

     # Accumulators shared by every analyzed path; analyze_path fills them in place.
     package_index = defaultdict(set)
     special_case_log = []
     missing_index = defaultdict(set)

     for target in requested_paths:
         analyze_path(target, package_index, special_case_log, missing_index)

     print_grand_summary(package_index, special_case_log, missing_index)

 # Run the analyzer only when this file is executed as a script, not when it is imported.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python3
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	ELF Dependency Analyzer for Ubuntu

	This script analyzes ELF (Executable and Linkable Format) binaries to determine their runtime
	package dependencies on Ubuntu systems. It can process individual files or recursively analyze directories.

	The script provides information about:
	- Required packages and their versions
	- Custom or non-APT libraries
	- Core system libraries
	- Missing libraries
	- Other special cases

	Usage:
	python3 elf_ubuntu_dependency_analyzer.py [file_or_directory] [file_or_directory] ...

	Requirements:
	- Python 3.6+
	- prettytable (pip install prettytable)
	- ldd (usually pre-installed on Linux systems)
	- file (usually pre-installed on Linux systems)
	- dpkg (pre-installed on Ubuntu)
	"""

	import os
	import subprocess
	import re
	import sys
	import argparse
	from collections import defaultdict
	from prettytable import PrettyTable

	def run_command(command):
	    """
	    Run an external command and return its combined stdout/stderr as text.

	    Args:
	        command (list): Program and arguments as a list of strings. The list
	            is handed to ``subprocess`` directly; no shell is involved.

	    Returns:
	        str | None: The decoded output on success. ``None`` when the command
	        exits with a non-zero status or cannot be started at all (for example
	        the executable is not installed); a diagnostic is printed either way.
	    """
	    try:
	        raw_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
	    except subprocess.CalledProcessError as e:
	        # stderr is merged into stdout above, so e.output carries the tool's message.
	        details = (e.output or b'').decode('utf-8', errors='replace').strip()
	        print(f"Error running command {' '.join(command)}: {details}")
	        return None
	    except OSError as e:
	        # The program could not be launched (missing binary, permission denied, ...).
	        # Without this branch the documented "None on error" contract was broken.
	        print(f"Error running command {' '.join(command)}: {e}")
	        return None
	    # Tool output may echo file names that are not valid UTF-8; never crash on them.
	    return raw_output.decode('utf-8', errors='replace')

	def get_package_info(lib_path):
	    """
	    Work out which package (or fallback category) provides a library.

	    Lookups are tried in this order: the Cloudberry install prefix,
	    ``dpkg -S``, a built-in table of core system libraries, and finally a
	    generic description obtained from ``file``.

	    Args:
	        lib_path (str): Path of the library to look up.

	    Returns:
	        tuple | None: ``(package_name, details)`` from the first lookup that
	        succeeds, or ``None`` when every lookup fails.
	    """
	    # Anything under the Cloudberry prefix is reported as custom without asking dpkg.
	    if lib_path.startswith('/usr/local/cloudberry-db'):
	        return "cloudberry-custom", f"Cloudberry custom library: {lib_path}"

	    dpkg_output = run_command(['dpkg', '-S', lib_path])
	    if dpkg_output:
	        # The text before the first colon is used as the package name.
	        package_name = dpkg_output.split(':')[0]
	        return package_name, dpkg_output.strip()

	    # Fallback table (library name prefix -> package) for core libraries
	    # that dpkg did not resolve for this path.
	    core_libs = {
	        'libc.so': 'libc6',
	        'libm.so': 'libc6',
	        'libdl.so': 'libc6',
	        'libpthread.so': 'libc6',
	        'libresolv.so': 'libc6',
	        'librt.so': 'libc6',
	        'libgcc_s.so': 'libgcc-s1',
	        'libstdc++.so': 'libstdc++6',
	        'libz.so': 'zlib1g',
	        'libbz2.so': 'libbz2-1.0',
	        'libpam.so': 'libpam0g',
	        'libaudit.so': 'libaudit1',
	        'libcap-ng.so': 'libcap-ng0',
	        'libkeyutils.so': 'libkeyutils1',
	        'liblzma.so': 'liblzma5',
	        'libcom_err.so': 'libcomerr2'
	    }

	    lib_name = os.path.basename(lib_path)
	    for core_lib, package in core_libs.items():
	        if lib_name.startswith(core_lib):
	            return package, f"Core system library: {lib_path}"

	    # Last resort: describe the file itself and label it a generic system library.
	    file_output = run_command(['file', lib_path])
	    if file_output:
	        return "system-library", f"System library: {lib_path} - {file_output.strip()}"

	    return None

	def print_summary(packages, special_cases, missing_libraries, binary_path):
	    """
	    Print the dependency table for one binary, then any missing/special lists.

	    Args:
	        packages (list): ``(package_name, details)`` tuples. Duplicates are
	            collapsed and the rows are printed in sorted order.
	        special_cases (list): Special-case messages, printed one per line.
	        missing_libraries (list): Names of libraries that could not be resolved.
	        binary_path (str): Path of the analyzed binary. Accepted as part of
	            the interface; the function body does not use it.
	    """
	    print("\nSummary of runtime dependencies:")
	    table = PrettyTable(['Category', 'Package/Library', 'Details'])
	    table.align['Category'] = 'l'
	    table.align['Package/Library'] = 'l'
	    table.align['Details'] = 'l'

	    # Pseudo-packages get their own label; every other name is a real package.
	    categories = {
	        'cloudberry-custom': 'Cloudberry Custom',
	        'system-library': 'System Library',
	    }

	    for package_name, full_package_name in sorted(set(packages)):
	        category = categories.get(package_name, 'System Package')
	        table.add_row([category, package_name, full_package_name])

	    print(table)

	    # Bullets use a two-space indent, matching the intact copy of this
	    # function earlier in the file (whitespace was collapsed here).
	    if missing_libraries:
	        print("\nMISSING LIBRARIES:")
	        for lib in missing_libraries:
	            print(f"  - {lib}")

	    if special_cases:
	        print("\nSPECIAL CASES:")
	        for case in special_cases:
	            print(f"  - {case}")

	def process_binary(binary_path):
	    """
	    Resolve each shared-library dependency of one binary to a package.

	    Runs ``ldd`` on the binary, classifies every ``name => target`` line as a
	    packaged library, a special case, or a missing library, echoes the result
	    line by line, and finishes by calling ``print_summary``.

	    Args:
	        binary_path (str): Path to the binary file.

	    Returns:
	        tuple: ``(packages, special_cases, missing_libraries)``. ``packages``
	        is a list of ``(package_name, details)`` tuples, ``special_cases`` a
	        list of message strings, ``missing_libraries`` a list of library
	        names. All three are empty (and no summary is printed) if ``ldd`` fails.
	    """
	    print(f"Binary: {binary_path}\n")
	    print("Libraries and their corresponding packages:")
	    packages, special_cases, missing_libraries = [], [], []

	    ldd_output = run_command(['ldd', binary_path])
	    if ldd_output is None:
	        return packages, special_cases, missing_libraries

	    for line in ldd_output.splitlines():
	        if "=>" not in line:
	            continue

	        lib_name, _, resolution = line.partition('=>')
	        lib_name = lib_name.strip()
	        tokens = resolution.split()

	        # Nothing path-like after "=>" (empty, or only a "(0x...)" load
	        # address): there is no file to look up, and indexing an empty token
	        # list would raise IndexError.
	        if not tokens or tokens[0].startswith('('):
	            continue

	        # Unresolved dependency ("libfoo.so => not found"). This must be
	        # checked on the raw text: realpath("not") turns the marker into an
	        # absolute path, which is why the old post-realpath comparison with
	        # "not" could never match and missing libraries went unreported.
	        if tokens[:2] == ['not', 'found']:
	            missing_libraries.append(lib_name)
	            print(f"MISSING: {line.strip()}")
	            continue

	        # Follow symlinks so the lookup uses the real file on disk.
	        lib_path = os.path.realpath(tokens[0])
	        package_info = get_package_info(lib_path)
	        if package_info:
	            print(f"{lib_path} => {package_info[1]}")
	            packages.append(package_info)
	        else:
	            special_case = f"{lib_path} is not found and might be a special case"
	            special_cases.append(special_case)
	            print(f"{lib_path} => Not found, might be a special case")

	    print_summary(packages, special_cases, missing_libraries, binary_path)
	    print("-------------------------------------------")
	    return packages, special_cases, missing_libraries

	def is_elf_binary(file_path):
	    """
	    Report whether ``file`` classifies a path as an ELF executable or shared object.

	    Args:
	        file_path (str): Path to the file.

	    Returns:
	        bool: True if the ``file`` description mentions ``ELF`` together with
	        ``executable`` or ``shared object``; False otherwise, including when
	        ``file`` could not be run for this path.
	    """
	    # -b (brief) omits the leading "<path>:" so words that merely appear in
	    # the file NAME cannot satisfy the substring checks below.
	    file_output = run_command(['file', '-b', file_path])
	    if not file_output:
	        # run_command returns None on failure; "'ELF' in None" would raise
	        # TypeError and abort the whole directory walk.
	        return False
	    return 'ELF' in file_output and ('executable' in file_output or 'shared object' in file_output)

	def print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries):
	    """
	    Print the combined report for every binary that was analyzed.

	    Nothing at all is printed when all three collections are empty.

	    Args:
	        grand_summary (dict): Package name -> collection of detail strings.
	        grand_special_cases (list): ``(case_message, binary_path)`` tuples.
	        grand_missing_libraries (dict): Missing library name -> collection of
	            binaries that reference it.
	    """
	    if grand_summary or grand_special_cases or grand_missing_libraries:
	        print("\nGrand Summary of runtime packages required across all binaries:")
	        table = PrettyTable(['Package', 'Included Packages'])
	        table.align['Package'] = 'l'
	        table.align['Included Packages'] = 'l'
	        for package_name, full_package_names in sorted(grand_summary.items()):
	            included_packages = '\n'.join(sorted(full_package_names))
	            table.add_row([package_name, included_packages])
	        print(table)

	        if grand_missing_libraries:
	            print("\nGrand Summary of MISSING LIBRARIES across all binaries:")
	            missing_table = PrettyTable(['Missing Library', 'Referenced By'])
	            missing_table.align['Missing Library'] = 'l'
	            missing_table.align['Referenced By'] = 'l'
	            for lib, binaries in sorted(grand_missing_libraries.items()):
	                missing_table.add_row([lib, '\n'.join(sorted(binaries))])
	            print(missing_table)

	        print("\nGrand Summary of special cases across all binaries:")
	        if grand_special_cases:
	            special_table = PrettyTable(['Library/Case', 'Referenced By', 'Category'])
	            special_table.align['Library/Case'] = 'l'
	            special_table.align['Referenced By'] = 'l'
	            special_table.align['Category'] = 'l'
	            # Identical (case, binary) pairs are collapsed before printing.
	            for case, binary in sorted(set(grand_special_cases)):
	                category = "System Library" if "system library" in case else "Other"
	                # Keep only the part before the first " is " (the whole message if absent).
	                library = case.split(" is ")[0] if " is " in case else case
	                special_table.add_row([library, binary, category])
	            print(special_table)
	        else:
	            print("No special cases found.")

	def analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries):
	    """
	    Analyze one file, or every file below one directory, and record the results.

	    Each candidate accepted by ``is_elf_binary`` is passed to
	    ``process_binary`` and its results are merged into the three accumulators,
	    which are modified in place.

	    Args:
	        path (str): File or directory to analyze.
	        grand_summary (dict): Package name -> set of detail strings.
	        grand_special_cases (list): Receives ``(case_message, binary_path)`` tuples.
	        grand_missing_libraries (dict): Library name -> set of referencing binaries.
	    """
	    if os.path.isfile(path):
	        candidates = [path]
	    elif os.path.isdir(path):
	        # Lazy walk: files are examined as the directory tree is traversed.
	        candidates = (
	            os.path.join(root, name)
	            for root, _subdirs, names in os.walk(path)
	            for name in names
	        )
	    else:
	        print(f"Error: {path} is neither a valid file nor a directory.")
	        return

	    # One shared loop replaces the duplicated file/directory branches.
	    for candidate in candidates:
	        if not is_elf_binary(candidate):
	            continue
	        packages, special_cases, missing_libraries = process_binary(candidate)
	        for package_name, full_package_name in packages:
	            grand_summary[package_name].add(full_package_name)
	        grand_special_cases.extend([(case, candidate) for case in special_cases])
	        for lib in missing_libraries:
	            grand_missing_libraries[lib].add(candidate)

	def main():
	    """
	    Command-line entry point: parse the paths, analyze each, print the grand summary.
	    """
	    parser = argparse.ArgumentParser(description="ELF Dependency Analyzer for Ubuntu")
	    parser.add_argument('paths', nargs='+', help="Paths to files or directories to analyze")
	    args = parser.parse_args()

	    # Accumulators shared by every analyzed path; analyze_path fills them in place.
	    grand_summary = defaultdict(set)
	    grand_special_cases = []
	    grand_missing_libraries = defaultdict(set)

	    for path in args.paths:
	        analyze_path(path, grand_summary, grand_special_cases, grand_missing_libraries)

	    print_grand_summary(grand_summary, grand_special_cases, grand_missing_libraries)

	# Run the analyzer only when this file is executed as a script, not when it is imported.
	if __name__ == '__main__':
	    main()