import os
import re
import shlex
import sys
from collections import namedtuple

# Record describing one dead link and its proposed fix. ``sed_str`` (and
# ``relative_url``) are only filled in when exactly one candidate file exists.
FileInfo = namedtuple(
    "FileInfo",
    [
        "target_file",
        "url_line",
        "url_path",
        "url_count",
        "relative_url",
        "log_error",
        "origin_url",
        "sed_str",
    ],
)

# The two quoted fields of a dead-link log line, e.g.
#   ... link '../foo/bar' in file 'docs/a/page.md:67'
_BASE_FILE_RE = re.compile(r"in file '([^']+)'")
_LINK_RE = re.compile(r"link '([^']+)'")


def _sed_escape_pattern(text):
    """Escape *text* so sed matches it literally in a BRE delimited by ``|``."""
    return re.sub(r"([\\.*\[^$|])", r"\\\1", text)


def _sed_escape_replacement(text):
    """Escape *text* for the replacement side of ``s|...|...|``."""
    return re.sub(r"([\\&|])", r"\\\1", text)


def find_file(file_str, search_dir, line_content):
    """Resolve one dead-link log line to a candidate replacement target.

    ``file_str`` is searched for ``in file '<path>[:<line>]'`` (the document
    containing the dead link) and ``link '<url>'`` (the dead link itself).
    The link's base name, with ``.md`` appended when missing, is then looked
    up recursively under ``search_dir``.

    Args:
        file_str: Log line to parse for the document path and the link.
        search_dir: Directory tree searched for the link's target file.
        line_content: Log line stored in ``log_error``, echoed in diagnostics,
            and parsed for ``origin_url``.

    Returns:
        A list with one ``FileInfo`` when at least one matching file exists,
        otherwise an empty list (a diagnostic is printed instead).
        ``url_count`` is the number of matches and ``url_path`` is the last
        match found, relative to the current working directory. When there
        is exactly one match, ``relative_url`` is its path relative to the
        document's directory (``./``-prefixed unless it starts with ``../``,
        ``.md`` suffix removed) and ``sed_str`` is a shell-quoted ``sed -i``
        command that rewrites ``(origin_url)`` to ``(relative_url)`` on the
        reported line. Otherwise both are empty strings.
    """
    base_match = _BASE_FILE_RE.search(file_str)
    if base_match is None:
        print("No valid base file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return []

    # "path/to/doc.md:67" -> document path and the line number after the colon.
    parts = base_match.group(1).split(":")
    base_file_path = parts[0]
    line_number = parts[1] if len(parts) > 1 else ""

    link_match = _LINK_RE.search(file_str)
    if link_match is None:
        print("No valid file path found in the input string.")
        print(f"Error log: {line_content}")
        print("-" * 80)
        return []

    # Only the base name of the link is used for the lookup.
    file_base_name = os.path.basename(link_match.group(1))
    if file_base_name.endswith(".md"):
        target_filename = file_base_name
    else:
        target_filename = f"{file_base_name}.md"

    found_files = [
        os.path.join(root, target_filename)
        for root, _dirs, files in os.walk(search_dir)
        if target_filename in files
    ]
    if not found_files:
        print(f"[ERR] No file named {target_filename} found in {search_dir}.")
        print(f"[ERR] Error log: {line_content}")
        print("-" * 80)
        return []

    url_count = len(found_files)
    # With several candidates, the last one discovered is the one reported.
    url_path = os.path.relpath(found_files[-1], os.getcwd())

    origin_match = _LINK_RE.search(line_content)
    origin_url = origin_match.group(1) if origin_match else ""

    relative_url = ""
    sed_str = ""
    # A rewrite is only proposed when the target is unambiguous.
    if url_count == 1:
        relative_url = os.path.relpath(
            found_files[0], os.path.dirname(base_file_path)
        )
        if not relative_url.startswith("../"):
            relative_url = "./" + relative_url
        if relative_url.endswith(".md"):
            relative_url = relative_url[:-3]

        # Escape sed metacharacters so the old URL is matched literally, and
        # shell-quote both arguments so unusual paths cannot break the command.
        sed_script = (
            f"{line_number}s|({_sed_escape_pattern(origin_url)})"
            f"|({_sed_escape_replacement(relative_url)})|"
        )
        sed_str = (
            f"sed -i {shlex.quote(sed_script)} {shlex.quote(base_file_path)}"
        )

    return [
        FileInfo(
            target_file=base_file_path,
            url_line=line_number,
            url_path=url_path,
            url_count=url_count,
            relative_url=relative_url,
            log_error=line_content,
            origin_url=origin_url,
            sed_str=sed_str,
        )
    ]


def get_deadlink(file_path, search_dir):
    """Run ``find_file`` over every non-blank line of a dead-link log.

    Args:
        file_path: Path of the log file, one dead-link record per line.
        search_dir: Directory tree passed to ``find_file`` for each record.

    Returns:
        The concatenated ``find_file`` results for all lines, or an empty
        list when ``file_path`` is not a regular file (a message is printed).
    """
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a valid file.")
        return []

    results = []
    # Decode as UTF-8 explicitly so non-ASCII paths do not depend on the
    # platform locale; undecodable bytes are replaced rather than aborting.
    with open(file_path, "r", encoding="utf-8", errors="replace") as log_file:
        for raw_line in log_file:
            line = raw_line.strip()
            if not line:
                # Blank lines hold no record; skipping them avoids a spurious
                # "No valid base file path" diagnostic for each one.
                continue
            # The stripped line is both the text to parse and the error log.
            results.extend(find_file(line, search_dir, line))
    return results


def print_results(results):
    """Print each record's fields as ``[LOG] <field> >> <value>`` lines.

    Every record is followed by a dashed separator line. ``results`` is an
    iterable of objects exposing the ``FileInfo`` attributes read below.
    """
    for record in results:
        report_lines = [
            f"[LOG] target_file >> {record.target_file}",
            f"[LOG] url_line >> {record.url_line}",
            f"[LOG] url_path >> {record.url_path}",
            f"[LOG] url_count >> {record.url_count}",
            f"[LOG] relative_url >> {record.relative_url}",
            f"[LOG] log_error >> {record.log_error}",
            f"[LOG] origin_url >> {record.origin_url}",
            f"[LOG] sed_str >> {record.sed_str}",
            "----------------------------------------------------------------",
        ]
        # One write per record; the emitted text is the same nine lines.
        print("\n".join(report_lines))


if __name__ == "__main__":
    # Exactly two positional arguments are required: the log file to read and
    # the directory tree to search for replacement targets.
    if len(sys.argv) == 3:
        file_with_logs, search_dir = sys.argv[1:]
    else:
        print("Usage: python find_file.py '<file_with_logs>' <search_dir>")
        sys.exit(1)

    # Parse the log, then dump every collected record.
    results = get_deadlink(file_with_logs, search_dir)
    print_results(results)
