modify-deadlink.py - doris-website - Git at Google

 import os
 import sys
 import re
 from collections import namedtuple

 # Record describing one dead link reported in the log and the fix proposed for it:
 #   target_file  - document that contains the dead link
 #   url_line     - line number inside target_file, kept as a string ('' when the log has none)
 #   url_path     - last matching file found, relative to the current working directory
 #   url_count    - how many files under the search directory match the link's file name
 #   relative_url - replacement link (filled in only when url_count is 1)
 #   log_error    - the log line the record was built from
 #   origin_url   - the dead link as quoted in the log line
 #   sed_str      - sed command applying the fix (filled in only when url_count is 1)
 FileInfo = namedtuple('FileInfo', ['target_file', 'url_line', 'url_path', 'url_count', 'relative_url', 'log_error', 'origin_url', 'sed_str'])


 def _escape_sed_pattern(text):
     """Backslash-escape what is special in a sed basic regex delimited by '|'."""
     return re.sub(r"[\\.*\[|]", lambda hit: "\\" + hit.group(0), text)


 def _escape_sed_replacement(text):
     """Backslash-escape what is special in a sed replacement delimited by '|'."""
     return re.sub(r"[\\&|]", lambda hit: "\\" + hit.group(0), text)


 def find_file(file_str, search_dir, line_content):
     """Resolve one dead-link log line to the markdown file it should point at.

     The log line is expected to contain ``link '<url>'`` and
     ``in file '<path>[:<line>]'``. The file name of ``<url>`` (with ``.md``
     appended when missing, and any ``#anchor`` ignored for the lookup) is
     searched for under ``search_dir``.

     Args:
         file_str: Log line used to extract the source file and the link.
         search_dir: Directory tree walked to find candidate target files.
         line_content: Log line stored in the record and used to extract
             ``origin_url`` (the caller passes the same text as ``file_str``).

     Returns:
         A list holding one ``FileInfo`` when at least one candidate file was
         found, otherwise an empty list (a diagnostic is printed instead).
         ``relative_url`` and ``sed_str`` are only filled in when exactly one
         candidate exists; with several candidates they stay empty.
     """
     import shlex  # local import: the module header does not import shlex

     link_regex = r"link '([^']+)'"
     results = []

     base_match = re.search(r"in file '([^']+)'", file_str)
     if not base_match:
         print("No valid base file path found in the input string.")
         print(f"Error log: {line_content}")
         print("-" * 80)
         return results

     # "docs/page.md:67" -> ("docs/page.md", "67"); without a colon the line number is "".
     base_file_path, _, after_colon = base_match.group(1).partition(":")
     line_number = after_colon.split(":", 1)[0]

     link_match = re.search(link_regex, file_str)
     if not link_match:
         print("No valid file path found in the input string.")
         print(f"Error log: {line_content}")
         print("-" * 80)
         return results

     # An anchor is not part of the file name; keep it aside so it can be
     # re-attached to the corrected link.
     link_path, hash_sep, fragment = link_match.group(1).partition("#")
     file_base_name = os.path.basename(link_path)
     if file_base_name.endswith(".md"):
         target_filename = file_base_name
     else:
         target_filename = f"{file_base_name}.md"

     found_files = [
         os.path.join(root, target_filename)
         for root, _dirs, files in os.walk(search_dir)
         if target_filename in files
     ]
     if not found_files:
         print(f"[ERR] No file named {target_filename} found in {search_dir}.")
         print(f"[ERR] Error log: {line_content}")
         print("-" * 80)
         return results

     url_count = len(found_files)
     # Only the last candidate is reported in url_path.
     url_path = os.path.relpath(found_files[-1], os.getcwd())

     origin_url_match = re.search(link_regex, line_content)
     origin_url = origin_url_match.group(1) if origin_url_match else ""

     relative_url = ""
     sed_str = ""
     if url_count == 1:
         # A unique candidate is unambiguous, so a concrete fix can be proposed.
         relative_url = os.path.relpath(found_files[0], os.path.dirname(base_file_path))
         relative_url = relative_url.replace(os.sep, "/")  # links always use '/'
         if not relative_url.startswith("../"):
             relative_url = "./" + relative_url
         if relative_url.endswith(".md"):
             relative_url = relative_url[:-3]
         relative_url += hash_sep + fragment

         # The parentheses are literal in a sed basic regex: they anchor the
         # substitution to the "(url)" part of a markdown link.
         sed_expr = "{}s|({})|({})|".format(
             line_number,
             _escape_sed_pattern(origin_url),
             _escape_sed_replacement(relative_url),
         )
         sed_str = f"sed -i {shlex.quote(sed_expr)} {shlex.quote(base_file_path)}"

     results.append(
         FileInfo(
             target_file=base_file_path,
             url_line=line_number,
             url_path=url_path,
             url_count=url_count,
             relative_url=relative_url,
             log_error=line_content,
             origin_url=origin_url,
             sed_str=sed_str,
         )
     )
     return results

 # Read the dead-link log and resolve every entry through find_file
 def get_deadlink(file_path, search_dir):
     """Collect the records produced by ``find_file`` for each line of a log file.

     Args:
         file_path: Path of the log file, one dead-link report per line.
         search_dir: Directory tree handed to ``find_file`` for the lookup.

     Returns:
         The concatenated ``find_file`` results, in log order. An empty list
         is returned (after printing a message) when ``file_path`` is not a
         regular file.
     """
     results = []
     if not os.path.isfile(file_path):
         print(f"{file_path} is not a valid file.")
         return results

     # Decode explicitly as UTF-8 so the result does not depend on the locale.
     with open(file_path, 'r', encoding='utf-8') as log_file:
         for raw_line in log_file:
             line = raw_line.strip()
             if not line:
                 # A blank line is not a log entry; passing it on would only
                 # print a spurious "No valid base file path" report.
                 continue
             results.extend(find_file(line, search_dir, line))

     return results

 # Dump every collected record to stdout, one labelled field per line
 def print_results(results):
     """Print each record as '[LOG] <field> >> <value>' lines followed by a dashed rule.

     Args:
         results: Iterable of records exposing the attributes listed in
             ``field_names`` below.
     """
     field_names = (
         "target_file",
         "url_line",
         "url_path",
         "url_count",
         "relative_url",
         "log_error",
         "origin_url",
         "sed_str",
     )
     for record in results:
         for field_name in field_names:
             print(f"[LOG] {field_name} >> {getattr(record, field_name)}")
         print("----------------------------------------------------------------")

 if __name__ == "__main__":
     # Exactly two arguments are required: the log file and the directory to search.
     if len(sys.argv) != 3:
         # Report the name the script was actually started with, on stderr,
         # so the hint stays correct if the file is renamed.
         script_name = os.path.basename(sys.argv[0])
         print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>", file=sys.stderr)
         sys.exit(1)

     file_with_logs = sys.argv[1]  # log file listing the dead links
     search_dir = sys.argv[2]  # directory tree searched for the link targets

     # Resolve every log entry, then print the collected records.
     results = get_deadlink(file_with_logs, search_dir)
     print_results(results)
	import os
	import sys
	import re
	from collections import namedtuple

	# Record describing one dead link reported in the log and the fix proposed for it:
	#   target_file  - document that contains the dead link
	#   url_line     - line number inside target_file, kept as a string ('' when the log has none)
	#   url_path     - last matching file found, relative to the current working directory
	#   url_count    - how many files under the search directory match the link's file name
	#   relative_url - replacement link (filled in only when url_count is 1)
	#   log_error    - the log line the record was built from
	#   origin_url   - the dead link as quoted in the log line
	#   sed_str      - sed command applying the fix (filled in only when url_count is 1)
	FileInfo = namedtuple('FileInfo', ['target_file', 'url_line', 'url_path', 'url_count', 'relative_url', 'log_error', 'origin_url', 'sed_str'])


	def _escape_sed_pattern(text):
	    """Backslash-escape what is special in a sed basic regex delimited by '|'."""
	    return re.sub(r"[\\.*\[|]", lambda hit: "\\" + hit.group(0), text)


	def _escape_sed_replacement(text):
	    """Backslash-escape what is special in a sed replacement delimited by '|'."""
	    return re.sub(r"[\\&|]", lambda hit: "\\" + hit.group(0), text)


	def find_file(file_str, search_dir, line_content):
	    """Resolve one dead-link log line to the markdown file it should point at.

	    The log line is expected to contain ``link '<url>'`` and
	    ``in file '<path>[:<line>]'``. The file name of ``<url>`` (with ``.md``
	    appended when missing, and any ``#anchor`` ignored for the lookup) is
	    searched for under ``search_dir``.

	    Args:
	        file_str: Log line used to extract the source file and the link.
	        search_dir: Directory tree walked to find candidate target files.
	        line_content: Log line stored in the record and used to extract
	            ``origin_url`` (the caller passes the same text as ``file_str``).

	    Returns:
	        A list holding one ``FileInfo`` when at least one candidate file was
	        found, otherwise an empty list (a diagnostic is printed instead).
	        ``relative_url`` and ``sed_str`` are only filled in when exactly one
	        candidate exists; with several candidates they stay empty.
	    """
	    import shlex  # local import: the module header does not import shlex

	    link_regex = r"link '([^']+)'"
	    results = []

	    base_match = re.search(r"in file '([^']+)'", file_str)
	    if not base_match:
	        print("No valid base file path found in the input string.")
	        print(f"Error log: {line_content}")
	        print("-" * 80)
	        return results

	    # "docs/page.md:67" -> ("docs/page.md", "67"); without a colon the line number is "".
	    base_file_path, _, after_colon = base_match.group(1).partition(":")
	    line_number = after_colon.split(":", 1)[0]

	    link_match = re.search(link_regex, file_str)
	    if not link_match:
	        print("No valid file path found in the input string.")
	        print(f"Error log: {line_content}")
	        print("-" * 80)
	        return results

	    # An anchor is not part of the file name; keep it aside so it can be
	    # re-attached to the corrected link.
	    link_path, hash_sep, fragment = link_match.group(1).partition("#")
	    file_base_name = os.path.basename(link_path)
	    if file_base_name.endswith(".md"):
	        target_filename = file_base_name
	    else:
	        target_filename = f"{file_base_name}.md"

	    found_files = [
	        os.path.join(root, target_filename)
	        for root, _dirs, files in os.walk(search_dir)
	        if target_filename in files
	    ]
	    if not found_files:
	        print(f"[ERR] No file named {target_filename} found in {search_dir}.")
	        print(f"[ERR] Error log: {line_content}")
	        print("-" * 80)
	        return results

	    url_count = len(found_files)
	    # Only the last candidate is reported in url_path.
	    url_path = os.path.relpath(found_files[-1], os.getcwd())

	    origin_url_match = re.search(link_regex, line_content)
	    origin_url = origin_url_match.group(1) if origin_url_match else ""

	    relative_url = ""
	    sed_str = ""
	    if url_count == 1:
	        # A unique candidate is unambiguous, so a concrete fix can be proposed.
	        relative_url = os.path.relpath(found_files[0], os.path.dirname(base_file_path))
	        relative_url = relative_url.replace(os.sep, "/")  # links always use '/'
	        if not relative_url.startswith("../"):
	            relative_url = "./" + relative_url
	        if relative_url.endswith(".md"):
	            relative_url = relative_url[:-3]
	        relative_url += hash_sep + fragment

	        # The parentheses are literal in a sed basic regex: they anchor the
	        # substitution to the "(url)" part of a markdown link.
	        sed_expr = "{}s|({})|({})|".format(
	            line_number,
	            _escape_sed_pattern(origin_url),
	            _escape_sed_replacement(relative_url),
	        )
	        sed_str = f"sed -i {shlex.quote(sed_expr)} {shlex.quote(base_file_path)}"

	    results.append(
	        FileInfo(
	            target_file=base_file_path,
	            url_line=line_number,
	            url_path=url_path,
	            url_count=url_count,
	            relative_url=relative_url,
	            log_error=line_content,
	            origin_url=origin_url,
	            sed_str=sed_str,
	        )
	    )
	    return results

	# Read the dead-link log and resolve every entry through find_file
	def get_deadlink(file_path, search_dir):
	    """Collect the records produced by ``find_file`` for each line of a log file.

	    Args:
	        file_path: Path of the log file, one dead-link report per line.
	        search_dir: Directory tree handed to ``find_file`` for the lookup.

	    Returns:
	        The concatenated ``find_file`` results, in log order. An empty list
	        is returned (after printing a message) when ``file_path`` is not a
	        regular file.
	    """
	    results = []
	    if not os.path.isfile(file_path):
	        print(f"{file_path} is not a valid file.")
	        return results

	    # Decode explicitly as UTF-8 so the result does not depend on the locale.
	    with open(file_path, 'r', encoding='utf-8') as log_file:
	        for raw_line in log_file:
	            line = raw_line.strip()
	            if not line:
	                # A blank line is not a log entry; passing it on would only
	                # print a spurious "No valid base file path" report.
	                continue
	            results.extend(find_file(line, search_dir, line))

	    return results

	# Dump every collected record to stdout, one labelled field per line
	def print_results(results):
	    """Print each record as '[LOG] <field> >> <value>' lines followed by a dashed rule.

	    Args:
	        results: Iterable of records exposing the attributes listed in
	            ``field_names`` below.
	    """
	    field_names = (
	        "target_file",
	        "url_line",
	        "url_path",
	        "url_count",
	        "relative_url",
	        "log_error",
	        "origin_url",
	        "sed_str",
	    )
	    for record in results:
	        for field_name in field_names:
	            print(f"[LOG] {field_name} >> {getattr(record, field_name)}")
	        print("----------------------------------------------------------------")

	if __name__ == "__main__":
	    # Exactly two arguments are required: the log file and the directory to search.
	    if len(sys.argv) != 3:
	        # Report the name the script was actually started with, on stderr,
	        # so the hint stays correct if the file is renamed.
	        script_name = os.path.basename(sys.argv[0])
	        print(f"Usage: python {script_name} '<file_with_logs>' <search_dir>", file=sys.stderr)
	        sys.exit(1)

	    file_with_logs = sys.argv[1]  # log file listing the dead links
	    search_dir = sys.argv[2]  # directory tree searched for the link targets

	    # Resolve every log entry, then print the collected records.
	    results = get_deadlink(file_with_logs, search_dir)
	    print_results(results)