| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import re |
| import os |
| from urllib.parse import urlparse |
| |
| |
def process_md_file(file_path):
    """Report relative Markdown links in *file_path* whose target file is missing.

    The file is scanned line by line. Lines inside fenced code blocks
    (delimited by lines starting with three backticks) are ignored. For every
    inline link ``[text](target)`` found elsewhere, the target is skipped when
    it is empty, is a pure in-page anchor (``#section``), has a URL scheme, or
    is an absolute path. Any other target is resolved relative to the directory
    of *file_path*; a ``.md`` suffix is appended when the target has neither a
    ``.md`` nor a ``.mdx`` suffix, and for a ``.md`` path the sibling ``.mdx``
    file is accepted as well.

    Args:
        file_path: Path of the Markdown file to check. It is read as UTF-8.

    Returns:
        None. Each broken link is reported with ``print`` as one line
        containing the link text, the file path and the 1-based line number.
    """
    link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
    code_block_pattern = re.compile(r"^```.*$")

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    base_dir = os.path.dirname(file_path)
    in_code_block = False

    for line_number, line in enumerate(content.splitlines(), start=1):
        # A fence line toggles the code-block state and is never scanned itself.
        if code_block_pattern.match(line):
            in_code_block = not in_code_block
            continue

        if in_code_block:
            continue

        for link in link_pattern.findall(line):
            # Strip the section anchor from the link itself, before any path
            # manipulation: normalising first would let an anchor containing
            # "/../" swallow the file name, and a "#" in the directory part of
            # file_path would truncate the resolved path.
            target = link.split("#", 1)[0]

            # Empty target ("[text]()") or pure in-page anchor ("#section"):
            # there is no file to look up.
            if not target:
                continue

            try:
                has_scheme = bool(urlparse(link).scheme)
            except ValueError:
                # urlparse rejects some malformed URL-like targets (e.g. an
                # unmatched "[" in the network location); such a target is not
                # a local file link, so it is skipped instead of aborting.
                continue

            # Only relative file links are checked.
            if has_scheme or os.path.isabs(link):
                continue

            full_path = os.path.normpath(os.path.join(base_dir, target))

            if not full_path.endswith((".md", ".mdx")):
                full_path += ".md"

            # A link to "page.md" is also satisfied by "page.mdx".
            candidates = [full_path]
            if full_path.endswith(".md"):
                candidates.append(full_path[:-3] + ".mdx")

            if not any(os.path.exists(candidate) for candidate in candidates):
                print(
                    f"Error: File not found for link '{link}' in file '{file_path}:{line_number}'"
                )
| |
| |
def travel(root_path: str):
    """Check every Markdown file found under *root_path*.

    The directory tree is walked recursively with ``os.walk`` and each file
    whose name ends in ``.md`` or ``.mdx`` is passed to ``process_md_file``.

    Args:
        root_path: Directory at which the walk starts.
    """
    markdown_suffixes = (".md", ".mdx")

    for current_dir, _subdirs, filenames in os.walk(root_path):
        for filename in filenames:
            # Anything that is not a Markdown document is ignored.
            if not filename.endswith(markdown_suffixes):
                continue
            process_md_file(os.path.join(current_dir, filename))
| |
| |
if __name__ == "__main__":
    # Check each documentation tree in turn, relative to the current
    # working directory.
    for docs_root in ("docs", "i18n", "versioned_docs"):
        travel(docs_root)