python/tvm_ffi/cpp/extension.py - tvm-ffi - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 """Build and load C++/CUDA sources into a tvm_ffi Module using Ninja."""

 from __future__ import annotations

 import functools
 import hashlib
 import logging
 import os
 import shutil
 import subprocess
 import sys
 from collections.abc import Mapping, Sequence
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Any, Literal

 from tvm_ffi.libinfo import find_dlpack_include_path, find_include_path, find_libtvm_ffi
 from tvm_ffi.module import Module, load_module
 from tvm_ffi.utils import FileLock

 IS_WINDOWS = sys.platform == "win32"
 BACKEND_STR = Literal["cuda", "hip"]

 logger = logging.getLogger(__name__)


 @functools.lru_cache
 def _detect_gpu_backend() -> BACKEND_STR:
     """Auto-detect whether to use CUDA or HIP (ROCm).

     Returns 'hip' if ROCm/HIP is available, 'cuda' otherwise.
     """
     # Check environment variable override first
     backend = os.environ.get("TVM_FFI_GPU_BACKEND", "").lower()
     if backend in ("cuda", "hip"):
         return backend  # type: ignore[return-value]
     try:
         _find_rocm_home()
         return "hip"
     except RuntimeError:
         return "cuda"


 def _resolve_gpu_backend(backend: str | None) -> BACKEND_STR:
     if backend is not None:
         if backend in ("cuda", "hip"):
             return backend  # type: ignore[return-value]
         raise ValueError(f"Invalid backend: {backend}. Supported backends are 'cuda' and 'hip'.")
     return _detect_gpu_backend()


 def _hash_sources(
     cpp_source: str | None,
     cuda_source: str | None,
     cpp_files: Sequence[str] | None,
     cuda_files: Sequence[str] | None,
     functions: Sequence[str] | Mapping[str, str],
     extra_cflags: Sequence[str],
     extra_cuda_cflags: Sequence[str],
     extra_ldflags: Sequence[str],
     extra_include_paths: Sequence[str],
     embed_cubin: Mapping[str, bytes] | None = None,
 ) -> str:
     """Generate a unique hash for the given sources and functions."""
     m = hashlib.sha256()

     def _hash(obj: Any) -> None:
         if obj is None:
             m.update(b"None")
         elif isinstance(obj, str):
             m.update(b"str")
             m.update(obj.encode("utf-8"))
         elif isinstance(obj, bytes):
             m.update(b"bytes")
             m.update(obj)
         elif isinstance(obj, Mapping):
             m.update(b"Mapping")
             for key in sorted(obj.keys()):
                 item = obj[key]
                 _hash(key)
                 _hash(item)
         elif isinstance(obj, Sequence):
             m.update(b"Sequence")
             for item in obj:
                 _hash(item)
         else:
             raise ValueError(f"Unsupported type: {type(obj)}")

     _hash(
         (
             cpp_source,
             cuda_source,
             sorted(cpp_files) if cpp_files is not None else None,
             sorted(cuda_files) if cuda_files is not None else None,
             functions,
             extra_cflags,
             extra_cuda_cflags,
             extra_ldflags,
             extra_include_paths,
             embed_cubin,
         )
     )

     return m.hexdigest()[:16]


 def _maybe_write(path: str, content: str) -> None:
     """Write content to path if it does not already exist with the same content."""
     p = Path(path)
     if p.exists():
         with p.open() as f:
             existing_content = f.read()
         if existing_content == content:
             return
     with p.open("w") as f:
         f.write(content)


 @functools.lru_cache
 def _find_cuda_home() -> str:
     """Find the CUDA install path."""
     # Guess #1
     cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
     if cuda_home is None:
         # Guess #2
         nvcc_path = shutil.which("nvcc")
         if nvcc_path is not None:
             cuda_home = str(Path(nvcc_path).parent.parent)
         else:
             # Guess #3
             if IS_WINDOWS:
                 cuda_root = Path("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA")
                 cuda_homes = list(cuda_root.glob("v*.*"))
                 if len(cuda_homes) == 0:
                     raise RuntimeError(
                         "Could not find CUDA installation. Please set CUDA_HOME environment variable."
                     )
                 cuda_home = str(cuda_homes[0])
             else:
                 cuda_home = "/usr/local/cuda"
             if not Path(cuda_home).exists():
                 raise RuntimeError(
                     "Could not find CUDA installation. Please set CUDA_HOME environment variable."
                 )
     return cuda_home


 def _get_cuda_target() -> str:
     """Get the CUDA target architecture flag."""
     if "TVM_FFI_CUDA_ARCH_LIST" in os.environ:
         arch_list = os.environ["TVM_FFI_CUDA_ARCH_LIST"].split()  # e.g., "8.9 9.0a"
         flags = []
         for arch in arch_list:
             if len(arch.split(".")) != 2:
                 raise ValueError(f"Invalid CUDA architecture: {arch}")
             major, minor = arch.split(".")
             flags.append(f"-gencode=arch=compute_{major}{minor},code=sm_{major}{minor}")
         return " ".join(flags)
     else:
         try:
             status = subprocess.run(
                 args=["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
                 capture_output=True,
                 check=True,
             )
             compute_cap = status.stdout.decode("utf-8").strip().split("\n")[0]
             major, minor = compute_cap.split(".")
             return f"-gencode=arch=compute_{major}{minor},code=sm_{major}{minor}"
         except Exception:
             try:
                 # For old drivers, there is no compute_cap, but we can use the GPU name to determine the architecture.
                 ampere_arch_map = {
                     "A100": ("8", "0"),
                     "A10": ("8", "6"),
                 }
                 status = subprocess.run(
                     args=["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
                     capture_output=True,
                     check=True,
                     text=True,
                 )
                 gpu_name = status.stdout.strip().split("\n")[0]
                 for gpu_key, (major, minor) in ampere_arch_map.items():
                     if gpu_key in gpu_name:
                         return f"-gencode=arch=compute_{major}{minor},code=sm_{major}{minor}"
             except (subprocess.CalledProcessError, FileNotFoundError):
                 pass
             raise RuntimeError(
                 "Could not detect CUDA compute_cap automatically. Please set TVM_FFI_CUDA_ARCH_LIST environment variable."
             )


 @functools.lru_cache
 def _find_rocm_home() -> str:
     """Find the ROCm install path."""
     # Guess #1: check environment variables
     rocm_home = os.environ.get("ROCM_HOME") or os.environ.get("ROCM_PATH")
     if rocm_home is None:
         hipcc_path = shutil.which("hipcc")
         # Guess #2: find hipcc in PATH and resolve ROCm home from it
         if hipcc_path is not None:
             rocm_home = str(Path(hipcc_path).resolve().parent.parent)
             if Path(rocm_home).name == "hip":
                 rocm_home = str(Path(rocm_home).parent)
         else:
             # Guess #3: use default installation path
             rocm_home = "/opt/rocm"
             if not Path(rocm_home).exists():
                 raise RuntimeError(
                     "Could not find ROCm installation. Please set ROCM_HOME environment variable."
                 )
     return rocm_home


 def _get_rocm_target() -> list[str]:
     """Get the ROCm target architecture flags (--offload-arch=gfxXXXX)."""
     if "TVM_FFI_ROCM_ARCH_LIST" in os.environ:
         arch_list = os.environ["TVM_FFI_ROCM_ARCH_LIST"].split()  # e.g., "gfx90a gfx942"
         return [f"--offload-arch={arch}" for arch in arch_list]
     # Try rocm_agent_enumerator
     try:
         agent_enum = str(Path(_find_rocm_home()) / "bin" / "rocm_agent_enumerator")
         if not Path(agent_enum).exists():
             agent_enum = "rocm_agent_enumerator"
         status = subprocess.run(args=[agent_enum], capture_output=True, check=True, text=True)
         archs = list(
             dict.fromkeys(
                 line.strip()
                 for line in status.stdout.strip().split("\n")
                 if line.strip() and line.strip() != "gfx000"
             )
         )
         if archs:
             return [f"--offload-arch={arch}" for arch in archs]
     except (subprocess.CalledProcessError, FileNotFoundError):
         pass
     # Try rocminfo
     try:
         status = subprocess.run(args=["rocminfo"], capture_output=True, check=True, text=True)
         archs = list(
             dict.fromkeys(
                 line.split(":")[-1].strip()
                 for line in status.stdout.split("\n")
                 if "Name:" in line
                 and "gfx" in line.lower()
                 and line.split(":")[-1].strip() != "gfx000"
             )
         )
         if archs:
             return [f"--offload-arch={arch}" for arch in archs]
     except (subprocess.CalledProcessError, FileNotFoundError):
         pass
     raise RuntimeError(
         "Could not detect ROCm GPU architecture automatically. "
         "Please set TVM_FFI_ROCM_ARCH_LIST environment variable (e.g. 'gfx90a gfx942')."
     )


 def _run_command_in_dev_prompt(
     args: list[str],
     cwd: str | os.PathLike[str],
     capture_output: bool,
 ) -> subprocess.CompletedProcess:
     """Locates the Developer Command Prompt and runs a command within its environment."""
     try:
         # Path to vswhere.exe
         vswhere_path = str(
             Path(os.environ.get("ProgramFiles(x86)", "C:\\Program Files (x86)"))
             / "Microsoft Visual Studio"
             / "Installer"
             / "vswhere.exe"
         )

         if not Path(vswhere_path).exists():
             raise FileNotFoundError("vswhere.exe not found.")

         # Find the Visual Studio installation path
         vs_install_path = subprocess.run(
             [
                 vswhere_path,
                 "-latest",
                 "-prerelease",
                 "-products",
                 "*",
                 "-property",
                 "installationPath",
             ],
             capture_output=True,
             text=True,
             check=True,
         ).stdout.strip()

         if not vs_install_path:
             raise FileNotFoundError("No Visual Studio installation found.")

         # Construct the path to the VsDevCmd.bat file
         vsdevcmd_path = str(Path(vs_install_path) / "Common7" / "Tools" / "VsDevCmd.bat")

         if not Path(vsdevcmd_path).exists():
             raise FileNotFoundError(f"VsDevCmd.bat not found at: {vsdevcmd_path}")

         # Use cmd.exe to run the batch file and then your command.
         # The /k flag keeps the command prompt open after the batch file runs.
         # The "&" symbol chains the commands.
         cmd_command = '"{vsdevcmd_path}" -arch=x64 & {command}'.format(
             vsdevcmd_path=vsdevcmd_path, command=" ".join(args)
         )

         # Execute the command in a new shell
         return subprocess.run(
             cmd_command, check=False, cwd=cwd, capture_output=capture_output, shell=True
         )

     except (FileNotFoundError, subprocess.CalledProcessError) as e:
         raise RuntimeError(
             "Failed to run the following command in MSVC developer environment: {}".format(
                 " ".join(args)
             )
         ) from e


 def _generate_ninja_build(  # noqa: PLR0915, PLR0912
     name: str,
     extra_cflags: Sequence[str],
     extra_cuda_cflags: Sequence[str],
     extra_ldflags: Sequence[str],
     extra_include_paths: Sequence[str],
     sources: Sequence[str],
     embed_cubin: Mapping[str, bytes] | None = None,
     backend: str | None = None,
     output: str | None = None,
 ) -> str:
     """Generate the content of build.ninja for building the module."""
     # Determine output format from extension
     if output is not None:
         out_ext = Path(output).suffix.lower()
         object_mode = out_ext in (".o", ".obj")
         output_name = output
     else:
         object_mode = False
         output_name = f"{name}{'.dll' if IS_WINDOWS else '.so'}"
     has_cuda_sources = any(Path(s).suffix.lower() == ".cu" for s in sources)
     with_hip = backend == "hip"
     with_cuda = backend == "cuda"
     with_backend = with_hip or with_cuda or has_cuda_sources
     if has_cuda_sources and not (with_hip or with_cuda):
         # Auto-detect backend from available GPU
         detected = _resolve_gpu_backend(None)
         with_hip = detected == "hip"
         with_cuda = detected == "cuda"

     default_include_paths = [find_include_path(), find_dlpack_include_path()]
     tvm_ffi_lib = Path(find_libtvm_ffi())
     tvm_ffi_lib_path = str(tvm_ffi_lib.parent)
     tvm_ffi_lib_name = tvm_ffi_lib.stem
     if IS_WINDOWS:
         default_cflags = ["/O2", "/MD"]
         default_cxxflags = ["/std:c++17", "/MD", "/EHsc"]
         _win_warnings = [
             "/wd4819",
             "/wd4251",
             "/wd4244",
             "/wd4267",
             "/wd4275",
             "/wd4018",
             "/wd4190",
             "/wd4624",
             "/wd4067",
             "/wd4068",
         ]
         default_cflags += _win_warnings
         default_cxxflags += _win_warnings
         default_cuda_cflags = ["-Xcompiler", "/std:c++17", "/O2"]
         default_ldflags = [
             "/DLL",
             f"/LIBPATH:{tvm_ffi_lib_path}",
             f"{tvm_ffi_lib_name}.lib",
         ]
     else:
         default_cflags = ["-fPIC", "-O2"]
         default_cxxflags = ["-std=c++17", "-fPIC", "-O2"]
         default_cuda_cflags = ["-std=c++17", "-O2"]
         default_ldflags = ["-shared", f"-L{tvm_ffi_lib_path}", "-ltvm_ffi"]

         if with_hip:
             rocm_home = _find_rocm_home()
             default_cuda_cflags += ["-fPIC", "-D__HIP_PLATFORM_AMD__=1", "-fno-gpu-rdc"]
             default_cuda_cflags += _get_rocm_target()
             default_include_paths.append(str(Path(rocm_home) / "include"))
             default_ldflags += [
                 f"-L{Path(rocm_home) / 'lib'!s}",
                 "-lamdhip64",
             ]
         if with_cuda:
             default_cuda_cflags = ["-Xcompiler", "-fPIC", *default_cuda_cflags]
             default_cuda_cflags += [_get_cuda_target()]
             default_ldflags += [
                 "-L{}".format(str(Path(_find_cuda_home()) / "lib64")),
                 "-lcudart",  # cuda runtime library
             ]

     extra_cflags_list = [flag.strip() for flag in extra_cflags]
     cflags = default_cflags + extra_cflags_list
     cxxflags = default_cxxflags + extra_cflags_list
     cuda_cflags = default_cuda_cflags + [flag.strip() for flag in extra_cuda_cflags]
     ldflags = default_ldflags + [flag.strip() for flag in extra_ldflags]
     include_paths = default_include_paths + [
         str(Path(path).resolve()) for path in extra_include_paths
     ]

     # append include paths
     for path in include_paths:
         inc = "-I{}".format(path.replace(":", "$:"))
         cflags.append(inc)
         cxxflags.append(inc)
         cuda_cflags.append(inc)

     # Classify sources by extension to determine which rules are needed
     with_c = any(Path(s).suffix.lower() == ".c" for s in sources)
     ninja: list[str] = []
     ninja.append("ninja_required_version = 1.3")
     ninja.append("cxx = {}".format(os.environ.get("CXX", "cl" if IS_WINDOWS else "c++")))
     ninja.append("cxxflags = {}".format(" ".join(cxxflags)))
     if with_c:
         ninja.append("cc = {}".format(os.environ.get("CC", "cl" if IS_WINDOWS else "cc")))
         ninja.append("cflags = {}".format(" ".join(cflags)))
     if with_backend:
         if with_hip:
             ninja.append("nvcc = {}".format(str(Path(_find_rocm_home()) / "bin" / "hipcc")))
         if with_cuda:
             ninja.append("nvcc = {}".format(str(Path(_find_cuda_home()) / "bin" / "nvcc")))
         ninja.append("cuda_cflags = {}".format(" ".join(cuda_cflags)))
     ninja.append("ldflags = {}".format(" ".join(ldflags)))

     # rules
     ninja.append("")
     ninja.append("rule compile")
     if IS_WINDOWS:
         ninja.append("  command = $cxx /showIncludes $cxxflags -c $in /Fo$out")
         ninja.append("  deps = msvc")
     else:
         ninja.append("  depfile = $out.d")
         ninja.append("  deps = gcc")
         ninja.append("  command = $cxx -MMD -MF $out.d $cxxflags -c $in -o $out")
     ninja.append("")

     if with_c:
         ninja.append("rule c_compile")
         if IS_WINDOWS:
             ninja.append("  command = $cc /showIncludes $cflags -c $in /Fo$out")
             ninja.append("  deps = msvc")
         else:
             ninja.append("  depfile = $out.d")
             ninja.append("  deps = gcc")
             ninja.append("  command = $cc -MMD -MF $out.d $cflags -c $in -o $out")
         ninja.append("")

     if with_backend:
         ninja.append("rule compile_cuda")
         ninja.append("  depfile = $out.d")
         ninja.append("  deps = gcc")
         if with_hip:
             ninja.append("  command = $nvcc $cuda_cflags -c $in -o $out")
         else:
             ninja.append(
                 "  command = $nvcc  --generate-dependencies-with-compile --dependency-output $out.d $cuda_cflags -c $in -o $out"
             )
         ninja.append("")

     # Add rules for object merging and cubin embedding (Unix only)
     if not IS_WINDOWS:
         ninja.append("rule merge_objects")
         ninja.append("  command = ld -r -o $out $in")
         ninja.append("")

         if embed_cubin:
             ninja.append("rule embed_cubin")
             ninja.append(
                 f"  command = {sys.executable} -m tvm_ffi.utils.embed_cubin --output-obj $out --input-obj $in --cubin $cubin --name $name"
             )
             ninja.append("")

     if not object_mode:
         ninja.append("rule link")
         if IS_WINDOWS:
             ninja.append("  command = $cxx $in /link $ldflags /out:$out")
         else:
             ninja.append("  command = $cxx $in $ldflags -o $out")
         ninja.append("")

     # build targets — dispatch by file extension
     obj_files: list[str] = []
     c_idx = cpp_idx = cuda_idx = 0
     for src in sorted(sources):
         ext = Path(src).suffix.lower()
         escaped = src.replace(":", "$:")
         if ext in (".o", ".obj"):
             # Pre-compiled object file: pass directly to linker
             obj_files.append(escaped)
         elif ext == ".c":
             obj_name = f"c_{c_idx}.o"
             ninja.append(f"build {obj_name}: c_compile {escaped}")
             obj_files.append(obj_name)
             c_idx += 1
         elif ext == ".cu":
             obj_name = f"cuda_{cuda_idx}.o"
             ninja.append(f"build {obj_name}: compile_cuda {escaped}")
             obj_files.append(obj_name)
             cuda_idx += 1
         else:
             # .cc, .cpp, .cxx — default to C++ compilation
             obj_name = f"cpp_{cpp_idx}.o"
             ninja.append(f"build {obj_name}: compile {escaped}")
             obj_files.append(obj_name)
             cpp_idx += 1

     if object_mode:
         # Object-only output: merge all object files into the target.
         if not IS_WINDOWS:
             ninja.append(f"build {output_name}: merge_objects {' '.join(obj_files)}")
             ninja.append("")
             ninja.append(f"default {output_name}")
         else:
             # Windows: no ld -r available; default to the first intermediate object
             ninja.append(f"default {obj_files[0]}")
         ninja.append("")
         return "\n".join(ninja)

     # For Unix systems with embed_cubin, use a 3-step process:
     # 1. Merge all object files into a unified object file
     # 2. Embed each cubin into the unified object file (chain them)
     # 3. Link the final object file into a shared library
     if not IS_WINDOWS and embed_cubin:
         # Step 1: Merge object files into unified.o
         unified_obj = "unified.o"
         obj_files_str = " ".join(obj_files)
         ninja.append(f"build {unified_obj}: merge_objects {obj_files_str}")
         ninja.append("")

         # Step 2: Chain embed_cubin operations for each cubin
         current_obj = unified_obj
         for cubin_name in sorted(embed_cubin.keys()):
             # Create next object file name
             next_obj = f"unified_with_{cubin_name}.o"
             cubin_file = f"{cubin_name}.cubin"

             # Add ninja build rule
             ninja.append(f"build {next_obj}: embed_cubin {current_obj}")
             ninja.append(f"  cubin = {cubin_file}")
             ninja.append(f"  name = {cubin_name}")
             ninja.append("")

             current_obj = next_obj

         # Step 3: Link the final object file
         ninja.append(f"build {output_name}: link {current_obj}")
         ninja.append("")
     else:
         # Directly link object files (for Windows or no cubin embedding)
         link_files_str = " ".join(obj_files)
         ninja.append(f"build {output_name}: link {link_files_str}")
         ninja.append("")

     # default target
     ninja.append(f"default {output_name}")
     ninja.append("")
     return "\n".join(ninja)


 def build_ninja(build_dir: str) -> None:
     """Build the module in the given build directory using ninja."""
     command = ["ninja", "-v"]
     num_workers = os.environ.get("MAX_JOBS", None)
     if num_workers is not None:
         command += ["-j", num_workers]
     if IS_WINDOWS:
         status = _run_command_in_dev_prompt(args=command, cwd=build_dir, capture_output=True)
     else:
         status = subprocess.run(check=False, args=command, cwd=build_dir, capture_output=True)
     encoding = "oem" if IS_WINDOWS else "utf-8"
     if status.returncode != 0:
         msg = [f"ninja exited with status {status.returncode}"]
         if status.stdout:
             msg.append(f"stdout:\n{status.stdout.decode(encoding)}")
         if status.stderr:
             msg.append(f"stderr:\n{status.stderr.decode(encoding)}")

         raise RuntimeError("\n".join(msg))

     LOG_BUILD = os.environ.get("TVM_FFI_CPP_EXTENSION_LOG_BUILD", "0")
     if LOG_BUILD in ("1", "stdout"):
         logger.info("ninja build stdout:\n%s", status.stdout.decode(encoding))
     if LOG_BUILD in ("1", "stderr"):
         logger.info("ninja build stderr:\n%s", status.stderr.decode(encoding))


 # Translation table for escaping C++ string literals
 _CPP_ESCAPE_TABLE = str.maketrans(
     {
         "\\": "\\\\",
         '"': '\\"',
         "\n": "\\n",
         "\r": "\\r",
         "\t": "\\t",
     }
 )


 def _escape_cpp_string_literal(s: str) -> str:
     """Escape special characters for C++ string literals."""
     return s.translate(_CPP_ESCAPE_TABLE)


 def _decorate_with_tvm_ffi(source: str, functions: Mapping[str, str]) -> str:
     """Decorate the given source code with TVM FFI export macros."""
     sources = [
         "#include <tvm/ffi/container/tensor.h>",
         "#include <tvm/ffi/dtype.h>",
         "#include <tvm/ffi/error.h>",
         "#include <tvm/ffi/extra/c_env_api.h>",
         "#include <tvm/ffi/function.h>",
         "",
         source,
     ]

     for func_name, func_doc in functions.items():
         sources.append(f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({func_name}, {func_name});")

         if func_doc:
             # Escape the docstring for C++ string literal
             escaped_doc = _escape_cpp_string_literal(func_doc)
             sources.append(f'TVM_FFI_DLL_EXPORT_TYPED_FUNC_DOC({func_name}, "{escaped_doc}");')

     sources.append("")

     return "\n".join(sources)


 def _str_seq2list(seq: Sequence[str] | str | None) -> list[str]:
     if seq is None:
         return []
     elif isinstance(seq, str):
         return [seq]
     else:
         return list(seq)


 def _build_impl(  # noqa: PLR0913
     name: str,
     sources: Sequence[str] | str | None,
     extra_cflags: Sequence[str] | None,
     extra_cuda_cflags: Sequence[str] | None,
     extra_ldflags: Sequence[str] | None,
     extra_include_paths: Sequence[str] | None,
     build_directory: str | None,
     need_lock: bool = True,
     embed_cubin: Mapping[str, bytes] | None = None,
     backend: str | None = None,
     output: str | None = None,
 ) -> str:
     """Real implementation of build function."""
     # need to resolve the path to make it unique
     source_path_list = [str(Path(p).resolve()) for p in _str_seq2list(sources)]
     assert source_path_list, "sources must be provided."

     has_cuda = any(Path(p).suffix.lower() == ".cu" for p in source_path_list)
     resolved_backend = _resolve_gpu_backend(backend) if has_cuda else None
     extra_ldflags_list = list(extra_ldflags) if extra_ldflags is not None else []
     extra_cflags_list = list(extra_cflags) if extra_cflags is not None else []
     extra_cuda_cflags_list = list(extra_cuda_cflags) if extra_cuda_cflags is not None else []
     extra_include_paths_list = list(extra_include_paths) if extra_include_paths is not None else []

     build_dir: Path
     if build_directory is None:
         cache_dir = os.environ.get("TVM_FFI_CACHE_DIR", str(Path("~/.cache/tvm-ffi").expanduser()))
         source_hash: str = _hash_sources(
             None,
             None,
             source_path_list,
             None,
             {},
             extra_cflags_list,
             extra_cuda_cflags_list,
             extra_ldflags_list,
             extra_include_paths_list,
             embed_cubin,
         )
         build_dir = Path(cache_dir).expanduser() / f"{name}_{source_hash}"
     else:
         build_dir = Path(build_directory).resolve()
     build_dir.mkdir(parents=True, exist_ok=True)

     # CUBIN embedding is only supported on Unix systems
     if embed_cubin and IS_WINDOWS:
         raise NotImplementedError("CUBIN embedding is not yet supported on Windows")

     # Write CUBIN files to build directory if needed (for Unix systems)
     # These will be embedded using the embed_cubin utility during ninja build
     if embed_cubin:
         for cubin_name, cubin_bytes in embed_cubin.items():
             cubin_path = build_dir / f"{cubin_name}.cubin"
             cubin_path.write_bytes(cubin_bytes)

     # generate build.ninja
     ninja_source = _generate_ninja_build(
         name=name,
         extra_cflags=extra_cflags_list,
         extra_cuda_cflags=extra_cuda_cflags_list,
         extra_ldflags=extra_ldflags_list,
         extra_include_paths=extra_include_paths_list,
         sources=source_path_list,
         embed_cubin=embed_cubin,
         backend=resolved_backend,
         output=output,
     )

     # may not hold lock when build_directory is specified, prevent deadlock
     with FileLock(str(build_dir / "lock")) if need_lock else nullcontext():
         # write build.ninja if it does not already exist
         _maybe_write(str(build_dir / "build.ninja"), ninja_source)
         # build the module
         build_ninja(str(build_dir))
         # Determine the output filename (mirrors _generate_ninja_build logic)
         if output is not None:
             out_ext = Path(output).suffix.lower()
             object_mode = out_ext in (".o", ".obj")
             output_name = Path(output).name
         else:
             object_mode = False
             output_name = f"{name}{'.dll' if IS_WINDOWS else '.so'}"
         if object_mode and IS_WINDOWS:
             # Windows has no ld -r; the actual target is the first intermediate object.
             # The name must match _generate_ninja_build: c_0.o / cpp_0.o / cuda_0.o.
             first_ext = Path(sorted(source_path_list)[0]).suffix.lower() if source_path_list else ""
             if first_ext == ".c":
                 obj_name = "c_0.o"
             elif first_ext == ".cu":
                 obj_name = "cuda_0.o"
             else:
                 obj_name = "cpp_0.o"
             return str((build_dir / obj_name).resolve())
         return str((build_dir / output_name).resolve())


 def build_inline(  # noqa: PLR0913
     name: str,
     *,
     cpp_sources: Sequence[str] | str | None = None,
     cuda_sources: Sequence[str] | str | None = None,
     functions: Mapping[str, str] | Sequence[str] | str | None = None,
     extra_cflags: Sequence[str] | None = None,
     extra_cuda_cflags: Sequence[str] | None = None,
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     embed_cubin: Mapping[str, bytes] | None = None,
     backend: str | None = None,
     output: str | None = None,
 ) -> str:
     """Compile and build a C++/CUDA module from inline source code.

     This function compiles the given C++ and/or CUDA source code into a shared library or object file.
     Both ``cpp_sources`` and ``cuda_sources`` are compiled to an object file. When ``output`` is
     ``None`` (the default) or has a shared-library extension (``.so``, ``.dll``), object files are
     linked into a shared library. When ``output`` has an object-file extension (``.o``, ``.obj``),
     linking is skipped and the path to the object file is returned directly.

     The ``functions`` parameter is used to specify which functions in the source code should be exported to the tvm ffi
     module. It can be a mapping, a sequence, or a single string. When a mapping is given, the keys are the names of the
     exported functions, and the values are docstrings for the functions. When a sequence of string is given, they are
     the function names needed to be exported, and the docstrings are set to empty strings. A single function name can
     also be given as a string, indicating that only one function is to be exported.

     Extra compiler and linker flags can be provided via the ``extra_cflags``, ``extra_cuda_cflags``, and ``extra_ldflags``
     parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional
     flags for your specific use case.

     The include dir of tvm ffi and dlpack are used by default for the compiler to find the headers. Thus, you can
     include any header from tvm ffi in your source code. You can also provide additional include paths via the
     ``extra_include_paths`` parameter and include custom headers in your source code.

     The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter
     is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used.
     The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified,
     the default cache directory is ``~/.cache/tvm-ffi``.

     Parameters
     ----------
     name
         The name of the tvm ffi module.
     cpp_sources
         The C++ source code. It can be a list of sources or a single source.
     cuda_sources
         The CUDA source code. It can be a list of sources or a single source.
     functions
         The functions in cpp_sources or cuda_source that will be exported to the tvm ffi module. When a mapping is
         given, the keys are the names of the exported functions, and the values are docstrings for the functions
         (use an empty string to skip documentation for specific functions). When a sequence or a single string is given, they are
         the functions needed to be exported, and the docstrings are set to empty strings. A single function name can
         also be given as a string. When cpp_sources is given, the functions must be declared (not necessarily defined)
         in the cpp_sources. When cpp_sources is not given, the functions must be defined in the cuda_sources. If not
         specified, no function will be exported.
     extra_cflags
         The extra compiler flags for C++ compilation.
         The default flags are:

         - On Linux/macOS: ['-std=c++17', '-fPIC', '-O2']
         - On Windows: ['/std:c++17', '/O2']

     extra_cuda_cflags
         The extra compiler flags for CUDA compilation.

     extra_ldflags
         The extra linker flags.
         The default flags are:

         - On Linux/macOS: ['-shared']
         - On Windows: ['/DLL']

     extra_include_paths
         The extra include paths.

     build_directory
         The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the
         cache directory is ``~/.cache/tvm-ffi``. You can also set the ``TVM_FFI_CACHE_DIR`` environment variable to
         specify the cache directory.

     embed_cubin: Mapping[str, bytes], optional
         A mapping from CUBIN module names to CUBIN binary data. TVM-FFI provides a macro `TVM_FFI_EMBED_CUBIN(name)` to embed
         CUBIN data into the compiled shared library. The keys should match the names used in `TVM_FFI_EMBED_CUBIN(name)` calls
         in the C++ source code. The values are the CUBIN binary data bytes. The embedded CUBIN kernels can be accessed by
         the macro `TVM_FFI_EMBED_CUBIN_GET_KERNEL(name, kernel_name)` defined in the `tvm/ffi/extra/cuda/cubin_launcher.h` header.
         See the `examples/cubin_launcher` directory for examples how to use cubin launcher to launch CUBIN kernels in TVM-FFI.

     backend
         The GPU backend to use. It can be "cuda" or "hip".
         If not specified, the backend will be automatically determined based on the available GPU and the provided source code.

     output
         Output filename that determines the build type from its extension. When ``None``
         (the default), builds a shared library (``.so`` on Unix, ``.dll`` on Windows).
         Use an object-file extension (e.g., ``"hello.o"``) to skip linking and produce
         a relocatable object file. The file is placed in the build directory.

     Returns
     -------
     path: str
         The path to the built shared library or object file.

     Example
     -------

     .. code-block:: python

         import torch
         from tvm_ffi import Module
         import tvm_ffi.cpp

         # define the cpp source code
         cpp_source = '''
              void add_one_cpu(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
                // implementation of a library function
                TVM_FFI_ICHECK(x.ndim() == 1) << "x must be a 1D tensor";
                DLDataType f32_dtype{kDLFloat, 32, 1};
                TVM_FFI_ICHECK(x.dtype() == f32_dtype) << "x must be a float tensor";
                TVM_FFI_ICHECK(y.ndim() == 1) << "y must be a 1D tensor";
                TVM_FFI_ICHECK(y.dtype() == f32_dtype) << "y must be a float tensor";
                TVM_FFI_ICHECK(x.size(0) == y.size(0)) << "x and y must have the same shape";
                for (int i = 0; i < x.size(0); ++i) {
                  static_cast<float*>(y.data_ptr())[i] = static_cast<float*>(x.data_ptr())[i] + 1;
                }
              }
         '''

         # compile the cpp source code and load the module
         lib_path: str = tvm_ffi.cpp.build_inline(
             name="hello",
             cpp_sources=cpp_source,
             functions="add_one_cpu",
         )

         # load the module
         mod: Module = tvm_ffi.load_module(lib_path)

         # use the function from the loaded module to perform
         x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
         y = torch.empty_like(x)
         mod.add_one_cpu(x, y)
         torch.testing.assert_close(x + 1, y)

     """
     cpp_source_list = _str_seq2list(cpp_sources)
     cpp_source = "\n".join(cpp_source_list)
     with_cpp = bool(cpp_source_list)
     del cpp_source_list

     cuda_source_list = _str_seq2list(cuda_sources)
     cuda_source = "\n".join(cuda_source_list)
     with_backend = bool(cuda_source_list)
     del cuda_source_list

     extra_ldflags_list = list(extra_ldflags) if extra_ldflags is not None else []
     extra_cflags_list = list(extra_cflags) if extra_cflags is not None else []
     extra_cuda_cflags_list = list(extra_cuda_cflags) if extra_cuda_cflags is not None else []
     extra_include_paths_list = list(extra_include_paths) if extra_include_paths is not None else []

     # add function registration code to sources
     if functions is None:
         function_map: dict[str, str] = {}
     elif isinstance(functions, str):
         function_map = {functions: ""}
     elif isinstance(functions, Mapping):
         function_map = dict(functions)
     else:
         function_map = {name: "" for name in functions}

     if with_cpp:
         cpp_source = _decorate_with_tvm_ffi(cpp_source, function_map)
         cuda_source = _decorate_with_tvm_ffi(cuda_source, {})
     else:
         cpp_source = _decorate_with_tvm_ffi(cpp_source, {})
         cuda_source = _decorate_with_tvm_ffi(cuda_source, function_map)
     # determine the cache dir for the built module
     build_dir: Path
     if build_directory is None:
         cache_dir = os.environ.get("TVM_FFI_CACHE_DIR", str(Path("~/.cache/tvm-ffi").expanduser()))
         source_hash: str = _hash_sources(
             cpp_source,
             cuda_source,
             None,
             None,
             function_map,
             extra_cflags_list,
             extra_cuda_cflags_list,
             extra_ldflags_list,
             extra_include_paths_list,
             embed_cubin,
         )
         build_dir = Path(cache_dir).expanduser() / f"{name}_{source_hash}"
     else:
         build_dir = Path(build_directory).resolve()
     build_dir.mkdir(parents=True, exist_ok=True)

     cpp_file = str((build_dir / "main.cpp").resolve())
     cuda_file = str((build_dir / "cuda.cu").resolve())

     with FileLock(str(build_dir / "lock")):
         # write source files if they do not already exist
         _maybe_write(cpp_file, cpp_source)
         if with_backend:
             _maybe_write(cuda_file, cuda_source)

         src_files = []
         if with_cpp:
             src_files.append(cpp_file)
         if with_backend:
             src_files.append(cuda_file)
         return _build_impl(
             name=name,
             sources=src_files,
             extra_cflags=extra_cflags_list,
             extra_cuda_cflags=extra_cuda_cflags_list,
             extra_ldflags=extra_ldflags_list,
             extra_include_paths=extra_include_paths_list,
             build_directory=str(build_dir),
             need_lock=False,  # already hold the lock
             embed_cubin=embed_cubin,
             backend=backend,
             output=output,
         )


 def load_inline(  # noqa: PLR0913
     name: str,
     *,
     cpp_sources: Sequence[str] | str | None = None,
     cuda_sources: Sequence[str] | str | None = None,
     functions: Mapping[str, str] | Sequence[str] | str | None = None,
     extra_cflags: Sequence[str] | None = None,
     extra_cuda_cflags: Sequence[str] | None = None,
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     embed_cubin: Mapping[str, bytes] | None = None,
     keep_module_alive: bool = True,
     backend: str | None = None,
 ) -> Module:
     """Compile, build and load a C++/CUDA module from inline source code.

     This function compiles the given C++ and/or CUDA source code into a shared library. Both ``cpp_sources`` and
     ``cuda_sources`` are compiled to an object file, and then linked together into a shared library. It's possible to only
     provide cpp_sources or cuda_sources.

     The ``functions`` parameter is used to specify which functions in the source code should be exported to the tvm ffi
     module. It can be a mapping, a sequence, or a single string. When a mapping is given, the keys are the names of the
     exported functions, and the values are docstrings for the functions. When a sequence of string is given, they are
     the function names needed to be exported, and the docstrings are set to empty strings. A single function name can
     also be given as a string, indicating that only one function is to be exported.

     Extra compiler and linker flags can be provided via the ``extra_cflags``, ``extra_cuda_cflags``, and ``extra_ldflags``
     parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional
     flags for your specific use case.

     The include dir of tvm ffi and dlpack are used by default for the compiler to find the headers. Thus, you can
     include any header from tvm ffi in your source code. You can also provide additional include paths via the
     ``extra_include_paths`` parameter and include custom headers in your source code.

     The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter
     is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used.
     The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified,
     the default cache directory is ``~/.cache/tvm-ffi``.

     Parameters
     ----------
     name
         The name of the tvm ffi module.
     cpp_sources
         The C++ source code. It can be a list of sources or a single source.
     cuda_sources
         The CUDA source code. It can be a list of sources or a single source.
     functions
         The functions in cpp_sources or cuda_source that will be exported to the tvm ffi module. When a mapping is
         given, the keys are the names of the exported functions, and the values are docstrings for the functions
         (use an empty string to skip documentation for specific functions). When a sequence or a single string is given, they are
         the functions needed to be exported, and the docstrings are set to empty strings. A single function name can
         also be given as a string. When cpp_sources is given, the functions must be declared (not necessarily defined)
         in the cpp_sources. When cpp_sources is not given, the functions must be defined in the cuda_sources. If not
         specified, no function will be exported.
     extra_cflags
         The extra compiler flags for C++ compilation.
         The default flags are:

         - On Linux/macOS: ['-std=c++17', '-fPIC', '-O2']
         - On Windows: ['/std:c++17', '/O2']

     extra_cuda_cflags
         The extra compiler flags for CUDA compilation.

     extra_ldflags
         The extra linker flags.
         The default flags are:

         - On Linux/macOS: ['-shared']
         - On Windows: ['/DLL']

     extra_include_paths
         The extra include paths.

     build_directory
         The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the
         cache directory is ``~/.cache/tvm-ffi``. You can also set the ``TVM_FFI_CACHE_DIR`` environment variable to
         specify the cache directory.

     embed_cubin
         A mapping from CUBIN module names to CUBIN binary data. When provided, the CUBIN data will be embedded
         into the compiled shared library using objcopy, making it accessible via the TVM_FFI_EMBED_CUBIN macro.
         The keys should match the names used in TVM_FFI_EMBED_CUBIN calls in the C++ source code.

     keep_module_alive
         Whether to keep the module alive. If True, the module will be kept alive
         for the duration of the program until libtvm_ffi.so is unloaded.

     backend
         The GPU backend to use. It can be "cuda" or "hip".
         If not specified, the backend will be automatically determined based on the available GPU and the provided source code.

     Returns
     -------
     mod: Module
         The loaded tvm ffi module.

     See Also
     --------
     :py:func:`tvm_ffi.load_module`

     Example
     -------

     .. code-block:: python

         import torch
         from tvm_ffi import Module
         import tvm_ffi.cpp

         # define the cpp source code
         cpp_source = '''
              void add_one_cpu(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
                // implementation of a library function
                TVM_FFI_ICHECK(x.ndim() == 1) << "x must be a 1D tensor";
                DLDataType f32_dtype{kDLFloat, 32, 1};
                TVM_FFI_ICHECK(x.dtype() == f32_dtype) << "x must be a float tensor";
                TVM_FFI_ICHECK(y.ndim() == 1) << "y must be a 1D tensor";
                TVM_FFI_ICHECK(y.dtype() == f32_dtype) << "y must be a float tensor";
                TVM_FFI_ICHECK(x.size(0) == y.size(0)) << "x and y must have the same shape";
                for (int i = 0; i < x.size(0); ++i) {
                  static_cast<float*>(y.data_ptr())[i] = static_cast<float*>(x.data_ptr())[i] + 1;
                }
              }
         '''

         # compile the cpp source code and load the module
         mod: Module = tvm_ffi.cpp.load_inline(
             name="hello",
             cpp_sources=cpp_source,
             functions="add_one_cpu",
         )

         # use the function from the loaded module to perform
         x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
         y = torch.empty_like(x)
         mod.add_one_cpu(x, y)
         torch.testing.assert_close(x + 1, y)

     """
     return load_module(
         build_inline(
             name=name,
             cpp_sources=cpp_sources,
             cuda_sources=cuda_sources,
             functions=functions,
             extra_cflags=extra_cflags,
             extra_cuda_cflags=extra_cuda_cflags,
             extra_ldflags=extra_ldflags,
             extra_include_paths=extra_include_paths,
             build_directory=build_directory,
             embed_cubin=embed_cubin,
             backend=backend,
         ),
         keep_module_alive=keep_module_alive,
     )


 def build(  # noqa: PLR0913
     name: str,
     *,
     sources: Sequence[str] | str | None = None,
     cpp_files: Sequence[str] | str | None = None,
     cuda_files: Sequence[str] | str | None = None,
     extra_cflags: Sequence[str] | None = None,
     extra_cuda_cflags: Sequence[str] | None = None,
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     backend: str | None = None,
     output: str | None = None,
 ) -> str:
     """Compile and build a C/C++/CUDA module from source files.

     This function compiles the given C, C++, and/or CUDA source files into a shared library or
     object file. The compiler is selected automatically based on file extension:

     - ``.c`` — compiled with the C compiler (``$CC``)
     - ``.cc``, ``.cpp``, ``.cxx`` — compiled with the C++ compiler (``$CXX``)
     - ``.o``, ``.obj`` — pre-compiled objects, passed directly to the linker

     When ``output`` is ``None`` (the default) or has a shared-library extension, object files are
     linked into a shared library. When ``output`` has an object-file extension (``.o``, ``.obj``),
     linking is skipped and the path to the object file is returned.

     Note that this function does not automatically export functions to the tvm ffi module. You need to
     manually use the TVM FFI export macros (e.g., ``TVM_FFI_DLL_EXPORT_TYPED_FUNC``) in your source files to export
     functions. This gives you more control over which functions are exported and how they are exported.

     Extra compiler and linker flags can be provided via the ``extra_cflags``, ``extra_cuda_cflags``, and ``extra_ldflags``
     parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional
     flags for your specific use case.

     The include dir of tvm ffi and dlpack are used by default for the compiler to find the headers. Thus, you can
     include any header from tvm ffi in your source files. You can also provide additional include paths via the
     ``extra_include_paths`` parameter and include custom headers in your source code.

     The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter
     is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used.
     The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified,
     the default cache directory is ``~/.cache/tvm-ffi``.

     The C compiler is controlled by the ``$CC`` environment variable (default: ``cc`` on Unix, ``cl`` on Windows).
     The C++ compiler is controlled by the ``$CXX`` environment variable (default: ``c++`` on Unix, ``cl`` on Windows).

     Parameters
     ----------
     name
         The name of the tvm ffi module.
     sources
         Source files to compile. The compiler is auto-detected from the file extension:

         - ``.c`` → C compiler (``$CC``)
         - ``.cc``, ``.cpp``, ``.cxx`` → C++ compiler (``$CXX``)
         - ``.cu`` → CUDA/HIP compiler (``nvcc`` or ``hipcc``)
         - ``.o``, ``.obj`` → pre-compiled objects, passed directly to the linker

         It can be a list of file paths or a single file path.
     cpp_files
         Alias for ``sources``, kept for backward compatibility.
     cuda_files
         Alias for ``sources``, kept for backward compatibility.
     extra_cflags
         Extra compiler flags applied to both C and C++ compilation.
         The C++ default flags are:

         - On Linux/macOS: ['-std=c++17', '-fPIC', '-O2']
         - On Windows: ['/std:c++17', '/MD', '/O2']

         The C default flags omit ``-std=c++17`` and ``/EHsc``.

     extra_cuda_cflags
         The extra compiler flags for CUDA compilation.
         The default flags are:

         - ['-Xcompiler', '-fPIC', '-std=c++17', '-O2'] (Linux/macOS)
         - ['-Xcompiler', '/std:c++17', '/O2'] (Windows)

     extra_ldflags
         The extra linker flags.
         The default flags are:

         - On Linux/macOS: ['-shared', '-L<tvm_ffi_lib_path>', '-ltvm_ffi']
         - On Windows: ['/DLL', '/LIBPATH:<tvm_ffi_lib_path>', '<tvm_ffi_lib_name>.lib']

     extra_include_paths
         The extra include paths for header files. Both absolute and relative paths are supported.

     build_directory
         The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the
         cache directory is ``~/.cache/tvm-ffi``. You can also set the ``TVM_FFI_CACHE_DIR`` environment variable to
         specify the cache directory.

     backend
         The GPU backend to use. It can be "cuda" or "hip".
         If not specified, the backend will be automatically determined based on the available GPU and the provided source code.

     output
         Output filename that determines the build type from its extension. When ``None``
         (the default), builds a shared library (``.so`` on Unix, ``.dll`` on Windows).
         Use an object-file extension (e.g., ``"my_ops.o"``) to skip linking and produce
         a relocatable object file. The file is placed in the build directory.

     Returns
     -------
     path: str
         The path to the built shared library or object file.

     Example
     -------

     .. code-block:: python

         import torch
         from tvm_ffi import Module
         import tvm_ffi.cpp

         # Assume we have a C++ source file "my_ops.cpp" with the following content:
         # ```cpp
         # #include <tvm/ffi/container/tensor.h>
         # #include <tvm/ffi/dtype.h>
         # #include <tvm/ffi/error.h>
         # #include <tvm/ffi/extra/c_env_api.h>
         # #include <tvm/ffi/function.h>
         #
         # void add_one_cpu(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
         #   TVM_FFI_ICHECK(x.ndim() == 1) << "x must be a 1D tensor";
         #   DLDataType f32_dtype{kDLFloat, 32, 1};
         #   TVM_FFI_ICHECK(x.dtype() == f32_dtype) << "x must be a float tensor";
         #   TVM_FFI_ICHECK(y.ndim() == 1) << "y must be a 1D tensor";
         #   TVM_FFI_ICHECK(y.dtype() == f32_dtype) << "y must be a float tensor";
         #   TVM_FFI_ICHECK(x.size(0) == y.size(0)) << "x and y must have the same shape";
         #   for (int i = 0; i < x.size(0); ++i) {
         #     static_cast<float*>(y.data_ptr())[i] = static_cast<float*>(x.data_ptr())[i] + 1;
         #   }
         # }
         #
         # TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cpu, add_one_cpu);
         # ```

         # compile the cpp source file and get the library path
         lib_path: str = tvm_ffi.cpp.build(
             name="my_ops",
             sources="my_ops.cpp",
         )

         # load the module
         mod: Module = tvm_ffi.load_module(lib_path)

         # use the function from the loaded module
         x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
         y = torch.empty_like(x)
         mod.add_one_cpu(x, y)
         torch.testing.assert_close(x + 1, y)

     """
     # Merge sources, cpp_files, and cuda_files (backward compat aliases)
     merged = _str_seq2list(sources) + _str_seq2list(cpp_files) + _str_seq2list(cuda_files)
     return _build_impl(
         name=name,
         sources=merged or None,
         extra_cflags=extra_cflags,
         extra_cuda_cflags=extra_cuda_cflags,
         extra_ldflags=extra_ldflags,
         extra_include_paths=extra_include_paths,
         build_directory=build_directory,
         need_lock=True,
         backend=backend,
         output=output,
     )


 def load(  # noqa: PLR0913
     name: str,
     *,
     sources: Sequence[str] | str | None = None,
     cpp_files: Sequence[str] | str | None = None,
     cuda_files: Sequence[str] | str | None = None,
     extra_cflags: Sequence[str] | None = None,
     extra_cuda_cflags: Sequence[str] | None = None,
     extra_ldflags: Sequence[str] | None = None,
     extra_include_paths: Sequence[str] | None = None,
     build_directory: str | None = None,
     keep_module_alive: bool = True,
     backend: str | None = None,
 ) -> Module:
     """Compile, build and load a C/C++/CUDA module from source files.

     This function compiles the given source files into a shared library and loads it as a tvm ffi
     module. The compiler is selected automatically based on file extension.

     Note that this function does not automatically export functions to the tvm ffi module. You need to
     manually use the TVM FFI export macros (e.g., :c:macro:`TVM_FFI_DLL_EXPORT_TYPED_FUNC`) in your source files to export
     functions. This gives you more control over which functions are exported and how they are exported.

     Extra compiler and linker flags can be provided via the ``extra_cflags``, ``extra_cuda_cflags``, and ``extra_ldflags``
     parameters. The default flags are generally sufficient for most use cases, but you may need to provide additional
     flags for your specific use case.

     The include dir of tvm ffi and dlpack are used by default for the compiler to find the headers. Thus, you can
     include any header from tvm ffi in your source files. You can also provide additional include paths via the
     ``extra_include_paths`` parameter and include custom headers in your source code.

     The compiled shared library is cached in a cache directory to avoid recompilation. The `build_directory` parameter
     is provided to specify the build directory. If not specified, a default tvm ffi cache directory will be used.
     The default cache directory can be specified via the `TVM_FFI_CACHE_DIR` environment variable. If not specified,
     the default cache directory is ``~/.cache/tvm-ffi``.

     Parameters
     ----------
     name
         The name of the tvm ffi module.
     sources
         Source files to compile. The compiler is auto-detected from the file extension:
         ``.c`` → C, ``.cc``/``.cpp``/``.cxx`` → C++, ``.cu`` → CUDA/HIP,
         ``.o``/``.obj`` → linker passthrough. It can be a list of file paths or a single file path.
     cpp_files
         Alias for ``sources``, kept for backward compatibility.
     cuda_files
         Alias for ``sources``, kept for backward compatibility.
     extra_cflags
         The extra compiler flags for C++ compilation.
         The default flags are:

         - On Linux/macOS: ['-std=c++17', '-fPIC', '-O2']
         - On Windows: ['/std:c++17', '/MD', '/O2']

     extra_cuda_cflags
         The extra compiler flags for CUDA compilation.
         The default flags are:

         - ['-Xcompiler', '-fPIC', '-std=c++17', '-O2'] (Linux/macOS)
         - ['-Xcompiler', '/std:c++17', '/O2'] (Windows)

     extra_ldflags
         The extra linker flags.
         The default flags are:

         - On Linux/macOS: ['-shared', '-L<tvm_ffi_lib_path>', '-ltvm_ffi']
         - On Windows: ['/DLL', '/LIBPATH:<tvm_ffi_lib_path>', '<tvm_ffi_lib_name>.lib']

     extra_include_paths
         The extra include paths for header files. Both absolute and relative paths are supported.

     build_directory
         The build directory. If not specified, a default tvm ffi cache directory will be used. By default, the
         cache directory is ``~/.cache/tvm-ffi``. You can also set the ``TVM_FFI_CACHE_DIR`` environment variable to
         specify the cache directory.

     keep_module_alive
         Whether to keep the module alive. If True, the module will be kept alive
         for the duration of the program until libtvm_ffi.so is unloaded.

     backend
         The GPU backend to use. It can be "cuda" or "hip".
         If not specified, the backend will be automatically determined based on the available GPU and the provided source code.

     Returns
     -------
     mod: Module
         The loaded tvm ffi module.

     See Also
     --------
     :py:func:`tvm_ffi.load_module`

     Example
     -------

     .. code-block:: python

         import torch
         from tvm_ffi import Module
         import tvm_ffi.cpp

         # Assume we have a C++ source file "my_ops.cpp" with the following content:
         # ```cpp
         # #include <tvm/ffi/container/tensor.h>
         # #include <tvm/ffi/dtype.h>
         # #include <tvm/ffi/error.h>
         # #include <tvm/ffi/extra/c_env_api.h>
         # #include <tvm/ffi/function.h>
         #
         # void add_one_cpu(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
         #   TVM_FFI_ICHECK(x.ndim() == 1) << "x must be a 1D tensor";
         #   DLDataType f32_dtype{kDLFloat, 32, 1};
         #   TVM_FFI_ICHECK(x.dtype() == f32_dtype) << "x must be a float tensor";
         #   TVM_FFI_ICHECK(y.ndim() == 1) << "y must be a 1D tensor";
         #   TVM_FFI_ICHECK(y.dtype() == f32_dtype) << "y must be a float tensor";
         #   TVM_FFI_ICHECK(x.size(0) == y.size(0)) << "x and y must have the same shape";
         #   for (int i = 0; i < x.size(0); ++i) {
         #     static_cast<float*>(y.data_ptr())[i] = static_cast<float*>(x.data_ptr())[i] + 1;
         #   }
         # }
         #
         # TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cpu, add_one_cpu);
         # ```

         # compile the cpp source file and load the module
         mod: Module = tvm_ffi.cpp.load(
             name="my_ops",
             sources="my_ops.cpp",
         )

         # use the function from the loaded module
         x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
         y = torch.empty_like(x)
         mod.add_one_cpu(x, y)
         torch.testing.assert_close(x + 1, y)

     """
     return load_module(
         build(
             name=name,
             sources=sources,
             cpp_files=cpp_files,
             cuda_files=cuda_files,
             extra_cflags=extra_cflags,
             extra_cuda_cflags=extra_cuda_cflags,
             extra_ldflags=extra_ldflags,
             extra_include_paths=extra_include_paths,
             build_directory=build_directory,
             backend=backend,
         ),
         keep_module_alive=keep_module_alive,
     )