blob: 5ac424f138a0e087a6ec3b91f192d0853336112f [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Extract docstrings from pyarrow runtime and insert them into stub files.
Usage:
python scripts/update_stub_docstrings.py <install_prefix> <source_dir>
"""
import argparse
import importlib
import inspect
import os
import shutil
import sys
import tempfile
from pathlib import Path
from textwrap import indent
import libcst
from libcst import matchers as m
def _resolve_object(module, path):
    """Walk a dotted ``path`` starting at ``module`` and return what it names.

    Returns a ``(obj, parent, name)`` tuple where ``parent`` is the object the
    last path segment was looked up on and ``name`` is ``obj.__name__`` when
    available, otherwise the last path segment. An empty ``path`` yields
    ``(module, None, module.__name__)``. When any segment cannot be resolved
    the result is ``(None, None, None)``.
    """
    if not path:
        return module, None, module.__name__

    segments = path.split(".")
    owner, current = None, module

    for segment in segments:
        owner = current
        try:
            current = getattr(owner, segment)
            continue
        except AttributeError:
            pass

        # Attribute access failed; look for a raw entry in the owner's
        # namespace dictionary instead.
        try:
            current = vars(owner).get(segment)
        except TypeError:
            # ``vars()`` rejects objects that have no ``__dict__``.
            current = None
        if current is None:
            return None, None, None

    return current, owner, getattr(current, "__name__", segments[-1])
def _get_docstring(name, module, indentation):
    """Extract and format a docstring for insertion into a stub file.

    Parameters
    ----------
    name : str
        Dotted path of the object, relative to ``module``.
    module : module
        Runtime module the object is looked up in.
    indentation : int
        Nesting level of the docstring; every non-blank line after the
        opening quotes is prefixed with one space per level.

    Returns
    -------
    str or None
        Source text of a triple-double-quoted string literal, or ``None``
        when the object cannot be resolved (a message is printed in that
        case), has no docstring, or has nothing left once the signature
        prefix is removed.
    """
    obj, parent, obj_name = _resolve_object(module, name)
    if obj is None:
        print(f"{name} not found in {module.__name__}")
        return None

    docstring = inspect.getdoc(obj)
    if not docstring:
        return None

    # Remove signature prefix: when the text starts with the object's own
    # name (optionally qualified by its parent), the first two lines are
    # dropped -- presumably the signature line and the blank line after it.
    parent_name = getattr(parent, "__name__", None) if parent else None
    if docstring.startswith(obj_name) or (
        parent_name and docstring.startswith(f"{parent_name}.{obj_name}")
    ):
        docstring = "\n".join(docstring.splitlines()[2:])

    # Skip empty docstrings
    if not docstring.strip():
        return None

    # The text is about to become the body of a (non-raw) string literal, so
    # it has to survive being parsed again: a bare backslash would be read as
    # an escape sequence and an embedded triple quote would close the literal
    # early. Backslashes are escaped first so the ones added for the quotes
    # are not doubled.
    docstring = docstring.replace("\\", "\\\\").replace('"""', '\\"\\"\\"')

    prefix = " " * indentation
    return '"""\n' + indent(docstring + '\n"""', prefix)
class DocstringInserter(libcst.CSTTransformer):
    """CST transformer that inserts runtime docstrings into stub file nodes.

    Three kinds of stub definitions receive a docstring:

    * functions whose whole body is ``...`` on the ``def`` line,
    * classes whose indented body starts with ``...`` or with a function
      definition,
    * module-level ``name = _clone_signature(...)`` assignments.

    Parameters
    ----------
    module : module
        Runtime module the docstrings are looked up in.
    namespace : str
        Dotted path, relative to ``module``, that corresponds to the stub
        file being transformed; an empty string means ``module`` itself.
    """

    # ``name = _clone_signature(...)``; the first target must be a plain
    # name because its ``value`` is used as the attribute to look up.
    _CLONE_ALIAS = m.SimpleStatementLine(
        body=[
            m.Assign(
                targets=[m.AssignTarget(target=m.Name()), m.ZeroOrMore()],
                value=m.Call(func=m.Name(value="_clone_signature")),
            ),
            m.ZeroOrMore(),
        ]
    )
    # A statement line consisting of a single string literal.
    _STRING_STATEMENT = m.SimpleStatementLine(
        body=[m.Expr(value=m.SimpleString())]
    )
    # Class whose indented body starts with ``...``.
    _ELLIPSIS_CLASS = m.ClassDef(body=m.IndentedBlock(body=[
        m.SimpleStatementLine(body=[
            m.Expr(value=m.Ellipsis()), m.ZeroOrMore()]),
        m.ZeroOrMore()]))
    # Class whose indented body starts with a function definition.
    _FUNC_FIRST_CLASS = m.ClassDef(body=m.IndentedBlock(
        body=[m.FunctionDef(), m.ZeroOrMore()]))
    # ``def f(...): ...`` written on a single line.
    _ELLIPSIS_FUNC = m.FunctionDef(
        body=m.SimpleStatementSuite(body=[m.Expr(value=m.Ellipsis())]))

    def __init__(self, module, namespace):
        super().__init__()
        self.module = module
        self.base_namespace = namespace
        # Names of the enclosing class/function definitions, outermost first.
        self.stack = []
        # Nesting depth of the body currently being visited.
        self.indentation = 0

    @staticmethod
    def _docstring_statement(docstring):
        """Wrap string-literal source text in a standalone statement line."""
        return libcst.SimpleStatementLine(
            body=[libcst.Expr(value=libcst.SimpleString(value=docstring))])

    def _full_name(self):
        """Return the dotted runtime path of the definition being visited."""
        name = ".".join(self.stack)
        return f"{self.base_namespace}.{name}" if self.base_namespace else name

    def leave_Module(self, original_node, updated_node):
        """Add a docstring after each ``_clone_signature`` alias."""
        statements = list(updated_node.body)
        new_body = []
        for index, stmt in enumerate(statements):
            new_body.append(stmt)
            if not m.matches(stmt, self._CLONE_ALIAS):
                continue
            # An alias that is already followed by a string statement is
            # left alone so that running the script again on the same stubs
            # does not stack up duplicate docstrings.
            following = statements[index + 1:index + 2]
            if following and m.matches(following[0], self._STRING_STATEMENT):
                continue
            name = stmt.body[0].targets[0].target.value
            if self.base_namespace:
                name = f"{self.base_namespace}.{name}"
            docstring = _get_docstring(name, self.module, 0)
            if docstring:
                new_body.append(self._docstring_statement(docstring))
        return updated_node.with_changes(body=new_body)

    def visit_ClassDef(self, node):
        """Record entry into a class body."""
        self.stack.append(node.name.value)
        self.indentation += 1

    def leave_ClassDef(self, original_node, updated_node):
        """Insert the class docstring, then leave the class scope."""
        name = self._full_name()
        docstring = _get_docstring(name, self.module, self.indentation)
        if docstring:
            if m.matches(updated_node, self._ELLIPSIS_CLASS):
                # The leading ``...`` placeholder becomes the docstring.
                placeholder = updated_node.body.body[0].body[0].value
                updated_node = updated_node.deep_replace(
                    placeholder, libcst.SimpleString(value=docstring))
            elif m.matches(updated_node, self._FUNC_FIRST_CLASS):
                # No placeholder: put the docstring ahead of the first method.
                class_body = updated_node.body
                updated_node = updated_node.with_changes(
                    body=class_body.with_changes(
                        body=[self._docstring_statement(docstring)]
                        + list(class_body.body)))
        self.stack.pop()
        self.indentation -= 1
        return updated_node

    def visit_FunctionDef(self, node):
        """Record entry into a function body."""
        self.stack.append(node.name.value)
        self.indentation += 1

    def leave_FunctionDef(self, original_node, updated_node):
        """Replace a one-line ``...`` body with the runtime docstring."""
        name = self._full_name()
        if m.matches(original_node, self._ELLIPSIS_FUNC):
            docstring = _get_docstring(name, self.module, self.indentation)
            if docstring:
                # The single-line suite is swapped for an indented block
                # holding only the docstring.
                updated_node = updated_node.with_changes(
                    body=libcst.IndentedBlock(
                        body=[self._docstring_statement(docstring)]))
        self.stack.pop()
        self.indentation -= 1
        return updated_node
# Stub module names whose docstrings are looked up under the ``lib`` namespace
# of the runtime package instead of under a namespace named after the stub.
LIB_MODULES = {
    "_ipc",
    "_types",
    "array",
    "builder",
    "compat",
    "config",
    "device",
    "error",
    "io",
    "memory",
    "pandas_shim",
    "scalar",
    "table",
    "tensor",
}
def add_docstrings_to_stubs(stubs_dir):
    """Rewrite every ``.pyi`` file under ``stubs_dir`` with runtime docstrings.

    ``pyarrow`` is imported once and each stub is mapped to a namespace
    inside it: names listed in ``LIB_MODULES`` map to ``lib``, stubs located
    in a ``parquet`` or ``interchange`` directory map to that subpackage,
    ``__init__`` maps to the package root, and anything else maps to a
    namespace named after the stub. ``_stubs_typing.pyi`` is skipped. Each
    processed file is overwritten in place.
    """
    stubs_dir = Path(stubs_dir)
    print(f"Updating stub docstrings in: {stubs_dir}")

    pyarrow = importlib.import_module("pyarrow")
    subpackages = ("parquet", "interchange")

    for stub_file in sorted(stubs_dir.rglob('*.pyi')):
        if stub_file.name == "_stubs_typing.pyi":
            continue

        module_name = stub_file.stem
        package_name = stub_file.parent.name
        is_package_init = module_name == "__init__"

        if module_name in LIB_MODULES:
            namespace = "lib"
        elif package_name in subpackages:
            if is_package_init:
                namespace = package_name
            else:
                namespace = f"{package_name}.{module_name}"
        else:
            namespace = "" if is_package_init else module_name

        print(f" {stub_file.name} -> {namespace or '(root)'}")
        stub_source = stub_file.read_text(encoding="utf-8")
        tree = libcst.parse_module(stub_source)
        rewritten = tree.visit(DocstringInserter(pyarrow, namespace))
        stub_file.write_text(rewritten.code, encoding="utf-8")
def _link_or_copy(source, destination):
    """Make ``source`` available at ``destination`` by symlink or by copy.

    A symlink is attempted first on every platform except Windows because it
    is faster and uses no disk space. If creating it fails with ``OSError``
    (e.g. Docker volumes or network mounts that do not support symlinks), or
    on Windows, the file or directory tree is copied instead.
    """
    symlinks_allowed = sys.platform != "win32"

    if symlinks_allowed:
        try:
            os.symlink(source, destination)
        except OSError:
            pass  # fall through to a plain copy
        else:
            return

    if not source.is_dir():
        shutil.copy2(source, destination)
        return
    shutil.copytree(source, destination, symlinks=symlinks_allowed)
def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir):
    """
    Populate ``pyarrow_pkg`` so that it can be imported as ``pyarrow``.

    During wheel builds the .py sources and the compiled binary artifacts
    live in separate trees (source checkout vs CMake install prefix). Both
    are symlinked (or copied) into ``pyarrow_pkg`` so that a plain
    ``import pyarrow`` works and docstrings can be extracted at build time.

    From ``source_dir / "pyarrow"`` every ``.py`` entry is taken, plus every
    directory whose name does not start with ``.`` or ``__``. From
    ``install_pyarrow_dir`` every regular file except ``.pyi`` stubs is
    taken, unless an entry of that name is already present.

    Raises ``FileNotFoundError`` when the source package directory is missing.
    """
    source_pyarrow = source_dir / "pyarrow"
    if not source_pyarrow.exists():
        raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}")

    for entry in sorted(source_pyarrow.iterdir()):
        wanted = entry.suffix == ".py" or (
            entry.is_dir() and not entry.name.startswith((".", "__"))
        )
        if wanted:
            _link_or_copy(entry, pyarrow_pkg / entry.name)

    for artifact in sorted(install_pyarrow_dir.iterdir()):
        if artifact.is_file() and artifact.suffix != ".pyi":
            destination = pyarrow_pkg / artifact.name
            # Entries that came from the source tree take precedence.
            if not destination.exists():
                _link_or_copy(artifact, destination)
def main():
    """Command-line entry point: inject docstrings into the installed stubs.

    Exits with status 0 without doing anything when the install tree holds
    no ``.pyi`` files. Otherwise a temporary importable ``pyarrow`` package
    is assembled, put first on ``sys.path`` for the duration of the update,
    and the stubs in the install tree are rewritten in place.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("install_prefix", type=Path,
                        help="CMAKE_INSTALL_PREFIX used by wheel build")
    parser.add_argument("source_dir", type=Path,
                        help="PyArrow source directory")
    args = parser.parse_args()

    install_prefix = args.install_prefix.resolve()
    source_dir = args.source_dir.resolve()

    # Use the ``pyarrow`` subdirectory of the prefix when there is one,
    # otherwise treat the prefix itself as the package directory.
    nested_dir = install_prefix / "pyarrow"
    install_pyarrow_dir = nested_dir if nested_dir.exists() else install_prefix

    if next(install_pyarrow_dir.rglob("*.pyi"), None) is None:
        print("No .pyi files found in install tree, skipping docstring injection")
        sys.exit(0)

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
        pyarrow_pkg = Path(tmpdir) / "pyarrow"
        pyarrow_pkg.mkdir()
        _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir)

        sys.path.insert(0, tmpdir)
        try:
            add_docstrings_to_stubs(install_pyarrow_dir)
        finally:
            sys.path.pop(0)


if __name__ == "__main__":
    main()