Adds documentation/examples for data adapters This has: 1. A reference table, automatically generated using a custom sphinx directive 2. References for the base classes for extension Re (1) we generate a bare-bones table but it should be enough. For now, we just link to the code, but we will, at some point, link to actual class docs.
diff --git a/docs/conf.py b/docs/conf.py index ea274db..35f7900 100644 --- a/docs/conf.py +++ b/docs/conf.py
@@ -30,6 +30,7 @@ "sphinx.ext.autosummary", "myst_parser", "sphinx_sitemap", + "docs.data_adapters_extension", ] # for the sitemap extension ---
diff --git a/docs/data_adapters_extension.py b/docs/data_adapters_extension.py new file mode 100644 index 0000000..923c78c --- /dev/null +++ b/docs/data_adapters_extension.py
@@ -0,0 +1,280 @@
+"""A module to crawl available data adapters and generate documentation for them.
+
+Note these currently link out to the source code on GitHub, but they should
+be linking to the documentation instead, which hasn't been generated yet.
+"""
+
+import dataclasses
+import inspect
+import os
+from typing import List, Optional, Tuple, Type
+
+import git
+from docutils import nodes
+from docutils.parsers.rst import Directive
+
+import hamilton.io.data_adapters
+from hamilton import registry
+
+# These have fallbacks for local dev
+GIT_URL = os.environ.get("READTHEDOCS_GIT_CLONE_URL", "https://github.com/dagworks-inc/hamilton")
+GIT_ID = os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main")
+
+# All the modules that register data adapters
+# When you register a new one, add it here
+MODULES_TO_IMPORT = ["hamilton.io.default_data_loaders", "hamilton.plugins.pandas_extensions"]
+
+# Importing these modules is what registers their adapters
+for module in MODULES_TO_IMPORT:
+    __import__(module)
+
+
+def get_git_root(path: str) -> str:
+    """Returns the git root of a repo, given an absolute path to
+    a file within the repo.
+
+    :param path: Path to a file within a git repo
+    :return: The root of the git repo
+    """
+    git_repo = git.Repo(path, search_parent_directories=True)
+    return git_repo.git.rev_parse("--show-toplevel")
+
+
+@dataclasses.dataclass
+class Param:
+    """A single adapter parameter: its name, type and (optional) default, all as strings."""
+
+    name: str
+    type: str
+    default: Optional[str] = None
+
+
+def get_default(param: dataclasses.Field) -> Optional[str]:
+    """Gets the default of a dataclass field, if it has one.
+
+    :param param: The dataclass field
+    :return: The str representation of the default, or None if the field has none.
+    """
+    if param.default is not dataclasses.MISSING:
+        return str(param.default)
+    factory = param.default_factory
+    if factory is not dataclasses.MISSING:
+        # Fields declared with field(default_factory=...) are optional too, so
+        # show the factory call instead of rendering them as if they were required
+        return f"{getattr(factory, '__name__', repr(factory))}()"
+    return None
+
+
+def get_lines_for_class(class_: Type) -> Tuple[int, int]:
+    """Gets the range of lines in which a class is implemented
+
+    :param class_: The class to get the lines for
+    :return: A tuple of the first and last line of the class (both inclusive)
+    """
+    source_lines, start_line = inspect.getsourcelines(class_)
+    # The start line is itself one of the source lines, hence the -1
+    end_line = start_line + len(source_lines) - 1
+    return start_line, end_line
+
+
+def get_class_repr(class_: Type) -> str:
+    """Gets a representation of a class that can be used in documentation.
+
+    :param class_: Python class to get the representation for
+    :return: Str representation
+    """
+
+    try:
+        return class_.__qualname__
+    except AttributeError:
+        # This happens when we have generics or other oddities
+        return str(class_)
+
+
+@dataclasses.dataclass
+class AdapterInfo:
+    """Everything the docs table needs to know about a single data adapter class."""
+
+    key: str
+    class_name: str
+    class_path: str
+    load_params: Optional[List[Param]]  # None if the class is not a DataLoader
+    save_params: Optional[List[Param]]  # None if the class is not a DataSaver
+    applicable_types: List[str]
+    file_: str
+    line_nos: Tuple[int, int]
+
+    @staticmethod
+    def from_loader(loader: Type[hamilton.io.data_adapters.AdapterCommon]) -> "AdapterInfo":
+        """Utility constructor to create the AdapterInfo from an adapter class.
+        Despite the name, this accepts DataLoader and DataSaver classes alike.
+
+        :param loader: DataLoader and/or DataSaver class (must be a dataclass)
+        :return: AdapterInfo derived from it
+        """
+        params = [
+            Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
+            for p in dataclasses.fields(loader)
+        ]
+        is_loader = issubclass(loader, hamilton.io.data_adapters.DataLoader)
+        is_saver = issubclass(loader, hamilton.io.data_adapters.DataSaver)
+        return AdapterInfo(
+            key=loader.name(),
+            class_name=loader.__name__,
+            class_path=loader.__module__,
+            # The constructor fields double as the load and/or save parameters
+            load_params=params if is_loader else None,
+            save_params=list(params) if is_saver else None,
+            applicable_types=[get_class_repr(t) for t in loader.applicable_types()],
+            file_=inspect.getfile(loader),
+            line_nos=get_lines_for_class(loader),
+        )
+
+
+def _collect_loaders(saver_or_loader: str) -> List[Type[hamilton.io.data_adapters.AdapterCommon]]:
+    """Collects all loaders (or savers) from the registry.
+
+    :param saver_or_loader: "loader" to collect from the loader registry,
+        anything else to collect from the saver registry
+    :return: The registered classes, without duplicates
+    """
+    adapter_registry = (
+        registry.LOADER_REGISTRY if saver_or_loader == "loader" else registry.SAVER_REGISTRY
+    )
+    out = []
+    for classes in adapter_registry.values():
+        for cls in classes:
+            # A class may show up more than once in the registry; only list it once
+            if cls not in out:
+                out.append(cls)
+    return out
+
+
+# Utility functions to render different components of the adapter in table cells
+
+
+def render_key(key: str):
+    return [nodes.Text(key)]
+
+
+def render_class_name(class_name: str):
+    return [nodes.literal(text=class_name)]
+
+
+def render_class_path(class_path: str, file_: str, line_start: int, line_end: int):
+    """Renders the module path as a link to the implementing lines on the git host."""
+    git_path = get_git_root(file_)
+    file_relative_to_git_root = os.path.relpath(file_, git_path)
+    href = f"{GIT_URL}/blob/{GIT_ID}/{file_relative_to_git_root}#L{line_start}-L{line_end}"
+    return [nodes.raw("", f'<a href="{href}">{class_path}</a>', format="html")]
+
+
+def render_adapter_params(load_params: Optional[List[Param]]):
+    """Renders load or save params as a field list of ``name: type[=default]``.
+
+    :param load_params: The params to render, or None if there are none to show
+    :return: The node to place in the table cell
+    """
+    if load_params is None:
+        return nodes.raw("", "<div/>", format="html")
+    fieldlist = nodes.field_list()
+    for i, load_param in enumerate(load_params):
+        fieldname = nodes.Text(load_param.name)
+        fieldbody = nodes.literal(
+            text=load_param.type
+            + ("=" + load_param.default if load_param.default is not None else "")
+        )
+        field = nodes.field("", fieldname, fieldbody)
+        fieldlist += field
+        if i < len(load_params) - 1:
+            fieldlist += nodes.raw("", "<br/>", format="html")
+    return fieldlist
+
+
+def render_applicable_types(applicable_types: List[str]):
+    fieldlist = nodes.field_list()
+    for applicable_type in applicable_types:
+        fieldlist += nodes.field("", nodes.literal(text=applicable_type), nodes.Text(""))
+        fieldlist += nodes.raw("", "<br/>", format="html")
+    return fieldlist
+
+
+def _build_row(cells) -> nodes.row:
+    """Builds a table row with one entry per cell.
+
+    :param cells: The contents of each cell -- either a node or a list of nodes
+    :return: The row node
+    """
+    row = nodes.row()
+    for cell in cells:
+        entry = nodes.entry()
+        entry += cell
+        row += entry
+    return row
+
+
+class DataAdapterTableDirective(Directive):
+    """Custom directive to render a table of all data adapters. Takes in one argument
+    that is either 'loader' or 'saver' to indicate which adapters to render."""
+
+    has_content = True
+    required_arguments = 1  # Number of required arguments
+
+    def run(self):
+        """Runs the directive. This does the following:
+        1. Collects all loaders (or savers) from the registry
+        2. Creates a table with the following columns:
+            - Key
+            - Loader/saver params
+            - Applicable types
+            - Module (linking to the code)
+        3. Returns the table
+        :return: A list of nodes that Sphinx will render, consisting of the table node
+        """
+        saver_or_loader = self.arguments[0]
+        if saver_or_loader not in ("loader", "saver"):
+            raise ValueError(
+                f"loader_or_saver must be one of 'loader' or 'saver', got {saver_or_loader}"
+            )
+        table_data = [
+            AdapterInfo.from_loader(loader) for loader in _collect_loaders(saver_or_loader)
+        ]
+
+        column_widths = (1, 2, 1, 1)
+        column_titles = ("key", f"{saver_or_loader} params", "types", "module")
+
+        # Create the table and add columns
+        table_node = nodes.table()
+        # cols must match the number of colspecs/entries per row
+        tgroup = nodes.tgroup(cols=len(column_widths))
+        table_node += tgroup
+        tgroup += [nodes.colspec(colwidth=width) for width in column_widths]
+
+        # Create the table header
+        thead = nodes.thead()
+        thead += _build_row([nodes.paragraph(text=title) for title in column_titles])
+        tgroup += thead
+
+        # Populate the table body, one row per adapter
+        tbody = nodes.tbody()
+        tgroup += tbody
+        for row_data in table_data:
+            # Show the params matching the table we're rendering
+            params = (
+                row_data.load_params if saver_or_loader == "loader" else row_data.save_params
+            )
+            tbody += _build_row(
+                [
+                    render_key(row_data.key),
+                    render_adapter_params(params),
+                    render_applicable_types(row_data.applicable_types),
+                    render_class_path(row_data.class_path, row_data.file_, *row_data.line_nos),
+                ]
+            )
+
+        return [table_node]
+
+
+def setup(app):
+    """Required to register the extension"""
+    app.add_directive("data_adapter_table", DataAdapterTableDirective)
diff --git a/docs/index.md b/docs/index.md index b82c522..ee17efa 100644 --- a/docs/index.md +++ b/docs/index.md
@@ -41,6 +41,7 @@ reference/decorators/index reference/drivers/index +reference/io/index reference/graph-adapters/index reference/result-builders/index reference/miscellaneous/index
diff --git a/docs/reference/io/adapter-documentation.rst b/docs/reference/io/adapter-documentation.rst new file mode 100644 index 0000000..6394a4c --- /dev/null +++ b/docs/reference/io/adapter-documentation.rst
@@ -0,0 +1,20 @@ +========================= +Data Adapters +========================= + +Reference for data adapter base classes: + +.. autoclass:: hamilton.io.data_adapters.DataLoader + :special-members: __init__ + :members: + :inherited-members: + +.. autoclass:: hamilton.io.data_adapters.DataSaver + :special-members: __init__ + :members: + :inherited-members: + +.. autoclass:: hamilton.io.data_adapters.AdapterCommon + :special-members: __init__ + :members: + :inherited-members:
diff --git a/docs/reference/io/available-data-adapters.rst b/docs/reference/io/available-data-adapters.rst new file mode 100644 index 0000000..a8c5a24 --- /dev/null +++ b/docs/reference/io/available-data-adapters.rst
@@ -0,0 +1,56 @@ +======================== +Using Data Adapters +======================== + +This is an index of all the available data adapters, both savers and loaders. +Note that some savers and loaders are the same (certain classes can handle both), +but some are different. You will want to reference this when calling out to any of the following: + +1. Using :doc:`/reference/decorators/save_to/`. +2. Using :doc:`/reference/decorators/load_from/`. +3. Using :doc:`materialize </reference/drivers/Driver/>` + +To read these tables, you want to first look at the key to determine which format you want -- +these should be human-readable and familiar to you. Then you'll want to look at the `types` field +to figure out which is the best for your case (the object you want to load from or save to). + +Finally, look up the adapter params to see what parameters you can pass to the data adapters. +The optional params come with their default value specified. + +If you want more information, click on the `module`, it will send you to the code that implements +it to see how the parameters are used. + +As an example, say we wanted to save a pandas dataframe to a CSV file. We would first find the +key `csv`, which would inform us that we want to call `save_to.csv` (or `to.csv` in the case +of `materialize`). Then, we would look at the `types` field, finding that there is a pandas +dataframe adapter. Finally, we would look at the `params` field, finding that we can pass +`path`, and (optionally) `sep` (which we'd realize defaults to `,` when looking at the code). + +All together, we'd end up with: + +.. code-block:: python + + import pandas as pd + from hamilton.function_modifiers import value, save_to + + @save_to.csv(path=value("my_file.csv")) + def my_data(...) -> pd.DataFrame: + ... + +And we're good to go! 
+ +If you want to extend these, see :doc:`/reference/io/adapter-documentation` for the base-class reference, +and `the example <https://github.com/DAGWorks-Inc/hamilton/blob/main/examples/materialization/README.md>`_ +in the repository for an example of how to do so. + +============= +Data Loaders +============= + +.. data_adapter_table:: loader + +============= +Data Savers +============= + +.. data_adapter_table:: saver
diff --git a/docs/reference/io/index.rst b/docs/reference/io/index.rst new file mode 100644 index 0000000..637a69c --- /dev/null +++ b/docs/reference/io/index.rst
@@ -0,0 +1,11 @@ +============== +I/O +============== + +This section contains any information about I/O within Hamilton + +.. toctree:: + :maxdepth: 2 + + available-data-adapters + adapter-documentation
diff --git a/examples/materialization/README.md b/examples/materialization/README.md index 24cf6fe..3a6c73e 100644 --- a/examples/materialization/README.md +++ b/examples/materialization/README.md
@@ -23,6 +23,9 @@ See [run.py](run.py) for the full example. +In this example we only pass literal values to the materializers. That said, you can use both `source` (to specify the source from an upstream node), +and `value` (which is the default) to specify literals. + ## `driver.materialize`
diff --git a/hamilton/function_modifiers/adapters.py b/hamilton/function_modifiers/adapters.py index 062c926..3335615 100644 --- a/hamilton/function_modifiers/adapters.py +++ b/hamilton/function_modifiers/adapters.py
@@ -312,7 +312,9 @@
                 f"Available loaders are: {LOADER_REGISTRY.keys()}. "
                 f"If you've gotten to this point, you either (1) spelled the "
-                f"loader name wrong, (2) are trying to use a loader that does"
-                f"not exist (yet)"
+                f"loader name wrong, (2) are trying to use a loader that does "
+                f"not exist (yet). For a list of available loaders, see: "
+                f"https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+                f"-loaders"
             ) from e
 
 
@@ -425,11 +427,13 @@
             return super().__getattribute__(item)
         except AttributeError as e:
             raise AttributeError(
-                f"No saver named: {item} available for {cls.__name__}. "
-                f"Available data savers are: {list(SAVER_REGISTRY.keys())}. "
-                f"If you've gotten to this point, you either (1) spelled the "
-                f"loader name wrong, (2) are trying to use a saver that does"
-                f"not exist (yet)."
+                f"No saver named: {item} available for {cls.__name__}. "
+                f"Available data savers are: {list(SAVER_REGISTRY.keys())}. "
+                f"If you've gotten to this point, you either (1) spelled the "
+                f"saver name wrong, (2) are trying to use a saver that does "
+                f"not exist (yet). For a list of available savers, see "
+                f"https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+                f"-savers"
             ) from e
 
 
diff --git a/hamilton/io/data_adapters.py b/hamilton/io/data_adapters.py index 965a1f2..b997227 100644 --- a/hamilton/io/data_adapters.py +++ b/hamilton/io/data_adapters.py
@@ -153,12 +153,12 @@ @abc.abstractmethod def save_data(self, data: Any) -> Dict[str, Any]: """Saves the data to the data source. - Note this uses the constructor parameters to determine - how to save the data. + Note this uses the constructor parameters to determine + how to save the data. :return: Any relevant metadata. This is up the the data saver, but will likely - include the URI, etc... This is going to be similar to the metadata returned - by the data loader in the loading tuple. + include the URI, etc... This is going to be similar to the metadata returned + by the data loader in the loading tuple. """ pass
diff --git a/hamilton/io/materialization.py b/hamilton/io/materialization.py index 0bf6fc7..09952cb 100644 --- a/hamilton/io/materialization.py +++ b/hamilton/io/materialization.py
@@ -28,11 +28,13 @@
             return super().__getattribute__(item)
         except AttributeError as e:
             raise AttributeError(
-                f"No data materializer named: {item}. "
-                f"Available materializers are: {SAVER_REGISTRY.keys()}. "
-                f"If you've gotten to this point, you either (1) spelled the "
-                f"loader name wrong, (2) are trying to use a loader that does"
-                f"not exist (yet)"
+                f"No data materializer named: {item}. "
+                f"Available materializers are: {SAVER_REGISTRY.keys()}. "
+                f"If you've gotten to this point, you either (1) spelled the "
+                f"materializer name wrong, (2) are trying to use a materializer that does "
+                f"not exist (yet). For a list of available materializers, see "
+                f"https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+                f"-savers"
             ) from e
 
 
diff --git a/requirements-docs.txt b/requirements-docs.txt index 3613769..2403cff 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt
@@ -3,6 +3,7 @@ commonmark==0.9.1 # read the docs pins dask[distributed] furo +gitpython # Required for parsing git info for generation of data-adapter docs mock==1.0.1 # read the docs pins myst-parser==0.18.1 # latest version of myst at this time pillow