Adds documentation/examples for data adapters
This has:
1. A reference table, automatically generated using a custom sphinx
directive
2. References for the base classes for extension
Re (1) we generate a bare-bones table but it should be enough. For now,
we just link to the code, but we will, at some point, link to actual
class docs.
diff --git a/docs/conf.py b/docs/conf.py
index ea274db..35f7900 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,6 +30,7 @@
"sphinx.ext.autosummary",
"myst_parser",
"sphinx_sitemap",
+ "docs.data_adapters_extension",
]
# for the sitemap extension ---
diff --git a/docs/data_adapters_extension.py b/docs/data_adapters_extension.py
new file mode 100644
index 0000000..923c78c
--- /dev/null
+++ b/docs/data_adapters_extension.py
@@ -0,0 +1,288 @@
+import dataclasses
+import inspect
+import os
+from typing import List, Optional, Tuple, Type
+
+import git
+from docutils import nodes
+from docutils.parsers.rst import Directive
+
+import hamilton.io.data_adapters
+from hamilton import registry
+
+"""A module to crawl available data adapters and generate documentation for them.
+Note these currently link out to the source code on GitHub, but they should
+be linking to the documentation instead, which hasn't been generated yet.
+"""
+
+# These have fallbacks for local dev
+GIT_URL = os.environ.get("READTHEDOCS_GIT_CLONE_URL", "https://github.com/dagworks-inc/hamilton")
+GIT_ID = os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main")
+
+# All the modules that register data adapters
+# When you register a new one, add it here
+MODULES_TO_IMPORT = ["hamilton.io.default_data_loaders", "hamilton.plugins.pandas_extensions"]
+
+for module in MODULES_TO_IMPORT:
+ __import__(module)
+
+
+def get_git_root(path: str) -> str:
+    """Returns the git root of a repo, given an absolute path to
+    a file within the repo.
+
+ :param path: Path to a file within a git repo
+ :return: The root of the git repo
+ """
+ git_repo = git.Repo(path, search_parent_directories=True)
+ git_root = git_repo.git.rev_parse("--show-toplevel")
+ return git_root
+
+
+@dataclasses.dataclass
+class Param:
+ name: str
+ type: str
+ default: Optional[str] = None
+
+
+def get_default(param: dataclasses.Field) -> Optional[str]:
+    """Gets the default of a dataclass field, if it has one.
+
+ :param param: The dataclass field
+ :return: The str representation of the default.
+ """
+ if param.default is dataclasses.MISSING:
+ return None
+ return str(param.default)
+
+
+def get_lines_for_class(class_: Type) -> Tuple[int, int]:
+ """Gets the set of lines in which a class is implemented
+
+ :param class_: The class to get the lines for
+ :return: A tuple of the start and end lines
+ """
+ lines = inspect.getsourcelines(class_)
+ start_line = lines[1]
+ end_line = lines[1] + len(lines[0])
+ return start_line, end_line
+
+
+def get_class_repr(class_: Type) -> str:
+ """Gets a representation of a class that can be used in documentation.
+
+ :param class_: Python class to get the representation for
+ :return: Str representation
+ """
+
+ try:
+ return class_.__qualname__
+ except AttributeError:
+ # This happens when we have generics or other oddities
+ return str(class_)
+
+
+@dataclasses.dataclass
+class AdapterInfo:
+ key: str
+ class_name: str
+ class_path: str
+ load_params: List[Param]
+ save_params: List[Param]
+ applicable_types: List[str]
+ file_: str
+ line_nos: Tuple[int, int]
+
+ @staticmethod
+ def from_loader(loader: Type[hamilton.io.data_adapters.DataLoader]) -> "AdapterInfo":
+ """Utility constructor to create the AdapterInfo from a DataLoader class
+
+ :param loader: DataLoader class
+ :return: AdapterInfo derived from it
+ """
+
+ return AdapterInfo(
+ key=loader.name(),
+ class_name=loader.__name__,
+ class_path=loader.__module__,
+ load_params=[
+ Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
+ for p in dataclasses.fields(loader)
+ ]
+            if issubclass(loader, hamilton.io.data_adapters.DataLoader)
+ else None,
+ save_params=[
+ Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
+ for p in dataclasses.fields(loader)
+ ]
+ if issubclass(loader, hamilton.io.data_adapters.DataSaver)
+ else None,
+ applicable_types=[get_class_repr(t) for t in loader.applicable_types()],
+ file_=inspect.getfile(loader),
+ line_nos=get_lines_for_class(loader),
+ )
+
+
+def _collect_loaders(saver_or_loader: str) -> List[Type[hamilton.io.data_adapters.AdapterCommon]]:
+    """Collects all loader (or saver) classes from the registry.
+
+    :param saver_or_loader: Either "loader" or "saver", selecting which registry to read.
+    :return: A deduplicated list of adapter classes.
+    """
+ out = []
+ loaders = (
+ list(registry.LOADER_REGISTRY.values())
+ if saver_or_loader == "loader"
+ else list(registry.SAVER_REGISTRY.values())
+ )
+ for classes in loaders:
+ for cls in classes:
+ if cls not in out:
+ out.append(cls)
+ return out
+
+
+# Utility functions to render different components of the adapter in table cells
+
+
+def render_key(key: str):
+ return [nodes.Text(key, key)]
+
+
+def render_class_name(class_name: str):
+ return [nodes.literal(text=class_name)]
+
+
+def render_class_path(class_path: str, file_: str, line_start: int, line_end: int):
+ git_path = get_git_root(file_)
+ file_relative_to_git_root = os.path.relpath(file_, git_path)
+ href = f"{GIT_URL}/blob/{GIT_ID}/{file_relative_to_git_root}#L{line_start}-L{line_end}"
+ # href = f"{GIT_URL}/blob/{GIT_ID}/{file_}#L{line_no}"
+ return [nodes.raw("", f'<a href="{href}">{class_path}</a>', format="html")]
+
+
+def render_adapter_params(load_params: Optional[List[Param]]):
+ if load_params is None:
+ return nodes.raw("", "<div/>", format="html")
+ fieldlist = nodes.field_list()
+ for i, load_param in enumerate(load_params):
+ fieldname = nodes.Text(load_param.name)
+ fieldbody = nodes.literal(
+ text=load_param.type
+ + ("=" + load_param.default if load_param.default is not None else "")
+ )
+ field = nodes.field("", fieldname, fieldbody)
+ fieldlist += field
+ if i < len(load_params) - 1:
+ fieldlist += nodes.raw("", "<br/>", format="html")
+ return fieldlist
+
+
+def render_applicable_types(applicable_types: List[str]):
+ fieldlist = nodes.field_list()
+ for applicable_type in applicable_types:
+ fieldlist += nodes.field("", nodes.literal(text=applicable_type), nodes.Text(""))
+ fieldlist += nodes.raw("", "<br/>", format="html")
+ return fieldlist
+
+
+class DataAdapterTableDirective(Directive):
+ """Custom directive to render a table of all data adapters. Takes in one argument
+ that is either 'loader' or 'saver' to indicate which adapters to render."""
+
+ has_content = True
+ required_arguments = 1 # Number of required arguments
+
+ def run(self):
+ """Runs the directive. This does the following:
+ 1. Collects all loaders from the registry
+ 2. Creates a table with the following columns:
+ - Key
+ - Class name
+ - Class path
+ - Load params
+ - Applicable types
+ 3. Returns the table
+ :return: A list of nodes that Sphinx will render, consisting of the table node
+ """
+ saver_or_loader = self.arguments[0]
+ if saver_or_loader not in ("loader", "saver"):
+ raise ValueError(
+                f"saver_or_loader must be one of 'loader' or 'saver', " f"got {saver_or_loader}"
+ )
+ table_data = [
+ AdapterInfo.from_loader(loader) for loader in _collect_loaders(saver_or_loader)
+ ]
+
+ # Create the table and add columns
+ table_node = nodes.table()
+        tgroup = nodes.tgroup(cols=4)  # must match the number of colspecs added below
+ table_node += tgroup
+
+ # Create columns
+ key_spec = nodes.colspec(colwidth=1)
+ # class_spec = nodes.colspec(colwidth=1)
+ load_params_spec = nodes.colspec(colwidth=2)
+ applicable_types_spec = nodes.colspec(colwidth=1)
+ class_path_spec = nodes.colspec(colwidth=1)
+
+ tgroup += [key_spec, load_params_spec, applicable_types_spec, class_path_spec]
+
+ # Create the table body
+ thead = nodes.thead()
+ row = nodes.row()
+
+ # Create entry nodes for each cell
+ key_entry = nodes.entry()
+ load_params_entry = nodes.entry()
+ applicable_types_entry = nodes.entry()
+ class_path_entry = nodes.entry()
+
+ key_entry += nodes.paragraph(text="key")
+
+ load_params_entry += nodes.paragraph(text=f"{saver_or_loader} params")
+ applicable_types_entry += nodes.paragraph(text="types")
+ class_path_entry += nodes.paragraph(text="module")
+
+ row += [key_entry, load_params_entry, applicable_types_entry, class_path_entry]
+ thead += row
+ tgroup += thead
+ tbody = nodes.tbody()
+ tgroup += tbody
+
+ # Populate table rows based on your table_data
+ for row_data in table_data:
+ row = nodes.row()
+
+ # Create entry nodes for each cell
+ key_entry = nodes.entry()
+ load_params_entry = nodes.entry()
+ applicable_types_entry = nodes.entry()
+ class_path_entry = nodes.entry()
+
+ # Create a paragraph node for each entry
+ # import pdb
+ # pdb.set_trace()
+ # para1 = nodes.literal(text=row_data['column1_data'])
+ # para2 = nodes.paragraph(text=row_data['column2_data'])
+
+ # Add the paragraph nodes to the entry nodes
+ key_entry += render_key(row_data.key)
+ load_params_entry += render_adapter_params(row_data.load_params)
+ applicable_types_entry += render_applicable_types(row_data.applicable_types)
+ class_path_entry += render_class_path(
+ row_data.class_path, row_data.file_, *row_data.line_nos
+ )
+
+ # Add the entry nodes to the row
+ row += [key_entry, load_params_entry, applicable_types_entry, class_path_entry]
+
+ # Add the row to the table body
+ tbody += row
+
+ return [table_node]
+
+
+def setup(app):
+ """Required to register the extension"""
+ app.add_directive("data_adapter_table", DataAdapterTableDirective)
diff --git a/docs/index.md b/docs/index.md
index b82c522..ee17efa 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -41,6 +41,7 @@
reference/decorators/index
reference/drivers/index
+reference/io/index
reference/graph-adapters/index
reference/result-builders/index
reference/miscellaneous/index
diff --git a/docs/reference/io/adapter-documentation.rst b/docs/reference/io/adapter-documentation.rst
new file mode 100644
index 0000000..6394a4c
--- /dev/null
+++ b/docs/reference/io/adapter-documentation.rst
@@ -0,0 +1,20 @@
+=========================
+Data Adapters
+=========================
+
+Reference for data adapter base classes:
+
+.. autoclass:: hamilton.io.data_adapters.DataLoader
+ :special-members: __init__
+ :members:
+ :inherited-members:
+
+.. autoclass:: hamilton.io.data_adapters.DataSaver
+ :special-members: __init__
+ :members:
+ :inherited-members:
+
+.. autoclass:: hamilton.io.data_adapters.AdapterCommon
+ :special-members: __init__
+ :members:
+ :inherited-members:
diff --git a/docs/reference/io/available-data-adapters.rst b/docs/reference/io/available-data-adapters.rst
new file mode 100644
index 0000000..a8c5a24
--- /dev/null
+++ b/docs/reference/io/available-data-adapters.rst
@@ -0,0 +1,56 @@
+========================
+Using Data Adapters
+========================
+
+This is an index of all the available data adapters, both savers and loaders.
+Note that some savers and loaders are the same (certain classes can handle both),
+but some are different. You will want to reference this when calling out to any of the following:
+
+1. Using :doc:`/reference/decorators/save_to/`.
+2. Using :doc:`/reference/decorators/load_from/`.
+3. Using :doc:`materialize </reference/drivers/Driver/>`
+
+To read these tables, you want to first look at the key to determine which format you want --
+these should be human-readable and familiar to you. Then you'll want to look at the `types` field
+to figure out which is the best for your case (the object you want to load from or save to).
+
+Finally, look up the adapter params to see what parameters you can pass to the data adapters.
+The optional params come with their default value specified.
+
+If you want more information, click on the `module`, it will send you to the code that implements
+it to see how the parameters are used.
+
+As an example, say we wanted to save a pandas dataframe to a CSV file. We would first find the
+key `csv`, which would inform us that we want to call `save_to.csv` (or `to.csv` in the case
+of `materialize`). Then, we would look at the `types` field, finding that there is a pandas
+dataframe adapter. Finally, we would look at the `params` field, finding that we can pass
+`path`, and (optionally) `sep` (which we'd realize defaults to `,` when looking at the code).
+
+All together, we'd end up with:
+
+.. code-block:: python
+
+ import pandas as pd
+ from hamilton.function_modifiers import value, save_to
+
+ @save_to.csv(path=value("my_file.csv"))
+ def my_data(...) -> pd.DataFrame:
+ ...
+
+And we're good to go!
+
+If you want to extend these, see :doc:`/reference/io/adapter-documentation` for documentation,
+and `the example <https://github.com/DAGWorks-Inc/hamilton/blob/main/examples/materialization/README.md>`_
+in the repository for an example of how to do so.
+
+=============
+Data Loaders
+=============
+
+.. data_adapter_table:: loader
+
+=============
+Data Savers
+=============
+
+.. data_adapter_table:: saver
diff --git a/docs/reference/io/index.rst b/docs/reference/io/index.rst
new file mode 100644
index 0000000..637a69c
--- /dev/null
+++ b/docs/reference/io/index.rst
@@ -0,0 +1,11 @@
+==============
+I/O
+==============
+
+This section contains any information about I/O within Hamilton
+
+.. toctree::
+ :maxdepth: 2
+
+ available-data-adapters
+ adapter-documentation
diff --git a/examples/materialization/README.md b/examples/materialization/README.md
index 24cf6fe..3a6c73e 100644
--- a/examples/materialization/README.md
+++ b/examples/materialization/README.md
@@ -23,6 +23,9 @@
See [run.py](run.py) for the full example.
+In this example we only pass literal values to the materializers. That said, you can use both `source` (to specify the source from an upstream node),
+and `value` (which is the default) to specify literals.
+
## `driver.materialize`
diff --git a/hamilton/function_modifiers/adapters.py b/hamilton/function_modifiers/adapters.py
index 062c926..3335615 100644
--- a/hamilton/function_modifiers/adapters.py
+++ b/hamilton/function_modifiers/adapters.py
@@ -312,7 +312,9 @@
f"Available loaders are: {LOADER_REGISTRY.keys()}. "
f"If you've gotten to this point, you either (1) spelled the "
f"loader name wrong, (2) are trying to use a loader that does"
- f"not exist (yet)"
+ f"not exist (yet). For a list of available loaders, see: "
+ f"https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+ f"-loaders "
) from e
@@ -425,11 +427,13 @@
return super().__getattribute__(item)
except AttributeError as e:
raise AttributeError(
- f"No saver named: {item} available for {cls.__name__}. "
- f"Available data savers are: {list(SAVER_REGISTRY.keys())}. "
- f"If you've gotten to this point, you either (1) spelled the "
- f"loader name wrong, (2) are trying to use a saver that does"
- f"not exist (yet)."
+                f"No saver named: {item} available for {cls.__name__}. "
+                f"Available data savers are: {list(SAVER_REGISTRY.keys())}. "
+ "If you've gotten to this point, you either (1) spelled the "
+ "loader name wrong, (2) are trying to use a saver that does"
+ "not exist (yet). For a list of available savers, see "
+ "https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+                "-savers "
) from e
diff --git a/hamilton/io/data_adapters.py b/hamilton/io/data_adapters.py
index 965a1f2..b997227 100644
--- a/hamilton/io/data_adapters.py
+++ b/hamilton/io/data_adapters.py
@@ -153,12 +153,12 @@
@abc.abstractmethod
def save_data(self, data: Any) -> Dict[str, Any]:
"""Saves the data to the data source.
- Note this uses the constructor parameters to determine
- how to save the data.
+ Note this uses the constructor parameters to determine
+ how to save the data.
:return: Any relevant metadata. This is up the the data saver, but will likely
- include the URI, etc... This is going to be similar to the metadata returned
- by the data loader in the loading tuple.
+ include the URI, etc... This is going to be similar to the metadata returned
+ by the data loader in the loading tuple.
"""
pass
diff --git a/hamilton/io/materialization.py b/hamilton/io/materialization.py
index 0bf6fc7..09952cb 100644
--- a/hamilton/io/materialization.py
+++ b/hamilton/io/materialization.py
@@ -28,11 +28,13 @@
return super().__getattribute__(item)
except AttributeError as e:
raise AttributeError(
- f"No data materializer named: {item}. "
- f"Available materializers are: {SAVER_REGISTRY.keys()}. "
- f"If you've gotten to this point, you either (1) spelled the "
- f"loader name wrong, (2) are trying to use a loader that does"
- f"not exist (yet)"
+                f"No data materializer named: {item}. "
+                f"Available materializers are: {SAVER_REGISTRY.keys()}. "
+ "If you've gotten to this point, you either (1) spelled the "
+ "loader name wrong, (2) are trying to use a loader that does"
+ "not exist (yet). For a list of available materializers, see "
+ "https://hamilton.readthedocs.io/reference/io/available-data-adapters/#data"
+                "-savers "
) from e
diff --git a/requirements-docs.txt b/requirements-docs.txt
index 3613769..2403cff 100644
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@@ -3,6 +3,7 @@
commonmark==0.9.1 # read the docs pins
dask[distributed]
furo
+gitpython # Required for parsing git info for generation of data-adapter docs
mock==1.0.1 # read the docs pins
myst-parser==0.18.1 # latest version of myst at this time
pillow