blob: 6b4853140dadef70953a4795505af77e65572e13 [file] [log] [blame]
import pathlib
import sqlite3
from typing import Union
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal
from sqlalchemy import create_engine
from hamilton.plugins.pandas_extensions import (
PandasCSVReader,
PandasCSVWriter,
PandasExcelReader,
PandasExcelWriter,
PandasFeatherReader,
PandasFeatherWriter,
PandasFWFReader,
PandasHtmlReader,
PandasHtmlWriter,
PandasJsonReader,
PandasJsonWriter,
PandasORCReader,
PandasORCWriter,
PandasParquetReader,
PandasParquetWriter,
PandasPickleReader,
PandasPickleWriter,
PandasSPSSReader,
PandasSqlReader,
PandasSqlWriter,
PandasStataReader,
PandasStataWriter,
PandasTableReader,
PandasXmlReader,
PandasXmlWriter,
)
@pytest.fixture
def df():
yield pd.DataFrame({"foo": ["bar"]})
def test_pandas_parquet(df: pd.DataFrame, tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "sample_df.parquet.gzip"
writer = PandasParquetWriter(path=file_path, engine="pyarrow")
writer.save_data(df)
reader = PandasParquetReader(path=file_path, engine="pyarrow")
read_df, metadata = reader.load_data(pd.DataFrame)
assert_frame_equal(read_df, df, check_dtype=False)
assert read_df.shape == df.shape, "DataFrames do not match"
assert read_df.columns == df.columns
assert len(list(tmp_path.iterdir())) == 1, "Unexpected number of files in tmp_path directory."
def test_pandas_pickle(df: pd.DataFrame, tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "sample_df.pkl"
writer = PandasPickleWriter(path=file_path)
writer.save_data(df)
reader = PandasPickleReader(filepath_or_buffer=file_path)
read_df, metadata = reader.load_data(pd.DataFrame)
assert read_df.equals(df), "DataFrames do not match"
assert len(list(tmp_path.iterdir())) == 1, "Unexpected number of files in tmp_path directory."
def test_pandas_json(df: pd.DataFrame, tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.json"
writer = PandasJsonWriter(filepath_or_buffer=file_path, indent=4)
kwargs1 = writer._get_saving_kwargs()
writer.save_data(df)
reader = PandasJsonReader(filepath_or_buffer=file_path, encoding="utf-8")
kwargs2 = reader._get_loading_kwargs()
df2, metadata = reader.load_data(pd.DataFrame)
assert PandasJsonReader.applicable_types() == [pd.DataFrame]
assert PandasJsonWriter.applicable_types() == [pd.DataFrame]
assert kwargs1["indent"] == 4
assert kwargs2["encoding"] == "utf-8"
assert df.equals(df2)
@pytest.mark.parametrize(
"conn",
[
sqlite3.connect(":memory:"),
create_engine("sqlite://"),
],
)
def test_pandas_sql(df: pd.DataFrame, conn: Union[str, sqlite3.Connection]) -> None:
writer = PandasSqlWriter(table_name="bar", db_connection=conn)
kwargs1 = writer._get_saving_kwargs()
metadata1 = writer.save_data(df)
reader = PandasSqlReader(query_or_table="SELECT foo FROM bar", db_connection=conn)
kwargs2 = reader._get_loading_kwargs()
df2, metadata2 = reader.load_data(pd.DataFrame)
assert PandasSqlReader.applicable_types() == [pd.DataFrame]
assert PandasSqlWriter.applicable_types() == [pd.DataFrame]
assert kwargs1["if_exists"] == "fail"
assert kwargs2["coerce_float"] is True
assert df.equals(df2)
assert metadata1["sql_metadata"]["rows"] == 1
assert metadata2["sql_metadata"]["rows"] == 1
assert metadata1["dataframe_metadata"]["datatypes"] == ["object"]
if hasattr(conn, "close"):
conn.close()
def test_pandas_xml_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.xml"
reader = PandasXmlReader(path_or_buffer=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasXmlReader.applicable_types() == [pd.DataFrame]
assert df.shape == (4, 4)
def test_pandas_xml_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.xml"
writer = PandasXmlWriter(path_or_buffer=file_path)
metadata = writer.save_data(pd.DataFrame({"foo": ["bar"]}))
assert PandasXmlWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["foo"]
def test_pandas_html_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.html"
reader = PandasHtmlReader(io=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasHtmlReader.applicable_types() == [pd.DataFrame]
assert df[0].shape == (3, 4)
def test_pandas_html_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.xml"
writer = PandasHtmlWriter(buf=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasHtmlWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_stata_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.dta"
reader = PandasStataReader(filepath_or_buffer=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasStataReader.applicable_types() == [pd.DataFrame]
assert df.shape == (4, 4)
def test_pandas_stata_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.dta"
writer = PandasStataWriter(path=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasStataWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_feather_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.feather"
reader = PandasFeatherReader(path=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasFeatherReader.applicable_types() == [pd.DataFrame]
assert df.shape == (4, 3)
def test_pandas_feather_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.dta"
writer = PandasFeatherWriter(path=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasStataWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_csv_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.csv"
reader = PandasCSVReader(path=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasCSVReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]
def test_pandas_csv_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.csv"
writer = PandasCSVWriter(path=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasCSVWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_orc_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.orc"
writer = PandasORCWriter(path=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasORCWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_orc_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.orc"
reader = PandasORCReader(path=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasORCReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]
def test_pandas_excel_writer(tmp_path: pathlib.Path) -> None:
file_path = tmp_path / "test.xlsx"
writer = PandasExcelWriter(path=file_path)
metadata = writer.save_data(pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]}))
assert PandasExcelWriter.applicable_types() == [pd.DataFrame]
assert file_path.exists()
assert metadata["file_metadata"]["path"] == str(file_path)
assert metadata["dataframe_metadata"]["column_names"] == ["col1", "col2"]
def test_pandas_excel_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.xlsx"
reader = PandasExcelReader(path=path_to_test, sheet_name="test_load_from_data_sheet")
df, metadata = reader.load_data(pd.DataFrame)
assert PandasExcelReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]
def test_pandas_table_reader(tmp_path: pathlib.Path) -> None:
path_to_test = "tests/resources/data/test_load_from_data.csv"
reader = PandasTableReader(filepath_or_buffer=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasTableReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]
def test_pandas_spss_reader(tmp_path: pathlib.Path) -> None:
import pyreadstat
path_to_test = "tests/resources/data/test_load_from_data.xlsx"
reader = PandasExcelReader(path=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
pyreadstat.write_sav(df, tmp_path / "test.sav")
reader = PandasSPSSReader(path=tmp_path / "test.sav")
df, metadata = reader.load_data(pd.DataFrame)
assert PandasSPSSReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]
def test_pandas_fwf_reader() -> None:
path_to_test = "tests/resources/data/test_load_from_data.fwf"
reader = PandasFWFReader(filepath_or_buffer=path_to_test)
df, metadata = reader.load_data(pd.DataFrame)
assert PandasFWFReader.applicable_types() == [pd.DataFrame]
assert df.loc[0, "firstName"] == "John"
assert df.shape == (3, 5)
assert metadata["dataframe_metadata"]["column_names"] == [
"firstName",
"lastName",
"age",
"department",
"email",
]