blob: b23137e824e6d47cacefdaeac9963f5c99c19f5c [file]
import pandas as pd
import pytest
from hamilton import driver
from hamilton_code import data_science
@pytest.fixture
def dummy_data():
return pd.DataFrame(
{
"engines": [1, 2, 3],
"crew": [4, 5, 6],
"passenger_capacity": [5, 6, 7],
"price": [120, 290, 30],
}
)
@pytest.fixture
def dummy_inputs():
return {
"test_size": 0.2,
"random_state": 3,
"features": ["engines", "passenger_capacity", "crew"],
}
def test_split_data(dummy_data, dummy_inputs):
results = data_science.split_data(create_model_input_table=dummy_data, **dummy_inputs)
assert len(results["X_train"]) == 2
assert len(results["y_train"]) == 2
assert len(results["X_test"]) == 1
assert len(results["y_test"]) == 1
def test_split_data_missing_price(dummy_data, dummy_inputs):
dummy_data_missing_price = dummy_data.drop(columns="price")
with pytest.raises(KeyError) as e_info:
X_train, X_test, y_train, y_test = data_science.split_data(
create_model_input_table=dummy_data_missing_price, **dummy_inputs
)
assert "price" in str(e_info.value)
def test_data_science_pipeline(dummy_data, dummy_inputs):
dr = driver.Builder().with_modules(data_science).build()
results = dr.execute(
["evaluate_model"],
inputs=dict(
create_model_input_table=dummy_data,
**dummy_inputs,
),
)
assert results["evaluate_model"]