blob: 223f8e614dc7981087faad145f68917f81498d9e [file]
from typing import Type
import pandas as pd
from hamilton.data_quality.base import BaseDefaultValidator, DataValidator, ValidationResult
class SampleDataValidator1(BaseDefaultValidator):
    """Sample validator that passes when an ``int`` equals a configured value."""

    def __init__(self, equal_to: int, importance: str):
        """Store the expected value and forward ``importance`` to the base class.

        :param equal_to: Value the validated data must equal.
        :param importance: Importance level, passed through to the base validator.
        """
        super().__init__(importance=importance)
        self.equal_to = equal_to

    @classmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """Return True only when ``datatype`` is ``int``."""
        return datatype == int

    def description(self) -> str:
        """Return a human-readable statement of the rule this instance enforces."""
        # Report the configured value rather than a hard-coded 10 so the
        # description always agrees with what validate() checks.
        return f"Data must be equal to {self.equal_to} to be valid"

    @classmethod
    def name(cls) -> str:
        """Return the identifier string of this validator."""
        return "dummy_data_validator_1"

    def validate(self, dataset: int) -> ValidationResult:
        """Check ``dataset`` against ``self.equal_to``.

        :param dataset: The value to validate.
        :return: A ValidationResult whose ``passes`` flag is True when the value
            equals ``self.equal_to``, with a message stating the outcome.
        """
        # Compare against the configured value; the message below already
        # reports self.equal_to, so the check must use the same number.
        passes = dataset == self.equal_to
        return ValidationResult(
            passes=passes,
            message=f"Data value: {dataset} {'does' if passes else 'does not'} equal {self.equal_to}",
        )

    @classmethod
    def arg(cls) -> str:
        """Return the name of the constructor argument that configures this validator.

        NOTE(review): presumably the keyword the framework uses to match this
        default validator -- confirm against BaseDefaultValidator.
        """
        return "equal_to"

    def __eq__(self, other):
        """Two validators are equal when they share a class and ``equal_to``."""
        return isinstance(other, self.__class__) and self.equal_to == other.equal_to
class SampleDataValidator2(DataValidator):
    """Sample validator that passes when a series has exactly a configured length."""

    def __init__(self, dataset_length: int, importance: str):
        """Store the required length and forward ``importance`` to the base class.

        :param dataset_length: Exact length the validated series must have.
        :param importance: Importance level, passed through to the base validator.
        """
        super().__init__(importance=importance)
        self.dataset_length = dataset_length

    def description(self) -> str:
        """Return a human-readable statement of the rule this instance enforces."""
        return f"series must have length {self.dataset_length} to be valid"

    @classmethod
    def name(cls) -> str:
        """Return the identifier string of this validator."""
        return "dummy_data_validator_2"

    def validate(self, dataset: pd.Series) -> ValidationResult:
        """Check that ``len(dataset)`` equals ``self.dataset_length``.

        :param dataset: The series to validate.
        :return: A ValidationResult with the pass/fail flag, a message, and
            diagnostics holding the required and the actual length.
        """
        # Measure once; the value is used for the check, the message and the
        # diagnostics.
        actual_length = len(dataset)
        passes = actual_length == self.dataset_length
        outcome = "passes" if passes else "does not pass"
        return ValidationResult(
            passes=passes,
            # The required length is wrapped in a matched pair of quotes.
            message=(
                f"Dataset length is: {actual_length}. This {outcome}. "
                f"It must be exactly '{self.dataset_length}'"
            ),
            diagnostics={
                "dataset_length_must_be": self.dataset_length,
                "dataset_length_is": actual_length,
            },
        )

    @classmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """Return True only when ``datatype`` is ``pd.Series``."""
        return datatype == pd.Series

    @classmethod
    def arg(cls) -> str:
        """Return the name of the constructor argument that configures this validator."""
        return "dataset_length"

    def __eq__(self, other):
        """Two validators are equal when they share a class and ``dataset_length``."""
        return isinstance(other, self.__class__) and self.dataset_length == other.dataset_length
class SampleDataValidator3(DataValidator):
    """Sample validator that passes when a series has a configured dtype."""

    def __init__(self, dtype: type, importance: str):
        """Store the required dtype and forward ``importance`` to the base class.

        :param dtype: The dtype the validated series must have.
        :param importance: Importance level, passed through to the base validator.
        """
        super().__init__(importance=importance)
        self.dtype = dtype

    def description(self) -> str:
        """Return a human-readable statement of the rule this instance enforces."""
        return f"Series dtype must be {self.dtype} to be valid"

    @classmethod
    def name(cls) -> str:
        """Return the identifier string of this validator."""
        return "dummy_data_validator_3"

    def validate(self, dataset: pd.Series) -> ValidationResult:
        """Check that ``dataset.dtype`` equals ``self.dtype``.

        :param dataset: The series to validate.
        :return: A ValidationResult with the pass/fail flag, a message, and
            diagnostics holding the required and the actual dtype as strings.
        """
        # Read the dtype once; it feeds the check, the message and the
        # diagnostics.
        actual_dtype = dataset.dtype
        passes = actual_dtype == self.dtype
        outcome = "passes" if passes else "does not pass"
        return ValidationResult(
            passes=passes,
            # The required dtype is wrapped in a matched pair of quotes.
            message=(
                f"Dataset type is: {actual_dtype}. This {outcome}. "
                f"It must be '{self.dtype}'"
            ),
            diagnostics={"required_dtype": str(self.dtype), "actual_dtype": str(actual_dtype)},
        )

    @classmethod
    def applies_to(cls, datatype: Type[Type]) -> bool:
        """Return True only when ``datatype`` is ``pd.Series``."""
        return datatype == pd.Series

    @classmethod
    def arg(cls) -> str:
        """Return the name of the constructor argument that configures this validator."""
        return "dtype"

    def __eq__(self, other):
        """Two validators are equal when they share a class and ``dtype``."""
        return isinstance(other, self.__class__) and self.dtype == other.dtype
# The sample validator classes defined in this module, gathered in one list.
DUMMY_VALIDATORS_FOR_TESTING = [
    SampleDataValidator1,
    SampleDataValidator2,
    SampleDataValidator3,
]