blob: 5c673e5ed91a740ba98ce81cd4604f371ab76297 [file] [log] [blame]
from typing import List
import pandas as pd
from scipy import stats
from hamilton.function_modifiers import extract_fields
def raw_data() -> str:
s = (
"""31 weeks < 0.1% < 0.1% > 99.9%
31 weeks, 1 day < 0.1% < 0.1% > 99.9%
31 weeks, 2 days < 0.1% < 0.1% > 99.9%
31 weeks, 3 days < 0.1% < 0.1% > 99.9%
31 weeks, 4 days 0.1% < 0.1% 99.9%
31 weeks, 5 days 0.1% < 0.1% 99.9%
31 weeks, 6 days 0.1% < 0.1% 99.9%
32 weeks 0.1% < 0.1% 99.9%
32 weeks, 1 day 0.1% < 0.1% 99.9%
32 weeks, 2 days 0.1% < 0.1% 99.9%
32 weeks, 3 days 0.1% < 0.1% 99.8%
32 weeks, 4 days 0.2% < 0.1% 99.8%
32 weeks, 5 days 0.2% < 0.1% 99.8%
32 weeks, 6 days 0.2% < 0.1% 99.7%
33 weeks 0.3% < 0.1% 99.7%
33 weeks, 1 day 0.3% < 0.1% 99.6%
33 weeks, 2 days 0.4% 0.1% 99.6%
33 weeks, 3 days 0.4% 0.1% 99.5%
33 weeks, 4 days 0.5% 0.1% 99.4%
33 weeks, 5 days 0.6% 0.1% 99.3%
33 weeks, 6 days 0.7% 0.1% 99.2%
34 weeks 0.8% 0.1% 99.1%
34 weeks, 1 day 0.9% 0.1% 99%
34 weeks, 2 days 1% 0.1% 98.9%
34 weeks, 3 days 1.2% 0.1% 98.7%
34 weeks, 4 days 1.3% 0.2% 98.5%
34 weeks, 5 days 1.5% 0.2% 98.3%
34 weeks, 6 days 1.7% 0.2% 98.1%
35 weeks 2% 0.2% 97.8%
35 weeks, 1 day 2.2% 0.3% 97.5%
35 weeks, 2 days 2.5% 0.3% 97.2%
35 weeks, 3 days 2.9% 0.3% 96.8%
35 weeks, 4 days 3.2% 0.4% 96.4%
35 weeks, 5 days 3.6% 0.4% 96%
35 weeks, 6 days 4.1% 0.4% 95.5%
36 weeks 4.6% 0.5% 95%
36 weeks, 1 day 5.1% 0.5% 94.4%
36 weeks, 2 days 5.7% 0.6% 93.7%
36 weeks, 3 days 6.3% 0.6% 93%
36 weeks, 4 days 7% 0.7% 92.2%
36 weeks, 5 days 7.8% 0.8% 91.4%
36 weeks, 6 days 8.7% 0.8% 90.5%
37 weeks 9.6% 0.9% 89.5%
37 weeks, 1 day 10.6% 1% 88.5%
37 weeks, 2 days 11.6% 1.1% 87.3%
37 weeks, 3 days 12.8% 1.1% 86.1%
37 weeks, 4 days 14% 1.2% 84.8%
37 weeks, 5 days 15.3% 1.3% 83.4%
37 weeks, 6 days 16.7% 1.4% 81.8%
38 weeks 18.3% 1.5% 80.2%
38 weeks, 1 day 19.9% 1.6% 78.5%
38 weeks, 2 days 21.6% 1.7% 76.7%
38 weeks, 3 days 23.4% 1.8% 74.8%
38 weeks, 4 days 25.3% 1.9% 72.8%
38 weeks, 5 days 27.4% 2% 70.6%
38 weeks, 6 days 29.5% 2.1% 68.4%
39 weeks 31.7% 2.2% 66%
39 weeks, 1 day 34.1% 2.4% 63.5%
39 weeks, 2 days 36.6% 2.5% 61%
39 weeks, 3 days 39.1% 2.6% 58.3%
39 weeks, 4 days 41.8% 2.7% 55.5%
39 weeks, 5 days 44.6% 2.8% 52.6%
39 weeks, 6 days 47.5% 2.9% 49.6%
40 weeks 50.5% 3% 46.5%
40 weeks, 1 day 53.6% 3.1% 43.4%
40 weeks, 2 days 56.7% 3.2% 40.1%
40 weeks, 3 days 59.9% 3.2% 36.8%
40 weeks, 4 days 63.2% 3.3% 33.5%
40 weeks, 5 days 66.5% 3.3% 30.2%
40 weeks, 6 days 69.8% 3.3% 26.9%
41 weeks 73.1% 3.3% 23.6%
41 weeks, 1 day 76.3% 3.2% 20.4%
41 weeks, 2 days 79.5% 3.1% 17.4%
41 weeks, 3 days 82.4% 3% 14.6%
41 weeks, 4 days 85.2% 2.8% 12%
41 weeks, 5 days 87.8% 2.6% 9.6%
41 weeks, 6 days 90.1% 2.3% 7.5%
42 weeks 92.2% 2% 5.8%
42 weeks, 1 day 93.9% 1.8% 4.3%
42 weeks, 2 days 95.4% 1.5% 3.1%
42 weeks, 3 days 96.6% 1.2% 2.2%
42 weeks, 4 days 97.6% 0.9% 1.5%
42 weeks, 5 days 98.3% 0.7% 1%
42 weeks, 6 days 98.8% 0.5% 0.6%
43 weeks 99.2% 0.4% 0.4%
43 weeks, 1 day 99.5% 0.3% 0.2%
43 weeks, 2 days 99.7% 0.2% 0.1%
43 weeks, 3 days 99.8% 0.1% 0.1%
43 weeks, 4 days 99.9% 0.1% < 0.1%
43 weeks, 5 days 99.9% < 0.1% < 0.1%
43 weeks, 6 days > 99.9% < 0.1%""".replace("weeks\t", "weeks, 0 days ")
.replace("\t", " ")
.split("\n")
)
return s
def raw_probabilities(raw_data: str) -> pd.DataFrame:
# filter out the outliers, we'll want them back in at some point but we'll let the fitting do the job
raw_data = [item for item in raw_data if "<" not in item and ">" not in item]
weeks = [
int(item.split(",")[0].split()[0])
for item in raw_data
if "<" not in raw_data and ">" not in raw_data
]
days = [int(item.split(", ")[1].split()[0]) for item in raw_data]
probability = [float(item.split()[5].replace("%", "")) / 100 for item in raw_data]
probabilities_data = [
(week * 7 + day, probability) for week, day, probability in zip(weeks, days, probability)
]
probabilities_df = pd.DataFrame(probabilities_data)
probabilities_df.columns = ["days", "probability"]
return probabilities_df # .set_index("days")
def resampled(raw_probabilities: pd.DataFrame) -> List[int]:
sample_data = []
for _idx, row in raw_probabilities.iterrows():
count = row.probability * 1000
sample_data.extend([row.days] * int(count))
return sample_data
@extract_fields(fields={"a": float, "loc": float, "scale": float})
def distribution_params(resampled: list[int]) -> dict[str, float]:
a, loc, scale = stats.skewnorm.fit(resampled)
return {"a": a, "loc": loc, "scale": scale}
def distribution(distribution_params: dict[str, float]) -> stats.rv_continuous:
return stats.skewnorm(**distribution_params)