blob: 465e56c8239cb03f3c292669e80218568c86e18e [file] [log] [blame]
import modin.pandas as pd
def base_table(tablename: str, db_connection: object) -> pd.DataFrame:
# Select Data Source form configured DB
df = pd.read_sql(tablename, con=db_connection)
return df
def filtered_table(base_table: pd.DataFrame) -> pd.DataFrame:
# keep columns that have more than 50% non-null values
keep = base_table.columns[
(((base_table.isnull().sum() / base_table.shape[0])) * 100 < 50).values
]
return base_table[keep]
def cleaned_table(filtered_table: pd.DataFrame) -> pd.DataFrame:
upper_bound = filtered_table.annual_inc.quantile(0.95)
lower_bound = filtered_table.annual_inc.quantile(0.05)
no_outliers = filtered_table[
(filtered_table.annual_inc < upper_bound) & (filtered_table.annual_inc > lower_bound)
]
print("{} outliers removed".format(filtered_table.shape[0] - no_outliers.shape[0]))
return no_outliers
def final_table(cleaned_table: pd.DataFrame) -> pd.DataFrame:
return pd.get_dummies(cleaned_table, columns="grade")