blob: a38b611e828da461c1d2343ff27f61c0d174c0d5 [file] [log] [blame]
import json
import logging
import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from statsmodels.multivariate.manova import MANOVA
import configparser
import csv
log = logging.getLogger(__name__)
def linear_regression(input_row, data, categorical):
    """Score ``input_row`` against the dataset with a linear-regression classifier.

    The row is appended to ``data``, categorical columns are label-encoded,
    remaining columns are z-score standardized, and an 80/20 train/test split
    is fit with ``LinearRegression``; predictions are thresholded at 0.5.

    Args:
        input_row: list of values matching ``data``'s column order.
        data: pandas DataFrame; must contain a 'job' column and a column
            labeled 10 that is used as the target.
        categorical: column labels to label-encode (and skip standardizing).

    Returns:
        dict with "method" ("LinReg"), "color" ("red"/"green" from the last
        test prediction, i.e. the appended row) and "prediction" (test
        accuracy in percent).
    """
    log.debug("linear_regression input_row: %s", input_row)
    data = data.dropna()
    # Append the row to score so it is encoded/standardized together with the
    # training data and lands at the end of the test split.
    data.loc[len(data)] = input_row
    le = LabelEncoder()
    for col in categorical:
        data[col] = le.fit_transform(data[col])
    data['job'] = data['job'].astype('int')
    # NOTE(review): the target column 10 is z-scored here too, so the accuracy
    # comparison below matches thresholded 0/1 predictions against z-scored
    # labels -- confirm this is intended.
    for col in data.columns:
        if col not in categorical:
            data[col] = (data[col] - np.mean(data[col])) / np.std(data[col])
    split_idx = int(len(data) * 0.8)
    data_train = data[:split_idx]
    data_test = data[split_idx:]
    y_train = data_train[10]
    x_train = data_train.loc[:, data_train.columns != 10]
    y_test = np.array(data_test[10])
    x_test = data_test.loc[:, data_test.columns != 10]
    reg = LinearRegression().fit(x_train, y_train)
    # Threshold the regression output into a binary prediction (strictly
    # greater than 0.5 -> 1, matching the original loop).
    predictions = (reg.predict(x_test) > 0.5).astype(float)
    accuracy = float(np.mean(predictions == y_test))
    # The appended input_row is the final test sample; its prediction drives
    # the color flag.
    color = "red" if predictions[-1] < 0.5 else "green"
    return {"method": "LinReg", "color": color, "prediction": 100 * accuracy}
def polynomial_regression(input_row, data, categorical):
    """Classify ``input_row`` with a degree-2 polynomial regression.

    The row is appended to ``data``, categorical columns are label-encoded,
    remaining columns are z-score standardized, and a 70/30 split is fit with
    a PolynomialFeatures + LinearRegression pipeline; predictions are
    thresholded at 0.5.

    Returns a dict with "method" ("PolyReg"), "color" ("red"/"green" from the
    last test prediction) and "prediction" (test accuracy in percent).
    """
    frame = data.dropna()
    # Append the row to score; it ends up last in the test split.
    frame.loc[len(frame)] = input_row
    encoder = LabelEncoder()
    for name in categorical:
        frame[name] = encoder.fit_transform(frame[name])
    # z-score every non-categorical column (population std).
    for name in frame.columns:
        if name not in categorical:
            frame[name] = (frame[name] - np.mean(frame[name])) / np.std(frame[name])
    cut = int(len(frame) * 0.7)
    train = frame[:cut]
    test = frame[cut:]
    feature_mask = frame.columns != 10
    model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                      ('linear', LinearRegression(fit_intercept=False))])
    fitted = model.fit(train.loc[:, feature_mask], train[10])
    # Strictly greater than 0.5 -> 1.0, otherwise 0.0.
    labels = np.where(fitted.predict(test.loc[:, feature_mask]) > 0.5, 1.0, 0.0)
    truth = np.array(test[10])
    hits = 0
    for guess, actual in zip(labels, truth):
        if actual == guess:
            hits += 1
    accuracy = hits / len(labels)
    color = "green" if labels[-1] >= 0.5 else "red"
    return {"method": "PolyReg", "color": color, "prediction": 100 * accuracy}
def manova(test_row, data, categorical):
    """Compare Wilks' lambda of the good/bad risk populations with and
    without ``test_row`` folded in.

    Args:
        test_row: list of values matching ``data``'s column order.
        data: pandas DataFrame; column 10 is the 0/1 risk label, column 9 is
            used as the exogenous variable, the rest are endogenous.
        categorical: column labels to label-encode (and skip standardizing).

    Returns:
        dict with "method" ("MANOVA") and Wilks' lambda for each population
        before ("WL_good"/"WL_bad") and after ("WL_test_good"/"WL_test_bad")
        adding the test row.
    """
    data = data.dropna()
    data.loc[len(data)] = test_row
    le = LabelEncoder()
    for col in categorical:
        data[col] = le.fit_transform(data[col])
    for col in data.columns:
        if col not in categorical:
            data[col] = (data[col] - np.mean(data[col])) / np.std(data[col])
    # Pull the (now encoded/standardized) test row back out of the dataset.
    test_row = data.iloc[len(data) - 1]
    # BUG FIX: DataFrame.drop returns a new frame -- the original discarded
    # the result, so the test row stayed inside the populations below.
    data = data.drop([len(data) - 1])
    data_good = data[data[10] == 0]
    data_bad = data[data[10] == 1]
    x_good = data_good.drop([10, 9], axis=1)
    y_good = data_good[[9]]
    x_bad = data_bad.drop([10, 9], axis=1)
    y_bad = data_bad[[9]]
    # Wilks' lambda is the first row of the MANOVA stat table; the other
    # rows (Pillai, Hotelling-Lawley, Roy) were extracted but unused.
    out_good = np.array(MANOVA(endog=x_good, exog=y_good).mv_test()['x0']['stat'])
    out_bad = np.array(MANOVA(endog=x_bad, exog=y_bad).mv_test()['x0']['stat'])
    WL_good = out_good[0][0]
    WL_bad = out_bad[0][0]
    x = test_row.drop([10, 9])
    y = test_row[[9]]
    # DataFrame.append was removed in pandas 2.0; pd.concat with the row
    # transposed to a one-row frame is the compatible equivalent.
    test_x = pd.concat([x_good, x.to_frame().T])
    test_y = pd.concat([y_good, y.to_frame().T])
    out_test = np.array(MANOVA(endog=test_x, exog=test_y).mv_test()['x0']['stat'])
    WL_test_good = out_test[0][0]
    test_x = pd.concat([x_bad, x.to_frame().T])
    test_y = pd.concat([y_bad, y.to_frame().T])
    out_test = np.array(MANOVA(endog=test_x, exog=test_y).mv_test()['x0']['stat'])
    WL_test_bad = out_test[0][0]
    return {"method": "MANOVA", "WL_good": WL_good,
            "WL_test_good": WL_test_good, "WL_bad": WL_bad,
            "WL_test_bad": WL_test_bad}
def stat_score(input_row, model_type):
    """Load the German credit dataset and score ``input_row`` with the
    requested model.

    Args:
        input_row: list of feature values in the dataset's column order.
        model_type: one of 'manova', 'linearRegression',
            'polynomialRegression'.

    Returns:
        The selected model's result dict, or None when the model type is
        unknown or the model raises (best-effort contract preserved from the
        original implementation).
    """
    df = pd.read_csv('zoo/data/german.csv', index_col=0)
    data = df.drop(columns=['Saving accounts', 'Checking account', 'Risk'])
    # Normalize string column names to lowercase snake_case; non-string
    # (e.g. integer) labels are kept as-is.  Replaces the original
    # dict round-trip through to_dict()/from_dict().
    data = data.rename(
        columns=lambda c: c.lower().replace(' ', '_') if isinstance(c, str) else c)
    cols = data.columns
    num_cols = data._get_numeric_data().columns
    # Categorical columns = everything that is not numeric.
    categorical = list(set(cols) - set(num_cols))
    log.debug("categorical=%s input_row=%s", categorical, input_row)
    dispatch = {
        'manova': manova,
        'linearRegression': linear_regression,
        'polynomialRegression': polynomial_regression,
    }
    model = dispatch.get(model_type)
    if model is None:
        # Previously an unknown model_type raised UnboundLocalError inside
        # the try and was silently swallowed; make the outcome explicit.
        log.debug("Unknown model_type: %s", model_type)
        return None
    try:
        return model(input_row, data, categorical)
    except Exception:
        # Keep the original return-None behavior, but record the full
        # traceback instead of only the exception message.
        log.exception("stat_score failed for model_type=%s", model_type)
        return None