blob: 6b41fc2d1c652623b6003d89a787299d806359d4 [file] [log] [blame]
{
"nbformat": 4,
"nbformat_minor": 2,
"metadata": {
"colab": {
"name": "Untitled8.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"metadata": {
"interpreter": {
"hash": "434fba307fd1171c9cfc17821a2afcf8929f30379beeaab9e3fdf8c6db2d1c93"
}
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"source": [
"import joblib\r\n",
"import numpy as np\r\n",
"import pandas as pd\r\n",
"\r\n",
"from sklearn.preprocessing import LabelEncoder\r\n",
"from sklearn.preprocessing import StandardScaler\r\n",
"from sklearn.model_selection import train_test_split\r\n",
"\r\n",
"test_size = 0.20\r\n",
"\r\n",
"processed_data = None\r\n",
"categorical = None\r\n",
"label_encoders = {}\r\n",
"\r\n",
"def preprocessing(dataset, data, test_size, random_state=None):\r\n",
"    \"\"\"\r\n",
"    Preprocess a credit dataset and split it into train and test sets.\r\n",
"\r\n",
"    Column names are converted to lower snake case, rows containing nulls\r\n",
"    are dropped, non-numeric columns are label-encoded and numeric feature\r\n",
"    columns are z-scored. The processed frame is stored in the module-level\r\n",
"    ``processed_data``; the encoded column names and their fitted encoders\r\n",
"    are stored in ``categorical`` and ``label_encoders``.\r\n",
"\r\n",
"    Parameters\r\n",
"    ----------\r\n",
"    dataset: str\r\n",
"        One of \"German\", \"Australian\", \"Japanese\", \"Taiwan\" or \"Polish\".\r\n",
"    data: DataFrame\r\n",
"        Pandas dataframe containing the raw dataset.\r\n",
"    test_size: float or int\r\n",
"        Passed unchanged to ``train_test_split``.\r\n",
"    random_state: int, optional\r\n",
"        Seed for the train/test split. The default (None) keeps the\r\n",
"        previous behaviour of drawing a different split on every call.\r\n",
"\r\n",
"    Returns\r\n",
"    -------\r\n",
"    tuple\r\n",
"        ``(x_train, x_test, y_train, y_test)``. The x values are arrays\r\n",
"        scaled by a StandardScaler fitted on the training rows only,\r\n",
"        ``y_train`` is a single-column DataFrame and ``y_test`` a Series.\r\n",
"\r\n",
"    Raises\r\n",
"    ------\r\n",
"    ValueError\r\n",
"        If ``dataset`` is not one of the supported names.\r\n",
"    \"\"\"\r\n",
"\r\n",
"    global processed_data\r\n",
"    global categorical\r\n",
"    global label_encoders\r\n",
"\r\n",
"    # Target column per dataset. German has no fixed name here: its target is\r\n",
"    # the last column of the frame and is resolved after preprocessing.\r\n",
"    target_columns = {\"German\": None,\r\n",
"                      \"Australian\": 14,\r\n",
"                      \"Japanese\": 15,\r\n",
"                      \"Taiwan\": 'default_payment_next_month',\r\n",
"                      \"Polish\": 'class'}\r\n",
"    if dataset not in target_columns:\r\n",
"        # Fail early instead of hitting an unbound x/y after all the work\r\n",
"        raise ValueError(f\"Unknown dataset: {dataset!r}\")\r\n",
"\r\n",
"    # Reset global variables\r\n",
"    processed_data = None\r\n",
"    categorical = None\r\n",
"    label_encoders = {}\r\n",
"\r\n",
"    if dataset == \"German\":\r\n",
"        # Drop savings account and checkings account columns as they contain a lot\r\n",
"        # of NaN values and may not always be available in real life scenarios\r\n",
"        data = data.drop(columns=['Saving accounts', 'Checking account'])\r\n",
"\r\n",
"    # Rename columns (lowercase and snake case); non-string labels are kept\r\n",
"    data = data.rename(\r\n",
"        columns=lambda name: name.lower().replace(' ', '_') if isinstance(name, str) else name)\r\n",
"\r\n",
"    # Every non-numeric column is treated as categorical. Column order is\r\n",
"    # kept so the saved list is the same on every run.\r\n",
"    numeric_cols = set(data.select_dtypes(include=['number', 'bool']).columns)\r\n",
"    categorical = [col for col in data.columns if col not in numeric_cols]\r\n",
"\r\n",
"    # Drop null rows; copy so the assignments below write to an independent\r\n",
"    # frame rather than to a possible view of the caller's data\r\n",
"    data = data.dropna().copy()\r\n",
"\r\n",
"    # Encode text columns to number values\r\n",
"    for category in categorical:\r\n",
"        le = LabelEncoder()\r\n",
"        data[category] = le.fit_transform(data[category])\r\n",
"        label_encoders[category] = le\r\n",
"\r\n",
"    target_col = target_columns[dataset]\r\n",
"    if target_col is None:\r\n",
"        target_col = data.columns[-1]\r\n",
"\r\n",
"    # Z-score the numeric feature columns. The target is skipped so that a\r\n",
"    # numeric label keeps its class values for the int cast below.\r\n",
"    for col in data.columns:\r\n",
"        if col in categorical or col == target_col:\r\n",
"            continue\r\n",
"        values = data[col].astype('float')\r\n",
"        centred = values - np.mean(values)\r\n",
"        spread = np.std(values)\r\n",
"        # A constant column has zero spread; leave it centred instead of\r\n",
"        # dividing 0 by 0 and filling the column with NaN\r\n",
"        data[col] = centred / spread if spread else centred\r\n",
"\r\n",
"    processed_data = data\r\n",
"\r\n",
"    # Get Training parameters\r\n",
"    x = data.drop(columns=[target_col])\r\n",
"    y = data[target_col].astype('int')\r\n",
"\r\n",
"    x_train, x_test, y_train, y_test = train_test_split(\r\n",
"        x, y, test_size=test_size, random_state=random_state)\r\n",
"    # The training labels are returned as a single-column DataFrame\r\n",
"    y_train = pd.DataFrame(y_train)\r\n",
"\r\n",
"    # Fit the scaler on the training rows only, then apply it to both splits.\r\n",
"    # NOTE(review): the fitted scaler is local and is not returned or saved;\r\n",
"    # confirm how inference code scales its inputs.\r\n",
"    sc = StandardScaler()\r\n",
"    x_train = sc.fit_transform(x_train)\r\n",
"    x_test = sc.transform(x_test)\r\n",
"\r\n",
"    return (x_train, x_test, y_train, y_test)\r\n"
],
"outputs": [],
"metadata": {
"id": "Z5EFqHxB-zEn"
}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"from sklearn.svm import SVC\r\n",
"from sklearn.neural_network import MLPClassifier\r\n",
"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\r\n",
"from sklearn.model_selection import GridSearchCV, ShuffleSplit\r\n",
"from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score\r\n",
"\r\n",
"class Model(object):\r\n",
"    \"\"\"\r\n",
"    Basic Scorecard Model\r\n",
"\r\n",
"    Wraps a classifier in a GridSearchCV whose cross-validation splits come\r\n",
"    from ShuffleSplit.\r\n",
"\r\n",
"    Warning: This class should not be used directly. Use derived classes\r\n",
"    instead.\r\n",
"    \"\"\"\r\n",
"\r\n",
"    def __init__(self,\r\n",
"                 classifier=None,\r\n",
"                 test_size=test_size,\r\n",
"                 n_splits=1,\r\n",
"                 random_state=None,\r\n",
"                 n_jobs=None,\r\n",
"                 params=None):\r\n",
"        \"\"\"\r\n",
"        Build the grid search around ``classifier``.\r\n",
"\r\n",
"        Args:\r\n",
"            classifier:\r\n",
"                estimator handed to GridSearchCV\r\n",
"            test_size:\r\n",
"                validation share used by ShuffleSplit\r\n",
"            n_splits:\r\n",
"                number of ShuffleSplit iterations\r\n",
"            random_state:\r\n",
"                seed for ShuffleSplit\r\n",
"            n_jobs:\r\n",
"                number of parallel jobs for GridSearchCV\r\n",
"            params:\r\n",
"                parameter grid for GridSearchCV\r\n",
"        \"\"\"\r\n",
"\r\n",
"        self.classifier = classifier\r\n",
"        self.params = params\r\n",
"        self.random_state = random_state\r\n",
"        self.test_size = test_size\r\n",
"        self.n_splits = n_splits\r\n",
"        self.n_jobs = n_jobs\r\n",
"\r\n",
"        # Use the caller's seed for the CV splitter so the value reported by\r\n",
"        # __str__ is the one actually in effect (it used to be fixed at 0).\r\n",
"        self.model = GridSearchCV(estimator=classifier,\r\n",
"                                  param_grid=params,\r\n",
"                                  n_jobs=n_jobs,\r\n",
"                                  cv=ShuffleSplit(test_size=test_size,\r\n",
"                                                  n_splits=n_splits,\r\n",
"                                                  random_state=random_state))\r\n",
"\r\n",
"    def __str__(self):\r\n",
"        return f\"\"\"\r\n",
" Model Object\r\n",
" ----------------------------------------------------------------\r\n",
"\r\n",
" Classifier: {self.classifier.__class__.__name__}\r\n",
" Test Size: {self.test_size}\r\n",
" Random State: {self.random_state}\r\n",
" Number of Splits: {self.n_splits}\r\n",
" Parameter Grid: {self.params}\r\n",
"\r\n",
" {self.model}\r\n",
" \"\"\"\r\n",
"\r\n",
"    def __repr__(self):\r\n",
"        return self.__str__()\r\n",
"\r\n",
"    def train(self, x_train, y_train):\r\n",
"        \"\"\"\r\n",
"        Train scorecard model\r\n",
"\r\n",
"        Args:\r\n",
"            x_train:\r\n",
"                array of training parameters\r\n",
"            y_train:\r\n",
"                pandas dataframe with training labels\r\n",
"\r\n",
"        Returns:\r\n",
"            This object, so calls can be chained.\r\n",
"        \"\"\"\r\n",
"\r\n",
"        # Flatten the label frame to the 1-D array passed to fit\r\n",
"        self.model = self.model.fit(x_train, y_train.values.ravel())\r\n",
"        return self\r\n",
"\r\n",
"    def predict(self, data):\r\n",
"        \"\"\"\r\n",
"        Predict scorecard model\r\n",
"\r\n",
"        Args:\r\n",
"            data: array\r\n",
"                Data to perform prediction on.\r\n",
"        \"\"\"\r\n",
"\r\n",
"        return self.model.predict(data)\r\n",
"\r\n",
"    def accuracy(self, x_test, y_test):\r\n",
"        \"\"\"\r\n",
"        Count the correctly classified test samples\r\n",
"\r\n",
"        Args:\r\n",
"            x_test: array\r\n",
"                The test parameters.\r\n",
"            y_test: array\r\n",
"                The labels\r\n",
"\r\n",
"        Returns:\r\n",
"            The number of correct predictions (accuracy_score is called\r\n",
"            with normalize=False). metrics() reports the fraction instead.\r\n",
"        \"\"\"\r\n",
"\r\n",
"        y_pred = self.predict(x_test)\r\n",
"        return accuracy_score(y_test, y_pred, normalize=False)\r\n",
"\r\n",
"    def metrics(self, x_test, y_test):\r\n",
"        \"\"\"\r\n",
"        Compute scorecard model metrics\r\n",
"\r\n",
"        Args:\r\n",
"            x_test: array\r\n",
"                The test parameters.\r\n",
"            y_test: array\r\n",
"                The labels\r\n",
"\r\n",
"        Returns:\r\n",
"            dict with the accuracy fraction and the macro-averaged F1,\r\n",
"            recall and precision scores.\r\n",
"        \"\"\"\r\n",
"\r\n",
"        y_pred = self.predict(x_test)\r\n",
"\r\n",
"        accuracy = accuracy_score(y_test, y_pred, normalize=True)\r\n",
"        f1 = f1_score(y_test, y_pred, average=\"macro\")\r\n",
"        recall = recall_score(y_test, y_pred, average=\"macro\")\r\n",
"        precision = precision_score(y_test, y_pred, average=\"macro\")\r\n",
"\r\n",
"        return {\"accuracy\" : accuracy,\r\n",
"                \"f1_score\" : f1,\r\n",
"                \"recall_score\" : recall,\r\n",
"                \"precision_score\": precision}\r\n",
"\r\n",
"class RandomForest(Model):\r\n",
"    \"\"\"Scorecard model that grid-searches a RandomForestClassifier.\"\"\"\r\n",
"\r\n",
"    def __init__(self,\r\n",
"                 classifier=None,\r\n",
"                 test_size=test_size,\r\n",
"                 n_splits=1,\r\n",
"                 random_state=0,\r\n",
"                 n_jobs=None,\r\n",
"                 params=None):\r\n",
"        # Build the defaults per instance: a default written in the signature\r\n",
"        # is created once at definition time and shared by every instance.\r\n",
"        if classifier is None:\r\n",
"            classifier = RandomForestClassifier()\r\n",
"        if params is None:\r\n",
"            params = {'n_estimators' : [20, 30, 40], 'random_state' : [0]}\r\n",
"        super().__init__(classifier,\r\n",
"                         test_size,\r\n",
"                         n_splits,\r\n",
"                         random_state,\r\n",
"                         n_jobs,\r\n",
"                         params)\r\n",
"\r\n",
"class SVC(Model):\r\n",
"    \"\"\"Scorecard model that grid-searches a polynomial-kernel SVC.\"\"\"\r\n",
"\r\n",
"    def __init__(self,\r\n",
"                 classifier=None,\r\n",
"                 test_size=test_size,\r\n",
"                 n_splits=1,\r\n",
"                 random_state=0,\r\n",
"                 n_jobs=None,\r\n",
"                 params=None):\r\n",
"        if classifier is None:\r\n",
"            # This wrapper has the same name as scikit-learn's SVC and rebinds\r\n",
"            # the module-level name, so import the estimator under an alias.\r\n",
"            from sklearn.svm import SVC as SklearnSVC\r\n",
"            classifier = SklearnSVC()\r\n",
"        if params is None:\r\n",
"            # Fresh grid per instance rather than one dict shared by all\r\n",
"            params = {'kernel' : ['poly'], 'degree' : [2, 3, 4]}\r\n",
"        # Zero-argument super() does not depend on what the name SVC is bound to\r\n",
"        super().__init__(classifier,\r\n",
"                         test_size,\r\n",
"                         n_splits,\r\n",
"                         random_state,\r\n",
"                         n_jobs,\r\n",
"                         params)\r\n",
"\r\n",
"class MLP(Model):\r\n",
"    \"\"\"Scorecard model that grid-searches an MLPClassifier.\"\"\"\r\n",
"\r\n",
"    def __init__(self,\r\n",
"                 classifier=None,\r\n",
"                 test_size=test_size,\r\n",
"                 n_splits=1,\r\n",
"                 random_state=0,\r\n",
"                 n_jobs=-1,\r\n",
"                 params=None):\r\n",
"        # Build the defaults per instance: a default written in the signature\r\n",
"        # is created once at definition time and shared by every instance.\r\n",
"        if classifier is None:\r\n",
"            classifier = MLPClassifier()\r\n",
"        if params is None:\r\n",
"            params = {'hidden_layer_sizes' : [(100, 50 ,10)],\r\n",
"                      'max_iter' : [500],\r\n",
"                      'activation' : ['relu'],\r\n",
"                      'solver' : ['adam'],\r\n",
"                      'random_state' : [1]}\r\n",
"        super().__init__(classifier,\r\n",
"                         test_size,\r\n",
"                         n_splits,\r\n",
"                         random_state,\r\n",
"                         n_jobs,\r\n",
"                         params)\r\n",
"\r\n",
"class GradientBoost(Model):\r\n",
"    \"\"\"Scorecard model that grid-searches a GradientBoostingClassifier.\"\"\"\r\n",
"\r\n",
"    def __init__(self,\r\n",
"                 classifier=None,\r\n",
"                 test_size=test_size,\r\n",
"                 n_splits=1,\r\n",
"                 random_state=0,\r\n",
"                 n_jobs=None,\r\n",
"                 params=None):\r\n",
"        # Build the defaults per instance: a default written in the signature\r\n",
"        # is created once at definition time and shared by every instance.\r\n",
"        if classifier is None:\r\n",
"            classifier = GradientBoostingClassifier()\r\n",
"        if params is None:\r\n",
"            params = {'n_estimators' : [100, 200, 50],\r\n",
"                      'random_state' : [0],\r\n",
"                      'learning_rate' : [1.0],\r\n",
"                      'max_depth' : [1, 2, 3]}\r\n",
"        super().__init__(classifier,\r\n",
"                         test_size,\r\n",
"                         n_splits,\r\n",
"                         random_state,\r\n",
"                         n_jobs,\r\n",
"                         params)\r\n",
"\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"# GERMAN DATASET\r\n",
"german = pd.read_csv('./zoo/data/german.csv', index_col=0)\r\n",
"x_train, x_test, y_train, y_test = preprocessing(\"German\", german, test_size)\r\n",
"\r\n",
"# Show the label-encoded column names, then the fitted encoders\r\n",
"print(categorical, label_encoders, sep=\"\\n\")\r\n",
"\r\n",
"# Build, fit and score each model in turn\r\n",
"print('\\nRF')\r\n",
"RFmodel = RandomForest()\r\n",
"RFmodel = RFmodel.train(x_train, y_train)\r\n",
"print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nSVC\")\r\n",
"SVCmodel = SVC()\r\n",
"SVCmodel = SVCmodel.train(x_train, y_train)\r\n",
"print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nMLP\")\r\n",
"MLPmodel = MLP()\r\n",
"MLPmodel = MLPmodel.train(x_train, y_train)\r\n",
"print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nGB\")\r\n",
"GBmodel = GradientBoost()\r\n",
"GBmodel = GBmodel.train(x_train, y_train)\r\n",
"print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"# Persist the preprocessing artifacts ...\r\n",
"joblib.dump(categorical, \"zoo/models/german/categorical.joblib\", compress=True)\r\n",
"joblib.dump(label_encoders, \"zoo/models/german/label_encoders.joblib\", compress=True)\r\n",
"\r\n",
"# ... and the fitted grid searches (the last dump is the cell's displayed value)\r\n",
"joblib.dump(RFmodel.model, \"zoo/models/german/rf_classifier.joblib\", compress=True)\r\n",
"joblib.dump(SVCmodel.model, \"zoo/models/german/svc_classifier.joblib\", compress=True)\r\n",
"joblib.dump(MLPmodel.model, \"zoo/models/german/mlp_classifier.joblib\", compress=True)\r\n",
"joblib.dump(GBmodel.model, \"zoo/models/german/gb_classifier.joblib\", compress=True)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['sex', 'risk', 'purpose', 'housing']\n",
"{'sex': LabelEncoder(), 'risk': LabelEncoder(), 'purpose': LabelEncoder(), 'housing': LabelEncoder()}\n",
"\n",
"RF\n",
"Random Forest: {'accuracy': 0.675, 'f1_score': 0.5226029157944989, 'recall_score': 0.5362070998938554, 'precision_score': 0.5678603006189213}\n",
"\n",
"SVC\n",
"SVM: {'accuracy': 0.69, 'f1_score': 0.4859890565412038, 'recall_score': 0.5240004717537445, 'precision_score': 0.583710407239819}\n",
"\n",
"MLP\n",
"MLP: {'accuracy': 0.7, 'f1_score': 0.6101871101871102, 'recall_score': 0.6047883005071353, 'precision_score': 0.6316101318323212}\n",
"\n",
"GB\n",
"Gradient Boost: {'accuracy': 0.705, 'f1_score': 0.5736840203764586, 'recall_score': 0.5761882297440736, 'precision_score': 0.6341362126245846}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['zoo/german/gb_classifier.joblib']"
]
},
"metadata": {},
"execution_count": 3
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 4,
"source": [
"# Australian DATASET\r\n",
"# Whitespace-separated file; every field is read as a string. The context\r\n",
"# manager closes the file handle once the rows have been read.\r\n",
"with open(\"./zoo/data/australian.dat\") as australian_file:\r\n",
"    australian = pd.DataFrame([line.strip().split() for line in australian_file])\r\n",
"x_train, x_test, y_train, y_test = preprocessing(\"Australian\", australian, test_size)\r\n",
"\r\n",
"# Print Encoders\r\n",
"print(categorical)\r\n",
"print(label_encoders)\r\n",
"\r\n",
"# Set and Train the models\r\n",
"print('\\nRF')\r\n",
"RFmodel = RandomForest().train(x_train, y_train)\r\n",
"print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nSVC\")\r\n",
"SVCmodel = SVC().train(x_train, y_train)\r\n",
"print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nMLP\")\r\n",
"MLPmodel = MLP().train(x_train, y_train)\r\n",
"print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nGB\")\r\n",
"GBmodel = GradientBoost().train(x_train, y_train)\r\n",
"print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"# Save Training Data\r\n",
"joblib.dump(categorical, \"zoo/models/australian/categorical.joblib\", compress=True)\r\n",
"# Same directory as the other artifacts (this path was missing \"models/\")\r\n",
"joblib.dump(label_encoders, \"zoo/models/australian/label_encoders.joblib\", compress=True)\r\n",
"\r\n",
"joblib.dump(RFmodel.model, \"zoo/models/australian/rf_classifier.joblib\", compress=True)\r\n",
"joblib.dump(SVCmodel.model, \"zoo/models/australian/svc_classifier.joblib\", compress=True)\r\n",
"joblib.dump(MLPmodel.model, \"zoo/models/australian/mlp_classifier.joblib\", compress=True)\r\n",
"joblib.dump(GBmodel.model, \"zoo/models/australian/gb_classifier.joblib\", compress=True)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n",
"{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder()}\n",
"\n",
"RF\n",
"Random Forest: {'accuracy': 0.855072463768116, 'f1_score': 0.8549505991170906, 'recall_score': 0.8562697576396207, 'precision_score': 0.855072463768116}\n",
"\n",
"SVC\n",
"SVM: {'accuracy': 0.8623188405797102, 'f1_score': 0.8623116105655622, 'recall_score': 0.8648050579557429, 'precision_score': 0.8642676767676767}\n",
"\n",
"MLP\n",
"MLP: {'accuracy': 0.855072463768116, 'f1_score': 0.8549505991170906, 'recall_score': 0.8562697576396207, 'precision_score': 0.855072463768116}\n",
"\n",
"GB\n",
"Gradient Boost: {'accuracy': 0.8695652173913043, 'f1_score': 0.8691253951527924, 'recall_score': 0.8691253951527924, 'precision_score': 0.8691253951527924}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['zoo/australian/gb_classifier.joblib']"
]
},
"metadata": {},
"execution_count": 4
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 5,
"source": [
"# Japanese DATASET\r\n",
"# Comma-separated file; every field is read as a string. The context\r\n",
"# manager closes the file handle once the rows have been read.\r\n",
"with open(\"./zoo/data/japanese/japanese.data\") as japanese_file:\r\n",
"    japanese = pd.DataFrame([line.strip().split(\",\") for line in japanese_file])\r\n",
"x_train, x_test, y_train, y_test = preprocessing(\"Japanese\", japanese, test_size)\r\n",
"\r\n",
"# Print Encoders\r\n",
"print(categorical)\r\n",
"print(label_encoders)\r\n",
"\r\n",
"# Set and Train the models\r\n",
"print('\\nRF')\r\n",
"RFmodel = RandomForest().train(x_train, y_train)\r\n",
"print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nSVC\")\r\n",
"SVCmodel = SVC().train(x_train, y_train)\r\n",
"print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nMLP\")\r\n",
"MLPmodel = MLP().train(x_train, y_train)\r\n",
"print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nGB\")\r\n",
"GBmodel = GradientBoost().train(x_train, y_train)\r\n",
"print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"# Save Training Data\r\n",
"joblib.dump(categorical, \"zoo/models/japanese/categorical.joblib\", compress=True)\r\n",
"joblib.dump(label_encoders, \"zoo/models/japanese/label_encoders.joblib\", compress=True)\r\n",
"\r\n",
"joblib.dump(RFmodel.model, \"zoo/models/japanese/rf_classifier.joblib\", compress=True)\r\n",
"joblib.dump(SVCmodel.model, \"zoo/models/japanese/svc_classifier.joblib\", compress=True)\r\n",
"joblib.dump(MLPmodel.model, \"zoo/models/japanese/mlp_classifier.joblib\", compress=True)\r\n",
"joblib.dump(GBmodel.model, \"zoo/models/japanese/gb_classifier.joblib\", compress=True)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
"{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder()}\n",
"\n",
"RF\n",
"Random Forest: {'accuracy': 0.9057971014492754, 'f1_score': 0.9020475020475021, 'recall_score': 0.9130143112701252, 'precision_score': 0.8962668955159837}\n",
"\n",
"SVC\n",
"SVM: {'accuracy': 0.8478260869565217, 'f1_score': 0.8442186744073537, 'recall_score': 0.8627012522361359, 'precision_score': 0.8418335089567967}\n",
"\n",
"MLP\n",
"MLP: {'accuracy': 0.8115942028985508, 'f1_score': 0.8057179987004548, 'recall_score': 0.8184257602862254, 'precision_score': 0.8022071307300509}\n",
"\n",
"GB\n",
"Gradient Boost: {'accuracy': 0.8695652173913043, 'f1_score': 0.8639649507119387, 'recall_score': 0.8725402504472272, 'precision_score': 0.8590517241379311}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['zoo/japanese/gb_classifier.joblib']"
]
},
"metadata": {},
"execution_count": 5
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"source": [
"# Taiwan DATASET\r\n",
"taiwan = pd.read_excel('./zoo/data/taiwan.xls', index_col=0, header=1)\r\n",
"x_train, x_test, y_train, y_test = preprocessing(\"Taiwan\", taiwan, test_size)\r\n",
"\r\n",
"# Show the label-encoded column names, then the fitted encoders\r\n",
"print(categorical, label_encoders, sep=\"\\n\")\r\n",
"\r\n",
"# Build, fit and score each model in turn\r\n",
"print('\\nRF')\r\n",
"RFmodel = RandomForest()\r\n",
"RFmodel = RFmodel.train(x_train, y_train)\r\n",
"print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nSVC\")\r\n",
"SVCmodel = SVC()\r\n",
"SVCmodel = SVCmodel.train(x_train, y_train)\r\n",
"print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nMLP\")\r\n",
"MLPmodel = MLP()\r\n",
"MLPmodel = MLPmodel.train(x_train, y_train)\r\n",
"print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nGB\")\r\n",
"GBmodel = GradientBoost()\r\n",
"GBmodel = GBmodel.train(x_train, y_train)\r\n",
"print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"# Persist the preprocessing artifacts ...\r\n",
"joblib.dump(categorical, \"zoo/models/taiwan/categorical.joblib\", compress=True)\r\n",
"joblib.dump(label_encoders, \"zoo/models/taiwan/label_encoders.joblib\", compress=True)\r\n",
"\r\n",
"# ... and the fitted grid searches (the last dump is the cell's displayed value)\r\n",
"joblib.dump(RFmodel.model, \"zoo/models/taiwan/rf_classifier.joblib\", compress=True)\r\n",
"joblib.dump(SVCmodel.model, \"zoo/models/taiwan/svc_classifier.joblib\", compress=True)\r\n",
"joblib.dump(MLPmodel.model, \"zoo/models/taiwan/mlp_classifier.joblib\", compress=True)\r\n",
"joblib.dump(GBmodel.model, \"zoo/models/taiwan/gb_classifier.joblib\", compress=True)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[]\n",
"{}\n",
"\n",
"RF\n",
"Random Forest: {'accuracy': 0.8181666666666667, 'f1_score': 0.681903656460614, 'recall_score': 0.6583642128492948, 'precision_score': 0.7439043272511898}\n",
"\n",
"SVC\n",
"SVM: {'accuracy': 0.81, 'f1_score': 0.616125081264749, 'recall_score': 0.6001113663459882, 'precision_score': 0.7597362173568979}\n",
"\n",
"MLP\n",
"MLP: {'accuracy': 0.7716666666666666, 'f1_score': 0.628088225074129, 'recall_score': 0.6176235046432739, 'precision_score': 0.6493170130422868}\n",
"\n",
"GB\n",
"Gradient Boost: {'accuracy': 0.8228333333333333, 'f1_score': 0.6804605216188897, 'recall_score': 0.6544810313636326, 'precision_score': 0.7611662244399562}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['zoo/taiwan/gb_classifier.joblib']"
]
},
"metadata": {},
"execution_count": 6
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 15,
"source": [
"# Polish DATASET\r\n",
"from scipy.io import arff\r\n",
"\r\n",
"# Load the five yearly ARFF files; loadarff returns (data, meta) and only\r\n",
"# the data part is turned into a frame\r\n",
"year_1, year_2, year_3, year_4, year_5 = (\r\n",
"    pd.DataFrame(arff.loadarff(arff_path)[0])\r\n",
"    for arff_path in ('./zoo/data/polish/1year.arff',\r\n",
"                      './zoo/data/polish/2year.arff',\r\n",
"                      './zoo/data/polish/3year.arff',\r\n",
"                      './zoo/data/polish/4year.arff',\r\n",
"                      './zoo/data/polish/5year.arff')\r\n",
")\r\n",
"polish = pd.concat([year_1, year_2, year_3, year_4, year_5], ignore_index=True)\r\n",
"x_train, x_test, y_train, y_test = preprocessing(\"Polish\", polish, test_size)\r\n",
"\r\n",
"# Show the label-encoded column names, then the fitted encoders\r\n",
"print(categorical, label_encoders, sep=\"\\n\")\r\n",
"\r\n",
"# Build, fit and score each model in turn\r\n",
"print('\\nRF')\r\n",
"RFmodel = RandomForest()\r\n",
"RFmodel = RFmodel.train(x_train, y_train)\r\n",
"print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nSVC\")\r\n",
"SVCmodel = SVC()\r\n",
"SVCmodel = SVCmodel.train(x_train, y_train)\r\n",
"print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nMLP\")\r\n",
"MLPmodel = MLP()\r\n",
"MLPmodel = MLPmodel.train(x_train, y_train)\r\n",
"print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"print(\"\\nGB\")\r\n",
"GBmodel = GradientBoost()\r\n",
"GBmodel = GBmodel.train(x_train, y_train)\r\n",
"print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n",
"\r\n",
"# Persist the preprocessing artifacts ...\r\n",
"joblib.dump(categorical, \"zoo/models/polish/categorical.joblib\", compress=True)\r\n",
"joblib.dump(label_encoders, \"zoo/models/polish/label_encoders.joblib\", compress=True)\r\n",
"\r\n",
"# ... and the fitted grid searches (the last dump is the cell's displayed value)\r\n",
"joblib.dump(RFmodel.model, \"zoo/models/polish/rf_classifier.joblib\", compress=True)\r\n",
"joblib.dump(SVCmodel.model, \"zoo/models/polish/svc_classifier.joblib\", compress=True)\r\n",
"joblib.dump(MLPmodel.model, \"zoo/models/polish/mlp_classifier.joblib\", compress=True)\r\n",
"joblib.dump(GBmodel.model, \"zoo/models/polish/gb_classifier.joblib\", compress=True)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['class']\n",
"{'sex': LabelEncoder(), 'risk': LabelEncoder(), 'housing': LabelEncoder(), 'purpose': LabelEncoder(), 0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder(), 'class': LabelEncoder()}\n",
"Random Forest: {'accuracy': 0.9802203304957436, 'f1_score': 0.5073501054699149, 'recall_score': 0.5059245566321038, 'precision_score': 0.5466471490310888}\n",
"SVM: {'accuracy': 0.9817225838758137, 'f1_score': 0.49538850284270375, 'recall_score': 0.4998725140234574, 'precision_score': 0.49098422238918105}\n",
"MLP: {'accuracy': 0.9799699549323986, 'f1_score': 0.71321171997573, 'recall_score': 0.7103058246926172, 'precision_score': 0.7162043104703655}\n",
"Gradient Boost: {'accuracy': 0.9819729594391587, 'f1_score': 0.566873915558126, 'recall_score': 0.5409017508074112, 'precision_score': 0.7417127071823204}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['zoo/polish/gb_classifier.joblib']"
]
},
"metadata": {},
"execution_count": 15
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
]
}