| { |
| "nbformat": 4, |
| "nbformat_minor": 2, |
| "metadata": { |
| "colab": { |
| "name": "Untitled8.ipynb", |
| "provenance": [], |
| "collapsed_sections": [] |
| }, |
| "kernelspec": { |
| "name": "python3", |
| "display_name": "Python 3", |
| "language": "python" |
| }, |
| "metadata": { |
| "interpreter": { |
| "hash": "434fba307fd1171c9cfc17821a2afcf8929f30379beeaab9e3fdf8c6db2d1c93" |
| } |
| } |
| }, |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "source": [ |
| "import joblib\r\n", |
| "import numpy as np\r\n", |
| "import pandas as pd\r\n", |
| "\r\n", |
| "from sklearn.preprocessing import LabelEncoder\r\n", |
| "from sklearn.preprocessing import StandardScaler\r\n", |
| "from sklearn.model_selection import train_test_split\r\n", |
| "\r\n", |
| "test_size = 0.20\r\n", |
| "\r\n", |
| "processed_data = None\r\n", |
| "categorical = None\r\n", |
| "label_encoders = {}\r\n", |
| "\r\n", |
| "def preprocessing(dataset, data, test_size):\r\n", |
| " \"\"\"\r\n", |
| " Preprocess dataset\r\n", |
| "\r\n", |
| " Parameters\r\n", |
| " ----------\r\n", |
| " data: DataFrame\r\n", |
| " Pandas dataframe containing German dataset.\r\n", |
| " \"\"\"\r\n", |
| " \r\n", |
| " global processed_data\r\n", |
| " global categorical\r\n", |
| " global label_encoders\r\n", |
| "\r\n", |
| " # Reset global variables\r\n", |
| " \r\n", |
| " processed_data = None\r\n", |
| " categorical = None\r\n", |
| " label_encoders = {}\r\n", |
| "\r\n", |
| "\r\n", |
| " if dataset == \"German\":\r\n", |
| " # Drop savings account and checkings account columns as they contain a lot\r\n", |
| " # of NaN values and may not always be available in real life scenarios\r\n", |
| " data = data.drop(columns = ['Saving accounts', 'Checking account'])\r\n", |
| " \r\n", |
| " dat_dict = data.to_dict()\r\n", |
| " new_dat_dict = {}\r\n", |
| "\r\n", |
| " # rename columns(Make them lowercase and snakecase)\r\n", |
| " for key, value in dat_dict.items():\r\n", |
| " newKey = key\r\n", |
| " if type(key) == str:\r\n", |
| " newKey = newKey.lower().replace(' ', '_')\r\n", |
| " # if newKey != key:\r\n", |
| " new_dat_dict[newKey] = dat_dict[key]\r\n", |
| " del dat_dict\r\n", |
| "\r\n", |
| " data = pd.DataFrame.from_dict(new_dat_dict)\r\n", |
| " del new_dat_dict\r\n", |
| "\r\n", |
| "\r\n", |
| " # print(data.describe())\r\n", |
| " # print(data.describe(include='O'))\r\n", |
| "\r\n", |
| " cols = data.columns\r\n", |
| " num_cols = data._get_numeric_data().columns\r\n", |
| " categorical = list(set(cols) - set(num_cols))\r\n", |
| "\r\n", |
| " # Drop null rows\r\n", |
| " data = data.dropna()\r\n", |
| "\r\n", |
| " # Encode text columns to number values\r\n", |
| " for category in categorical:\r\n", |
| " le = LabelEncoder()\r\n", |
| " data[category] = le.fit_transform(data[category])\r\n", |
| " label_encoders[category] = le\r\n", |
| "\r\n", |
| " for col in data.columns:\r\n", |
| " if(col not in categorical):\r\n", |
| " data[col] = (data[col].astype('float') - np.mean(data[col].astype('float')))/np.std(data[col].astype('float'))\r\n", |
| "\r\n", |
| " # print(data.describe())\r\n", |
| " # print(data.describe(include='O'))\r\n", |
| "\r\n", |
| " processed_data = data\r\n", |
| "\r\n", |
| " # Get Training parameters\r\n", |
| " if dataset == \"German\":\r\n", |
| " target_col = data.columns[-1]\r\n", |
| " x = data.drop(columns=target_col, axis=1)\r\n", |
| " y = data[target_col].astype('int')\r\n", |
| " elif dataset == \"Australian\":\r\n", |
| " x = data.drop(14, axis=1)\r\n", |
| " y = data[14].astype('int')\r\n", |
| " elif dataset == \"Japanese\":\r\n", |
| " x = data.drop(15, axis=1)\r\n", |
| " y = data[15].astype('int')\r\n", |
| " elif dataset == \"Taiwan\":\r\n", |
| " x = data.drop('default_payment_next_month', axis=1)\r\n", |
| " y = data['default_payment_next_month'].astype('int')\r\n", |
| " elif dataset == \"Polish\":\r\n", |
| " x = data.drop('class', axis=1)\r\n", |
| " y = data['class'].astype('int')\r\n", |
| "\r\n", |
| "\r\n", |
| " x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size)\r\n", |
| " x_train = pd.DataFrame(x_train)\r\n", |
| " y_train = pd.DataFrame(y_train)\r\n", |
| "\r\n", |
| " sc = StandardScaler()\r\n", |
| " x_train = sc.fit_transform(x_train)\r\n", |
| " x_test = sc.transform(x_test)\r\n", |
| "\r\n", |
| " return (x_train, x_test, y_train, y_test)\r\n" |
| ], |
| "outputs": [], |
| "metadata": { |
| "id": "Z5EFqHxB-zEn" |
| } |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "source": [ |
| "from sklearn.svm import SVC\r\n", |
| "from sklearn.neural_network import MLPClassifier\r\n", |
| "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\r\n", |
| "from sklearn.model_selection import GridSearchCV, ShuffleSplit\r\n", |
| "from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score\r\n", |
| "\r\n", |
| "class Model(object):\r\n", |
| " \"\"\"\r\n", |
| " Basic Scorecard Model\r\n", |
| "\r\n", |
| " Warning: This class should not be used directly. Use derived classes\r\n", |
| " instead.\r\n", |
| " \"\"\"\r\n", |
| "\r\n", |
| " def __init__(self,\r\n", |
| " classifier=None,\r\n", |
| " test_size=test_size,\r\n", |
| " n_splits=1,\r\n", |
| " random_state=None,\r\n", |
| " n_jobs=None,\r\n", |
| " params=None):\r\n", |
| " \r\n", |
| " self.classifier = classifier\r\n", |
| " self.params = params\r\n", |
| " self.random_state = random_state\r\n", |
| " self.test_size = test_size\r\n", |
| " self.n_splits = n_splits\r\n", |
| " self.n_jobs = n_jobs\r\n", |
| "\r\n", |
| " self.model = GridSearchCV(estimator=classifier,\r\n", |
| " param_grid=params,\r\n", |
| " n_jobs=n_jobs,\r\n", |
| " cv=ShuffleSplit(test_size=test_size,\r\n", |
| " n_splits=n_splits,\r\n", |
| " random_state=0))\r\n", |
| " \r\n", |
| " def __str__(self):\r\n", |
| " return f\"\"\"\r\n", |
| " Model Object\r\n", |
| " ----------------------------------------------------------------\r\n", |
| "\r\n", |
| " Classifier: {self.classifier.__class__.__name__}\r\n", |
| " Test Size: {self.test_size}\r\n", |
| " Random State: {self.random_state}\r\n", |
| " Number of Splits: {self.n_splits}\r\n", |
| " Parameter Grid: {self.params}\r\n", |
| "\r\n", |
| " {self.model}\r\n", |
| " \"\"\"\r\n", |
| " \r\n", |
| " def __repr__(self):\r\n", |
| " return self.__str__()\r\n", |
| "\r\n", |
| " def train(self, x_train, y_train):\r\n", |
| " \"\"\"\r\n", |
| " Train scorecard model\r\n", |
| " \r\n", |
| " Args:\r\n", |
| " x_train:\r\n", |
| " array of training parameters\r\n", |
| " y_train:\r\n", |
| " pandas dataframe with training labels\r\n", |
| " \"\"\"\r\n", |
| "\r\n", |
| " self.model = self.model.fit(x_train, y_train.values.ravel())\r\n", |
| " return self\r\n", |
| "\r\n", |
| " def predict(self, data):\r\n", |
| " \"\"\"\r\n", |
| " Predict scorecard model\r\n", |
| "\r\n", |
| " Args:\r\n", |
| " data: array\r\n", |
| " Data to perform prediction on.\r\n", |
| " \"\"\"\r\n", |
| "\r\n", |
| " return self.model.predict(data)\r\n", |
| "\r\n", |
| " def accuracy(self, x_test, y_test):\r\n", |
| " \"\"\"\r\n", |
| " Compute scorecard model accuracy\r\n", |
| "\r\n", |
| " Args:\r\n", |
| " x_test: array\r\n", |
| " The test parameters.\r\n", |
| " y_test: array\r\n", |
| " The labels\r\n", |
| " \"\"\"\r\n", |
| "\r\n", |
| " y_pred = self.predict(x_test)\r\n", |
| " return accuracy_score(y_test, y_pred, normalize=False)\r\n", |
| "\r\n", |
| " def metrics(self, x_test, y_test):\r\n", |
| " \"\"\"\r\n", |
| " Comput scorecard model metrics\r\n", |
| " \r\n", |
| " Args:\r\n", |
| " x_test: array\r\n", |
| " The test parameters.\r\n", |
| " y_test: array\r\n", |
| " The labels\r\n", |
| " \"\"\"\r\n", |
| "\r\n", |
| " y_pred = self.predict(x_test)\r\n", |
| " \r\n", |
| " cm = confusion_matrix(y_pred, y_test)\r\n", |
| " accuracy = accuracy_score(y_test, y_pred, normalize=True)\r\n", |
| " f1 = f1_score(y_test, y_pred, average=\"macro\")\r\n", |
| " recall = recall_score(y_test, y_pred, average=\"macro\")\r\n", |
| " precision = precision_score(y_test, y_pred, average=\"macro\")\r\n", |
| "\r\n", |
| " return {\"accuracy\" : accuracy,\r\n", |
| " \"f1_score\" : f1,\r\n", |
| " \"recall_score\" : recall,\r\n", |
| " \"precision_score\": precision}\r\n", |
| "\r\n", |
| "class RandomForest(Model):\r\n", |
| " def __init__(self,\r\n", |
| " classifier=RandomForestClassifier(),\r\n", |
| " test_size=test_size,\r\n", |
| " n_splits=1,\r\n", |
| " random_state=0,\r\n", |
| " n_jobs=None,\r\n", |
| " params={'n_estimators' : [20, 30, 40], 'random_state' : [0]}): \r\n", |
| " super(RandomForest, self).__init__(classifier,\r\n", |
| " test_size,\r\n", |
| " n_splits,\r\n", |
| " random_state,\r\n", |
| " n_jobs,\r\n", |
| " params)\r\n", |
| "\r\n", |
| "class SVC(Model):\r\n", |
| " def __init__(self,\r\n", |
| " classifier=SVC(),\r\n", |
| " test_size=test_size,\r\n", |
| " n_splits=1,\r\n", |
| " random_state=0,\r\n", |
| " n_jobs=None,\r\n", |
| " params={'kernel' : ['poly'], 'degree' : [2, 3, 4]}):\r\n", |
| " super(SVC, self).__init__(classifier,\r\n", |
| " test_size,\r\n", |
| " n_splits,\r\n", |
| " random_state,\r\n", |
| " n_jobs,\r\n", |
| " params)\r\n", |
| "\r\n", |
| "class MLP(Model):\r\n", |
| " def __init__(self,\r\n", |
| " classifier=MLPClassifier(),\r\n", |
| " test_size=test_size,\r\n", |
| " n_splits=1,\r\n", |
| " random_state=0,\r\n", |
| " n_jobs=-1,\r\n", |
| " params={'hidden_layer_sizes' : [(100, 50 ,10)],\r\n", |
| " 'max_iter' : [500],\r\n", |
| " 'activation' : ['relu'],\r\n", |
| " 'solver' : ['adam'],\r\n", |
| " 'random_state' : [1]}):\r\n", |
| " super(MLP, self).__init__(classifier,\r\n", |
| " test_size,\r\n", |
| " n_splits,\r\n", |
| " random_state,\r\n", |
| " n_jobs,\r\n", |
| " params)\r\n", |
| "\r\n", |
| "class GradientBoost(Model):\r\n", |
| " def __init__(self,\r\n", |
| " classifier=GradientBoostingClassifier(),\r\n", |
| " test_size=test_size,\r\n", |
| " n_splits=1,\r\n", |
| " random_state=0,\r\n", |
| " n_jobs=None,\r\n", |
| " params={'n_estimators' : [100, 200, 50],\r\n", |
| " 'random_state' : [0],\r\n", |
| " 'learning_rate' : [1.0],\r\n", |
| " 'max_depth' : [1, 2, 3]}):\r\n", |
| " super(GradientBoost, self).__init__(classifier,\r\n", |
| " test_size,\r\n", |
| " n_splits,\r\n", |
| " random_state,\r\n", |
| " n_jobs,\r\n", |
| " params)\r\n", |
| "\r\n" |
| ], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "source": [ |
| "# GERMAN DATASET\r\n", |
| "german = pd.read_csv('./zoo/data/german.csv', index_col=0)\r\n", |
| "x_train, x_test, y_train, y_test = preprocessing(\"German\", german, test_size)\r\n", |
| "\r\n", |
| "# Print Encoders\r\n", |
| "print(categorical)\r\n", |
| "print(label_encoders)\r\n", |
| "\r\n", |
| "# Set and Train the models\r\n", |
| "print('\\nRF')\r\n", |
| "RFmodel = RandomForest().train(x_train, y_train)\r\n", |
| "print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nSVC\")\r\n", |
| "SVCmodel = SVC().train(x_train, y_train)\r\n", |
| "print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nMLP\")\r\n", |
| "MLPmodel = MLP().train(x_train, y_train)\r\n", |
| "print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nGB\")\r\n", |
| "GBmodel = GradientBoost().train(x_train, y_train)\r\n", |
| "print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "# Save Training Data\r\n", |
| "joblib.dump(categorical, \"zoo/models/german/categorical.joblib\", compress=True)\r\n", |
| "joblib.dump(label_encoders, \"zoo/models/german/label_encoders.joblib\", compress=True)\r\n", |
| "\r\n", |
| "joblib.dump(RFmodel.model, \"zoo/models/german/rf_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(SVCmodel.model, \"zoo/models/german/svc_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(MLPmodel.model, \"zoo/models/german/mlp_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(GBmodel.model, \"zoo/models/german/gb_classifier.joblib\", compress=True)" |
| ], |
| "outputs": [ |
| { |
| "output_type": "stream", |
| "name": "stdout", |
| "text": [ |
| "['sex', 'risk', 'purpose', 'housing']\n", |
| "{'sex': LabelEncoder(), 'risk': LabelEncoder(), 'purpose': LabelEncoder(), 'housing': LabelEncoder()}\n", |
| "\n", |
| "RF\n", |
| "Random Forest: {'accuracy': 0.675, 'f1_score': 0.5226029157944989, 'recall_score': 0.5362070998938554, 'precision_score': 0.5678603006189213}\n", |
| "\n", |
| "SVC\n", |
| "SVM: {'accuracy': 0.69, 'f1_score': 0.4859890565412038, 'recall_score': 0.5240004717537445, 'precision_score': 0.583710407239819}\n", |
| "\n", |
| "MLP\n", |
| "MLP: {'accuracy': 0.7, 'f1_score': 0.6101871101871102, 'recall_score': 0.6047883005071353, 'precision_score': 0.6316101318323212}\n", |
| "\n", |
| "GB\n", |
| "Gradient Boost: {'accuracy': 0.705, 'f1_score': 0.5736840203764586, 'recall_score': 0.5761882297440736, 'precision_score': 0.6341362126245846}\n" |
| ] |
| }, |
| { |
| "output_type": "execute_result", |
| "data": { |
| "text/plain": [ |
| "['zoo/german/gb_classifier.joblib']" |
| ] |
| }, |
| "metadata": {}, |
| "execution_count": 3 |
| } |
| ], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "source": [ |
| "# Australian DATASET\r\n", |
| "australian = [i.strip().split() for i in open(\"./zoo/data/australian.dat\").readlines()]\r\n", |
| "australian = pd.DataFrame(australian)\r\n", |
| "x_train, x_test, y_train, y_test = preprocessing(\"Australian\", australian, test_size)\r\n", |
| "\r\n", |
| "# Print Encoders\r\n", |
| "print(categorical)\r\n", |
| "print(label_encoders)\r\n", |
| "\r\n", |
| "# Set and Train the models\r\n", |
| "print('\\nRF')\r\n", |
| "RFmodel = RandomForest().train(x_train, y_train)\r\n", |
| "print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nSVC\")\r\n", |
| "SVCmodel = SVC().train(x_train, y_train)\r\n", |
| "print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nMLP\")\r\n", |
| "MLPmodel = MLP().train(x_train, y_train)\r\n", |
| "print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nGB\")\r\n", |
| "GBmodel = GradientBoost().train(x_train, y_train)\r\n", |
| "print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "# Save Training Data\r\n", |
| "joblib.dump(categorical, \"zoo/models/australian/categorical.joblib\", compress=True)\r\n", |
| "joblib.dump(label_encoders, \"zoo/australian/label_encoders.joblib\", compress=True)\r\n", |
| "\r\n", |
| "joblib.dump(RFmodel.model, \"zoo/models/australian/rf_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(SVCmodel.model, \"zoo/models/australian/svc_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(MLPmodel.model, \"zoo/models/australian/mlp_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(GBmodel.model, \"zoo/models/australian/gb_classifier.joblib\", compress=True)" |
| ], |
| "outputs": [ |
| { |
| "output_type": "stream", |
| "name": "stdout", |
| "text": [ |
| "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n", |
| "{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder()}\n", |
| "\n", |
| "RF\n", |
| "Random Forest: {'accuracy': 0.855072463768116, 'f1_score': 0.8549505991170906, 'recall_score': 0.8562697576396207, 'precision_score': 0.855072463768116}\n", |
| "\n", |
| "SVC\n", |
| "SVM: {'accuracy': 0.8623188405797102, 'f1_score': 0.8623116105655622, 'recall_score': 0.8648050579557429, 'precision_score': 0.8642676767676767}\n", |
| "\n", |
| "MLP\n", |
| "MLP: {'accuracy': 0.855072463768116, 'f1_score': 0.8549505991170906, 'recall_score': 0.8562697576396207, 'precision_score': 0.855072463768116}\n", |
| "\n", |
| "GB\n", |
| "Gradient Boost: {'accuracy': 0.8695652173913043, 'f1_score': 0.8691253951527924, 'recall_score': 0.8691253951527924, 'precision_score': 0.8691253951527924}\n" |
| ] |
| }, |
| { |
| "output_type": "execute_result", |
| "data": { |
| "text/plain": [ |
| "['zoo/australian/gb_classifier.joblib']" |
| ] |
| }, |
| "metadata": {}, |
| "execution_count": 4 |
| } |
| ], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "source": [ |
| "# Japanese DATASET\r\n", |
| "japanese = [i.strip().split(\",\") for i in open(\"./zoo/data/japanese/japanese.data\").readlines()]\r\n", |
| "japanese = pd.DataFrame(japanese)\r\n", |
| "x_train, x_test, y_train, y_test = preprocessing(\"Japanese\", japanese, test_size)\r\n", |
| "\r\n", |
| "# Print Encoders\r\n", |
| "print(categorical)\r\n", |
| "print(label_encoders)\r\n", |
| "\r\n", |
| "# Set and Train the models\r\n", |
| "print('\\nRF')\r\n", |
| "RFmodel = RandomForest().train(x_train, y_train)\r\n", |
| "print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nSVC\")\r\n", |
| "SVCmodel = SVC().train(x_train, y_train)\r\n", |
| "print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nMLP\")\r\n", |
| "MLPmodel = MLP().train(x_train, y_train)\r\n", |
| "print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nGB\")\r\n", |
| "GBmodel = GradientBoost().train(x_train, y_train)\r\n", |
| "print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "# Save Training Data\r\n", |
| "joblib.dump(categorical, \"zoo/models/japanese/categorical.joblib\", compress=True)\r\n", |
| "joblib.dump(label_encoders, \"zoo/models/japanese/label_encoders.joblib\", compress=True)\r\n", |
| "\r\n", |
| "joblib.dump(RFmodel.model, \"zoo/models/japanese/rf_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(SVCmodel.model, \"zoo/models/japanese/svc_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(MLPmodel.model, \"zoo/models/japanese/mlp_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(GBmodel.model, \"zoo/models/japanese/gb_classifier.joblib\", compress=True)" |
| ], |
| "outputs": [ |
| { |
| "output_type": "stream", |
| "name": "stdout", |
| "text": [ |
| "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n", |
| "{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder()}\n", |
| "\n", |
| "RF\n", |
| "Random Forest: {'accuracy': 0.9057971014492754, 'f1_score': 0.9020475020475021, 'recall_score': 0.9130143112701252, 'precision_score': 0.8962668955159837}\n", |
| "\n", |
| "SVC\n", |
| "SVM: {'accuracy': 0.8478260869565217, 'f1_score': 0.8442186744073537, 'recall_score': 0.8627012522361359, 'precision_score': 0.8418335089567967}\n", |
| "\n", |
| "MLP\n", |
| "MLP: {'accuracy': 0.8115942028985508, 'f1_score': 0.8057179987004548, 'recall_score': 0.8184257602862254, 'precision_score': 0.8022071307300509}\n", |
| "\n", |
| "GB\n", |
| "Gradient Boost: {'accuracy': 0.8695652173913043, 'f1_score': 0.8639649507119387, 'recall_score': 0.8725402504472272, 'precision_score': 0.8590517241379311}\n" |
| ] |
| }, |
| { |
| "output_type": "execute_result", |
| "data": { |
| "text/plain": [ |
| "['zoo/japanese/gb_classifier.joblib']" |
| ] |
| }, |
| "metadata": {}, |
| "execution_count": 5 |
| } |
| ], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "source": [ |
| "# Taiwan DATASET\r\n", |
| "taiwan = pd.read_excel('./zoo/data/taiwan.xls', index_col=0, header=1)\r\n", |
| "x_train, x_test, y_train, y_test = preprocessing(\"Taiwan\", taiwan, test_size)\r\n", |
| "\r\n", |
| "# Print Encoders\r\n", |
| "print(categorical)\r\n", |
| "print(label_encoders)\r\n", |
| "\r\n", |
| "# Set and Train the models\r\n", |
| "print('\\nRF')\r\n", |
| "RFmodel = RandomForest().train(x_train, y_train)\r\n", |
| "print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nSVC\")\r\n", |
| "SVCmodel = SVC().train(x_train, y_train)\r\n", |
| "print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nMLP\")\r\n", |
| "MLPmodel = MLP().train(x_train, y_train)\r\n", |
| "print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nGB\")\r\n", |
| "GBmodel = GradientBoost().train(x_train, y_train)\r\n", |
| "print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "# Save Training Data\r\n", |
| "joblib.dump(categorical, \"zoo/models/taiwan/categorical.joblib\", compress=True)\r\n", |
| "joblib.dump(label_encoders, \"zoo/models/taiwan/label_encoders.joblib\", compress=True)\r\n", |
| "\r\n", |
| "joblib.dump(RFmodel.model, \"zoo/models/taiwan/rf_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(SVCmodel.model, \"zoo/models/taiwan/svc_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(MLPmodel.model, \"zoo/models/taiwan/mlp_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(GBmodel.model, \"zoo/models/taiwan/gb_classifier.joblib\", compress=True)" |
| ], |
| "outputs": [ |
| { |
| "output_type": "stream", |
| "name": "stdout", |
| "text": [ |
| "[]\n", |
| "{}\n", |
| "\n", |
| "RF\n", |
| "Random Forest: {'accuracy': 0.8181666666666667, 'f1_score': 0.681903656460614, 'recall_score': 0.6583642128492948, 'precision_score': 0.7439043272511898}\n", |
| "\n", |
| "SVC\n", |
| "SVM: {'accuracy': 0.81, 'f1_score': 0.616125081264749, 'recall_score': 0.6001113663459882, 'precision_score': 0.7597362173568979}\n", |
| "\n", |
| "MLP\n", |
| "MLP: {'accuracy': 0.7716666666666666, 'f1_score': 0.628088225074129, 'recall_score': 0.6176235046432739, 'precision_score': 0.6493170130422868}\n", |
| "\n", |
| "GB\n", |
| "Gradient Boost: {'accuracy': 0.8228333333333333, 'f1_score': 0.6804605216188897, 'recall_score': 0.6544810313636326, 'precision_score': 0.7611662244399562}\n" |
| ] |
| }, |
| { |
| "output_type": "execute_result", |
| "data": { |
| "text/plain": [ |
| "['zoo/taiwan/gb_classifier.joblib']" |
| ] |
| }, |
| "metadata": {}, |
| "execution_count": 6 |
| } |
| ], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 15, |
| "source": [ |
| "# Polish DATASET\r\n", |
| "from scipy.io import arff\r\n", |
| "\r\n", |
| "year_1 = pd.DataFrame(arff.loadarff('./zoo/data/polish/1year.arff')[0])\r\n", |
| "year_2 = pd.DataFrame(arff.loadarff('./zoo/data/polish/2year.arff')[0])\r\n", |
| "year_3 = pd.DataFrame(arff.loadarff('./zoo/data/polish/3year.arff')[0])\r\n", |
| "year_4 = pd.DataFrame(arff.loadarff('./zoo/data/polish/4year.arff')[0])\r\n", |
| "year_5 = pd.DataFrame(arff.loadarff('./zoo/data/polish/5year.arff')[0])\r\n", |
| "polish = pd.concat([year_1, year_2, year_3, year_4, year_5], ignore_index=True)\r\n", |
| "x_train, x_test, y_train, y_test = preprocessing(\"Polish\", polish, test_size)\r\n", |
| "\r\n", |
| "# Print Encoders\r\n", |
| "print(categorical)\r\n", |
| "print(label_encoders)\r\n", |
| "\r\n", |
| "# Set and Train the models\r\n", |
| "print('\\nRF')\r\n", |
| "RFmodel = RandomForest().train(x_train, y_train)\r\n", |
| "print(f\"Random Forest: {RFmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nSVC\")\r\n", |
| "SVCmodel = SVC().train(x_train, y_train)\r\n", |
| "print(f\"SVM: {SVCmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nMLP\")\r\n", |
| "MLPmodel = MLP().train(x_train, y_train)\r\n", |
| "print(f\"MLP: {MLPmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "print(\"\\nGB\")\r\n", |
| "GBmodel = GradientBoost().train(x_train, y_train)\r\n", |
| "print(f\"Gradient Boost: {GBmodel.metrics(x_test, y_test)}\")\r\n", |
| "\r\n", |
| "# Save Training Data\r\n", |
| "joblib.dump(categorical, \"zoo/models/polish/categorical.joblib\", compress=True)\r\n", |
| "joblib.dump(label_encoders, \"zoo/models/polish/label_encoders.joblib\", compress=True)\r\n", |
| "\r\n", |
| "joblib.dump(RFmodel.model, \"zoo/models/polish/rf_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(SVCmodel.model, \"zoo/models/polish/svc_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(MLPmodel.model, \"zoo/models/polish/mlp_classifier.joblib\", compress=True)\r\n", |
| "joblib.dump(GBmodel.model, \"zoo/models/polish/gb_classifier.joblib\", compress=True)" |
| ], |
| "outputs": [ |
| { |
| "output_type": "stream", |
| "name": "stdout", |
| "text": [ |
| "['class']\n", |
| "{'sex': LabelEncoder(), 'risk': LabelEncoder(), 'housing': LabelEncoder(), 'purpose': LabelEncoder(), 0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder(), 'class': LabelEncoder()}\n", |
| "Random Forest: {'accuracy': 0.9802203304957436, 'f1_score': 0.5073501054699149, 'recall_score': 0.5059245566321038, 'precision_score': 0.5466471490310888}\n", |
| "SVM: {'accuracy': 0.9817225838758137, 'f1_score': 0.49538850284270375, 'recall_score': 0.4998725140234574, 'precision_score': 0.49098422238918105}\n", |
| "MLP: {'accuracy': 0.9799699549323986, 'f1_score': 0.71321171997573, 'recall_score': 0.7103058246926172, 'precision_score': 0.7162043104703655}\n", |
| "Gradient Boost: {'accuracy': 0.9819729594391587, 'f1_score': 0.566873915558126, 'recall_score': 0.5409017508074112, 'precision_score': 0.7417127071823204}\n" |
| ] |
| }, |
| { |
| "output_type": "execute_result", |
| "data": { |
| "text/plain": [ |
| "['zoo/polish/gb_classifier.joblib']" |
| ] |
| }, |
| "metadata": {}, |
| "execution_count": 15 |
| } |
| ], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "source": [], |
| "outputs": [], |
| "metadata": {} |
| } |
| ] |
| } |