| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Documentation" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Sample Notebook To Build a Model and Make Predictions with the Titanic Dataset from Kaggle" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Table of Contents\n", |
| "\n", |
| "0. [Params](#Params)\n", |
| "1. [Acquisitor and Cleaner](#Acquisitor-and-Cleaner)\n", |
| "2. [Training Preparator](#Training-Preparator)\n", |
| "3. [Trainer](#Trainer)\n", |
| "4. [Metrics Evaluator](#Metrics-Evaluator)\n", |
| "5. [Prediction Preparator](#Prediction-Preparator)\n", |
| "6. [Predictor](#Predictor)\n", |
| "7. [Feedback](#Feedback)\n", |
| "8. [Sample Application](#Sample-Application)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Params" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# puts this params in engine.params file to be used by dryrun and executor as default params\n", |
| "# use a full grid over all parameters\n", |
| "params = {\n", |
| " \"svm\": [\n", |
| " {\"C\": [1, 10, 100], \"gamma\": [0.01, 0.001], \"kernel\": [\"linear\"]},\n", |
| " {\"C\": [1, 10, 100],\"gamma\": [0.01, 0.001],\"kernel\": [\"rbf\"]}\n", |
| " ],\n", |
| " \"rf\": {\n", |
| " \"max_depth\": [3],\n", |
| " \"random_state\": [0],\n", |
| " \"min_samples_split\": [2],\n", |
| " \"min_samples_leaf\": [1],\n", |
| " \"n_estimators\": [20],\n", |
| " \"bootstrap\": [True, False],\n", |
| " \"criterion\": [\"gini\", \"entropy\"]\n", |
| " },\n", |
| " \"pred_cols\": [\"Age\", \"Pclass\", \"Sex\", \"Fare\"],\n", |
| " \"dep_var\": \"Survived\"\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": { |
| "marvin_cell": "acquisitor" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "891 samples to train with 12 features...\n", |
| "418 samples to test...\n" |
| ] |
| } |
| ], |
| "source": [ |
| "from marvin_python_toolbox.common.data import MarvinData\n", |
| "import pandas as pd\n", |
| "\n", |
| "train_df = pd.read_csv(MarvinData.download_file(\"https://s3.amazonaws.com/marvin-engines-data/titanic/train.csv\"))\n", |
| "test_df = pd.read_csv(MarvinData.download_file(\"https://s3.amazonaws.com/marvin-engines-data/titanic/test.csv\"))\n", |
| "\n", |
| "print (\"{} samples to train with {} features...\".format(train_df.shape[0], train_df.shape[1]))\n", |
| "print (\"{} samples to test...\".format(test_df.shape[0]))\n", |
| "\n", |
| "marvin_initial_dataset = {\n", |
| " 'train': train_df,\n", |
| " 'test': test_df\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Training Preparator" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "metadata": { |
| "marvin_cell": "tpreparator" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Length: 714\n", |
| "Length: 331\n", |
| "Preparation is Done!!!!\n" |
| ] |
| }, |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "/home/vagrant/.virtualenvs/titanic-engine-env/local/lib/python2.7/site-packages/pandas/core/indexing.py:517: SettingWithCopyWarning: \n", |
| "A value is trying to be set on a copy of a slice from a DataFrame.\n", |
| "Try using .loc[row_indexer,col_indexer] = value instead\n", |
| "\n", |
| "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", |
| " self.obj[item] = s\n" |
| ] |
| } |
| ], |
| "source": [ |
| "from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV\n", |
| "\n", |
| "train_no_na = marvin_initial_dataset['train'][params[\"pred_cols\"] + [params[\"dep_var\"]]].dropna()\n", |
| "\n", |
| "print(\"Length: {}\".format(len(train_no_na)))\n", |
| "\n", |
| "# Feature Engineering\n", |
| "data_X = train_no_na[params[\"pred_cols\"]]\n", |
| "data_X.loc[:, 'Sex'] = data_X.loc[:, 'Sex'].map({'male': 1, 'female': 0})\n", |
| "data_y = train_no_na[params[\"dep_var\"]]\n", |
| "\n", |
| "# Prepare for Stratified Shuffle Split\n", |
| "sss = StratifiedShuffleSplit(n_splits=5, test_size=.6, random_state=0)\n", |
| "sss.get_n_splits(data_X, data_y)\n", |
| "\n", |
| "# Get Test Dataset\n", |
| "test_no_na = marvin_initial_dataset['test'][params[\"pred_cols\"]].dropna()\n", |
| "\n", |
| "print(\"Length: {}\".format(len(test_no_na)))\n", |
| "\n", |
| "# Feature Engineering\n", |
| "test_X = test_no_na[params[\"pred_cols\"]]\n", |
| "test_X.loc[:, 'Sex'] = test_X.loc[:, 'Sex'].map({'male': 1, 'female': 0})\n", |
| "\n", |
| "marvin_dataset = {\n", |
| " 'X_train': data_X,\n", |
| " 'y_train': data_y,\n", |
| " 'X_test': test_X,\n", |
| " 'sss': sss\n", |
| "}\n", |
| "\n", |
| "print (\"Preparation is Done!!!!\")" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Trainer" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": { |
| "marvin_cell": "trainer" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "\n", |
| "\n", |
| "Starting grid search using SVM!\n", |
| "Model Type: SVM\n", |
| "{'kernel': 'linear', 'C': 10, 'verbose': False, 'probability': False, 'degree': 3, 'shrinking': True, 'max_iter': -1, 'decision_function_shape': None, 'random_state': None, 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.01, 'class_weight': None}\n", |
| "Accuracy Score: 0.78%\n", |
| "\n", |
| "\n", |
| "Starting grid search using RandomForestClassifier!\n", |
| "Model Type: RF\n", |
| "{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 20, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'entropy', 'random_state': 0, 'min_impurity_split': 1e-07, 'max_features': 'auto', 'max_depth': 3, 'class_weight': None}\n", |
| "Accuracy Score: 0.7925%\n" |
| ] |
| } |
| ], |
| "source": [ |
| "from sklearn import svm, neighbors, tree\n", |
| "from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV\n", |
| "from sklearn.ensemble import RandomForestClassifier\n", |
| "from sklearn.preprocessing import StandardScaler, scale\n", |
| "from sklearn.linear_model import LogisticRegression\n", |
| "\n", |
| "print(\"\\n\\nStarting grid search using SVM!\")\n", |
| "\n", |
| "# Create a classifier with the parameter candidates\n", |
| "svm_grid = GridSearchCV(estimator=svm.SVC(), param_grid=params[\"svm\"], cv=marvin_dataset[\"sss\"], n_jobs=-1)\n", |
| "\n", |
| "# Train the classifier on training data\n", |
| "svm_grid.fit(\n", |
| " marvin_dataset['X_train'],\n", |
| " marvin_dataset['y_train']\n", |
| ")\n", |
| "\n", |
| "print(\"Model Type: SVM\\n{}\".format(svm_grid.best_estimator_.get_params()))\n", |
| "print(\"Accuracy Score: {}%\".format(round(svm_grid.best_score_,4)))\n", |
| "\n", |
| "print(\"\\n\\nStarting grid search using RandomForestClassifier!\")\n", |
| "\n", |
| "# run grid search\n", |
| "rf_grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params[\"rf\"], cv=marvin_dataset[\"sss\"])\n", |
| "rf_grid.fit(\n", |
| " marvin_dataset['X_train'],\n", |
| " marvin_dataset['y_train']\n", |
| ")\n", |
| "\n", |
| "print(\"Model Type: RF\\n{}\".format(rf_grid.best_estimator_.get_params()))\n", |
| "print(\"Accuracy Score: {}%\".format(round(rf_grid.best_score_,4)))\n", |
| "\n", |
| "marvin_model = {\n", |
| " 'svm': svm_grid,\n", |
| " 'rf': rf_grid\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Metrics Evaluator" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": { |
| "marvin_cell": "evaluator" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Classification Report:\n", |
| "\n", |
| " precision recall f1-score support\n", |
| "\n", |
| " 0 0.95 0.79 0.86 512\n", |
| " 1 0.62 0.90 0.74 202\n", |
| "\n", |
| "avg / total 0.86 0.82 0.83 714\n", |
| "\n", |
| "Confusion Matrix:\n", |
| "\n", |
| "[[403 109]\n", |
| " [ 21 181]]\n", |
| "\n", |
| "\n", |
| "\n", |
| "Classification Report:\n", |
| "\n", |
| " precision recall f1-score support\n", |
| "\n", |
| " 0 0.85 0.79 0.82 453\n", |
| " 1 0.68 0.75 0.72 261\n", |
| "\n", |
| "avg / total 0.79 0.78 0.78 714\n", |
| "\n", |
| "Confusion Matrix:\n", |
| "\n", |
| "[[360 93]\n", |
| " [ 64 197]]\n", |
| "\n", |
| "\n", |
| "\n", |
| "Feature ranking:\n", |
| "1. feature Sex (0.542498)\n", |
| "2. feature Pclass (0.184832)\n", |
| "3. feature Fare (0.170240)\n", |
| "4. feature Age (0.102431)\n" |
| ] |
| } |
| ], |
| "source": [ |
| "from sklearn import metrics\n", |
| "import numpy as np\n", |
| "\n", |
| "all_metrics = {}\n", |
| "\n", |
| "_model = marvin_model\n", |
| "for model_type, fitted_model in _model.iteritems():\n", |
| " \n", |
| " y_predicted = fitted_model.predict(marvin_dataset['X_train'])\n", |
| " \n", |
| " all_metrics[model_type] = {}\n", |
| " all_metrics[model_type][\"report\"] = metrics.classification_report(y_predicted, marvin_dataset['y_train'])\n", |
| " all_metrics[model_type][\"confusion_matrix\"] = metrics.confusion_matrix(y_predicted, marvin_dataset['y_train']) \n", |
| " \n", |
| " # Print the classification report of `y_test` and `predicted`\n", |
| " print(\"Classification Report:\\n\")\n", |
| " print(all_metrics[model_type][\"report\"])\n", |
| " \n", |
| " # Print the confusion matrix\n", |
| " print(\"Confusion Matrix:\\n\")\n", |
| " print(all_metrics[model_type][\"confusion_matrix\"])\n", |
| " print(\"\\n\\n\")\n", |
| "\n", |
| "importances = _model[\"rf\"].best_estimator_.feature_importances_\n", |
| "indices = np.argsort(importances)[::-1]\n", |
| "\n", |
| "# Print the feature ranking\n", |
| "print(\"Feature ranking:\")\n", |
| "\n", |
| "all_metrics[\"feature_ranking\"] = []\n", |
| "for f in range(marvin_dataset['X_train'].shape[1]):\n", |
| " all_metrics[\"feature_ranking\"].append((f + 1, params[\"pred_cols\"][indices[f]], importances[indices[f]]))\n", |
| " print(\"%d. feature %s (%f)\" % all_metrics[\"feature_ranking\"][f])\n", |
| "\n", |
| "marvin_metrics = all_metrics" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 8, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlYAAAFUCAYAAADmhXKJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAGKNJREFUeJzt3X+0XWV95/H3x4QfFlAUIgIJPxS0poiK4UdnWceKHYk/QLFMYYotDh1krWYsY3+IVlmWli6lVtdIsQWEilgEhFWNGhd2anWUDshFERswNUZsAipRglBQIfCdP86+9BhvuDvkuTn33Lxfa53FfvZ+ztnfczZePzz72XunqpAkSdLWe8KoC5AkSZorDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFK0qyW5G+SvGPUdUhSH/E+VtLclOR2YC/g4aHVz6qqO7fiM18CfKSqFm5ddeMpyYeAdVX19lHXIml2csRKmtteXVW7Dr0ed6hqIcn8Ue5/aySZN+oaJM1+BitpO5TkqCT/nOSeJF/rRqImt70hyW1J7kuyJskbu/W7AJ8B9kny791rnyQfSvJnQ+9/SZJ1Q+3bk7wlyS3A/Unmd++7Jsn6JN9O8qbHqPXRz5/87CR/lOSuJN9N8pokr0jyr0nuTvK2ofe+M8nVSa7svs9XkjxvaPtzkny++x1WJjl2k/3+dZIVSe4HTgV+E/ij7rt/sut3ZpJvdZ9/a5LXDn3GKUm+lOQ9STZ033Xp0PanJvnbJHd22z8+tO1VSW7uavvnJIcObXtLkju6fa5KcnSPwy5pGzBYSduZJPsCnwb+DHgq8AfANUkWdF3uAl4FPAl4A/C+JIdV1f3AUuDOxzECdhLwSmB34BHgk8DXgH2Bo4Ezkry852c9Hdi5e+9ZwEXAycALgV8B3pHkwKH+xwEf677r5cDHk+yQZIeujs8CTwP+J/B3SZ499N7/BpwD7AZ8GPg74Nzuu7+66/Otbr9PBv4E+EiSvYc+40hgFbAncC5wcZJ02y4DfgH4pa6G9wEkeQFwCfBGYA/gAmB5kp26+pYBh1fVbsDLgdt7/naSZpjBSprbPt6NeNwzNBpyMrCiqlZU1SNV9Q/ABPAKgKr6dFV9qwa+wCB4/MpW1vH+qlpbVT8GDgcWVNXZVfVgVa1hEI5O7PlZDwHnVNVDwBUMAsv/rqr7qmolcCvwvKH+N1XV1V3/9zIIZUd1r12Bd3V1fA74FIMQOOkTVXVd9zv9ZKpiqupjVXVn1+dK4JvAEUNdvlNVF1XVw8ClwN7AXl34WgqcXlUbquqh7vcGOA24oKpuqKqHq+pS4KddzQ8DOwGLk+xQVbdX1bd6/naSZpjBSprbXlNVu3ev13Tr9gdOGApc9wAvYvB/+CRZmuT67rTaPQwC155bWcfaoeX9GZxOHN7/2xhMtO/jh11IAfhx98/vD23/MYPA9HP7rqpHgHXAPt1rbbdu0ncYjIRNVfeUkvzW0Cm7e4BD+Nnf63tD+3+gW9wVWATcXVUbpvjY/YHf3+Q3WgTsU1WrgTOAdwJ3JbkiyT7T1Slp2zBYSduftcBlQ4Fr96raparelWQn4BrgPcBeVbU7sAKYPHU11WXE9zM4nTXp6VP0GX7fWuDbm+x/t6p6xVZ/s6ktmlxI8gRgIXBn91rUrZu0H3DHZur+uXaS/RmMti0D9uh+r3/hP36vx7IWeGqS3Tez7ZxNfqNfqKqPAlTV5VX1IgYBrIB399ifpG3AYCVtfz4CvDrJy5PMS7JzNyl8IbAjg9NM64GN3UTr/zL03u8DeyR58tC6m4FXdBOxn85gNOWxfBm4r5uA/cSuhkOSHN7sG/6sFyY5PoMrEs9gcErteuAG4AEGk9F3yGAC/6sZnF7cnO8Dzxhq78Ig2KyHwcR/BiNW06qq7zK4GOADSZ7S1fDibvNFwOlJjszALklemWS3JM9O8tIuBP+EwQjdI5vZjaRtzGAlbWeqai2DCd1vYxAI1gJ/CDyhqu4D3gRcBWxgMHl7+dB7vwF8FFjTnaLah8EE7K8xmED9WeDK
afb/MIPJ8c8Hvg38APggg8nfM+ETwG8w+D6vB47v5jM9yCBILe1q+ADwW9133JyLGcxtuifJx6vqVuAvgf/HIHQ9F7huC2p7PYM5Y99gcNHAGQBVNQH8D+CvurpXA6d079kJeFdX8/cYTHp/6xbsU9IM8gahkuasJO8EDqqqk0ddi6TtgyNWkiRJjRisJEmSGvFUoCRJUiOOWEmSJDVisJIkSWpkZE+a33PPPeuAAw4Y1e4lSZJ6u+mmm35QVQum6zeyYHXAAQcwMTExqt1LkiT1luQ7ffp5KlCSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRkb2rEANSUZdwdxXNeoKJEnbAUesJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGegWrJMckWZVkdZIzp9h+SpL1SW7uXr/TvlRJkqTZbf50HZLMA84Hfg1YB9yYZHlV3bpJ1yuratkM1ChJkjQW+oxYHQGsrqo1VfUgcAVw3MyWJUmSNH76BKt9gbVD7XXduk29LsktSa5OsqhJdZIkSWOk1eT1TwIHVNWhwD8Al07VKclpSSaSTKxfv77RriVJkmaHPsHqDmB4BGpht+5RVfXDqvpp1/wg8MKpPqiqLqyqJVW1ZMGCBY+nXkmSpFmrT7C6ETg4yYFJdgROBJYPd0iy91DzWOC2diVKkiSNh2mvCqyqjUmWAdcC84BLqmplkrOBiapaDrwpybHARuBu4JQZrFmSJGlWSlWNZMdLliypiYmJkex71klGXcHcN6J/zyVJc0OSm6pqyXT9vPO6JElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktRIr2CV5Jgkq5KsTnLmY/R7XZJKsqRdiZIkSeNh2mCVZB5wPrAUWAyclGTxFP12A34PuKF1kZIkSeOgz4jVEcDqqlpTVQ8CVwDHTdHvT4F3Az9pWJ8kSdLY6BOs9gXWDrXXdeseleQwYFFVfbphbZIkSWNlqyevJ3kC8F7g93v0PS3JRJKJ9evXb+2uJUmSZpU+weoOYNFQe2G3btJuwCHA55PcDhwFLJ9qAntVXVhVS6pqyYIFCx5/1ZIkSbNQn2B1I3BwkgOT7AicCCyf3FhVP6qqPavqgKo6ALgeOLaqJmakYkmSpFlq2mBVVRuBZcC1wG3AVVW1MsnZSY6d6QIlSZLGxfw+napqBbBik3VnbabvS7a+LEmSpPHjndclSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEZ6BaskxyRZlWR1kjOn2H56kq8nuTnJl5Isbl+qJEnS7DZtsEoyDzgfWAosBk6a
IjhdXlXPrarnA+cC721eqSRJ0izXZ8TqCGB1Va2pqgeBK4DjhjtU1b1DzV2AaleiJEnSeJjfo8++wNqh9jrgyE07Jfld4M3AjsBLm1QnSZI0RppNXq+q86vqmcBbgLdP1SfJaUkmkkysX7++1a4lSZJmhT7B6g5g0VB7Ybduc64AXjPVhqq6sKqWVNWSBQsW9K9SkiRpDPQJVjcCByc5MMmOwInA8uEOSQ4ear4S+Ga7EiVJksbDtHOsqmpjkmXAtcA84JKqWpnkbGCiqpYDy5K8DHgI2AD89kwWLUmSNBv1mbxOVa0AVmyy7qyh5d9rXJckSdLY8c7rkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjvYJVkmOSrEqyOsmZU2x/c5Jbk9yS5B+T7N++VEmSpNlt2mCVZB5wPrAUWAyclGTxJt2+CiypqkOBq4FzWxcqSZI02/UZsToCWF1Va6rqQeAK4LjhDlX1T1X1QNe8HljYtkxJkqTZr0+w2hdYO9Re163bnFOBz0y1IclpSSaSTKxfv75/lZIkSWOg6eT1JCcDS4C/mGp7VV1YVUuqasmCBQta7lqSJGnk5vfocwewaKi9sFv3M5K8DPhj4D9X1U/blCdJkjQ++oxY3QgcnOTAJDsCJwLLhzskeQFwAXBsVd3VvkxJkqTZb9pgVVUbgWXAtcBtwFVVtTLJ2UmO7br9BbAr8LEkNydZvpmPkyRJmrP6nAqkqlYAKzZZd9bQ8ssa1yVJkjR2vPO6JElSIwYrSZKkRnqdCpT0GJJRVzD3VY26AknqxRErSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1Ii3W5C0/fJWGduGt8vQdsQRK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDXSK1glOSbJqiSrk5w5xfYXJ/lKko1Jfr19mZIkSbPftMEqyTzgfGApsBg4KcniTbr9G3AKcHnrAiVJksbF/B59jgBWV9UagCRXAMcBt052qKrbu22PzECNkiRJY6FPsNoXWDvUXgccOTPlSJLUUzLqCua+qlFXMHa26eT1JKclmUgysX79+m25a0mSpBnXJ1jdASwaai/s1m2xqrqwqpZU1ZIFCxY8no+QJEmatfoEqxuBg5McmGRH4ERg+cyWJUmSNH6mDVZVtRFYBlwL3AZcVVUrk5yd5FiAJIcnWQecAFyQZOVMFi1JkjQb9Zm8TlWtAFZssu6soeUbGZwilCRJ2m5553VJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqRGDlSRJUiMGK0mSpEYMVpIkSY0YrCRJkhoxWEmSJDVisJIkSWrEYCVJktSIwUqSJKkRg5UkSVIjBitJkqRGDFaSJEmNGKwkSZIaMVhJkiQ1YrCSJElqxGAlSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJ
jRisJEmSGjFYSZIkNWKwkiRJasRgJUmS1IjBSpIkqZFewSrJMUlWJVmd5Mwptu+U5Mpu+w1JDmhdqCRJ0mw3bbBKMg84H1gKLAZOSrJ4k26nAhuq6iDgfcC7WxcqSZI02/UZsToCWF1Va6rqQeAK4LhN+hwHXNotXw0cnSTtypQkSZr9+gSrfYG1Q+113bop+1TVRuBHwB4tCpQkSRoX87flzpKcBpzWNf89yaptuX81tSfwg1EX0ZsDqMPG69iBx+9nefzGl8duvO3fp1OfYHUHsGiovbBbN1WfdUnmA08GfrjpB1XVhcCFfQrT7JZkoqqWjLoObTmP3Xjz+I0vj932oc+pwBuBg5McmGRH4ERg+SZ9lgO/3S3/OvC5qqp2ZUqSJM1+045YVdXGJMuAa4F5wCVVtTLJ2cBEVS0HLgYuS7IauJtB+JIkSdqu9JpjVVUrgBWbrDtraPknwAltS9Ms5ynd8eWxG28ev/HlsdsOxDN2kiRJbfhIG0mSpEYMVpIkSY0YrCRJkhoxWKmXJH/a3aNssv2kJH87yprUX5JnJtmpW35Jkjcl2X3UdamfDJyc5KyuvV+SI0Zdl6aXZK8kFyf5TNdenOTUUdelmWOwUl/zgRuSHJrk1xjc3+ymEdek/q4BHk5yEIMrkxYBl4+2JG2BDwC/DJzUte8Dzh9dOdoCH2Jwu6J9uva/AmeMrBrNuG36SBuNr6p6a5L/A9wAbABeXFWrR1yW+nukuyfda4Hzquq8JF8ddVHq7ciqOmzymFXVhu6GzZr99qyqq5K8FR69N+TDoy5KM8cRK/WS5MXA+4Gzgc8D5yXZ5zHfpNnkoSQnMXhCwqe6dTuMsB5tmYeSzAMKIMkC4JHRlqSe7k+yB/9x7I4CfjTakjSTHLFSX+8BTqiqWwGSHA98DvjFkValvt4AnA6cU1XfTnIgcNmIa1J/7wf+HnhaknMYPDrs7aMtST29mcFj356Z5DpgAYPjpznKG4SqlyTzqurhTdbtUVU/97BtzW5JngIsqqpbRl2L+kvyi8DRQIB/rKrbRlySeuou/Hk2g2O3qqoeGnFJmkEGK/WSZC/gz4F9q+qYJIuBX66qi0dcmnpI8nngWAaj1DcBdwHXVdWbR1mXptedAlxZVY4Oj6FudH9TPwK+XlV3bet6NPOcY6W+PsTgypa9u7ZXtoyXJ1fVvcDxwIer6kjgZSOuST10I8Wrkuw36lr0uJwKfBD4ze51EfAW4Lokrx9lYZoZzrFSX17ZMt7mJ9kb+K/AH4+6GG2xpwArk3wZuH9yZVUdO7qS1NN84DlV9X14dPT/w8CRwP/FuY5zjsFKfXlly3g7m8GI45eq6sYkzwC+OeKa1N87Rl2AHrdFk6Gqc1e37u4kzrWag5xjpV6SHAacBxwC/AvdlS1OgJakzUvyAWA/4GPdqtcB64A/BD5VVb86qto0MwxWekxJDgfWVtX3uitb3sjgD8OtwFlVdfdIC1QvSXZmMNfjl4CdJ9dX1X8fWVHqrRshPg94DrAjMA+4v6qeNNLCNK0kYTC38UXdqg3AXlX1u6OrSjPJyeuazgXAg93yf2IwP+d8Bn8cLhxVUdpilwFPB14OfAFYyOCxKBoPf8XgcTbfBJ4I/A4+0mYs1GD0Yg2wEXgt8KuAt8qYwxyx0mNK8rWqel63fD6wvqre2bVvrqrnj7I+9ZPkq1X1giS3VNWhSXYAvlhVR426Nk0vyURVLZk8ft26r1bVC0Zdm6aW5FkMwvBJwA+AK4E/qKr9R1qYZpyT1zWdeUnmV9VGBjcnPG1om//+jI/JSbL3JDkE+B7wtBHWoy3zQPdswJuTnAt8F884zHbfAL4IvGryuapJ/tdoS9K24P8wNZ2PAl9I8gngxwz+UJDkILwqcJxc2N1x/R0MHq9xK3DuaEvSFng9g7/XyxjcbmERg7mOmr2OZxCA/ynJRUkm75qvOc5TgZpWN3F2b+CzVXV/t+5ZwK5V9ZWRFifNYUn2q6p/
G3UdevyS7AIcx+CU4EsZ3MPq76vqsyMtTDPGYCXNYUke85E1VfXebVWLtlySr1TVYd3yNVXlKNUY60aNTwB+o6qOHnU9mhnOkZHmtt1GXYC2yvCpo2eMrAo1UVWTV1N7RfUcZrCS5rCq+pNR16CtUptZljRLOXld2g4kuTTJ7kPtpyS5ZJQ1qZfnJbk3yX3Aod3yvUnuS3LvqIuT9PMcsZK2D4dW1T2TjarakMR7IM1yVTVv1DVI2jKOWEnbhyd0E2cBSPJU/A8rSWrOP6zS9uEvgeuTXNW1TwDOGWE9kjQnebsFaTuRZDGD++gAfK6qbh1lPZI0FxmspDksyc7A6cBBwNeBi7vHE0mSZoDBSprDklzJ4DmBXwSWArdX1RmjrUqS5i6DlTSHJfl6VT23W54PfHnyTt6SpPa8KlCa2x6aXPAUoCTNPEespDksycPA/ZNN4InAA91yVdWTRlWbJM1FBitJkqRGPBUoSZLUiMFKkiSpEYOVJElSIwYrSZKkRgxWkiRJjfx/Ong3pjs9IOMAAAAASUVORK5CYII=\n", |
| "text/plain": [ |
| "<matplotlib.figure.Figure at 0x7fc33cf95190>" |
| ] |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| } |
| ], |
| "source": [ |
| "import matplotlib.pyplot as plt\n", |
| "%matplotlib inline\n", |
| "\n", |
| "# Plot the feature importances of the forest\n", |
| "plt.figure(figsize=(10,5))\n", |
| "plt.title(\"Feature importances\")\n", |
| "plt.bar(range(marvin_dataset[\"X_train\"].shape[1]), importances[indices], color=\"r\", align=\"center\")\n", |
| "\n", |
| "stats_order = [params[\"pred_cols\"][x] for x in indices]\n", |
| "\n", |
| "plt.xticks(range(marvin_dataset['X_train'].shape[1]), stats_order, rotation='vertical')\n", |
| "plt.xlim([-1, marvin_dataset['X_train'].shape[1]])\n", |
| "plt.show()" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Prediction Preparator" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 33, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# put this values in engine.messages to be used as dryrun samples\n", |
| "# age, class, sex\n", |
| "# reminder: 'male': 1, 'female': 0\n", |
| "input_message = {\"Age\": 50, \"Pclass\": 3, \"Sex\": 0, \"Fare\": 5}" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 34, |
| "metadata": { |
| "marvin_cell": "ppreparator" |
| }, |
| "outputs": [], |
| "source": [ |
| "# Given the input: input_message = {\"age\": 50, \"class\": 3, \"sex\": 0}\n", |
| "# Transform the message into a correctly ordered list for the model\n", |
| "\n", |
| "key_order = {\"Age\":0, \"Pclass\":1, \"Sex\":2, \"Fare\":3}\n", |
| "input_message = [input_message[i] for i in sorted(input_message, key=key_order.__getitem__)]" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Predictor" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 35, |
| "metadata": { |
| "marvin_cell": "predictor" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "{'prediction_svm': 1, 'prediction_rf': 0}\n" |
| ] |
| } |
| ], |
| "source": [ |
| "final_prediction = {\n", |
| " \"prediction_rf\": marvin_model['rf'].predict([input_message])[0],\n", |
| " \"prediction_svm\": marvin_model['svm'].predict([input_message])[0]\n", |
| "}\n", |
| "\n", |
| "print(final_prediction)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Sample Application" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 45, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Passenger Information: {'Fare': 7.8292000000000002, 'Age': 34.5, 'Pclass': 3.0, 'Sex': 1.0}\n", |
| "Prediction: {'prediction_svm': 0, 'prediction_rf': 0}\n", |
| "\n", |
| "Passenger Information: {'Fare': 7.0, 'Age': 47.0, 'Pclass': 3.0, 'Sex': 0.0}\n", |
| "Prediction: {'prediction_svm': 1, 'prediction_rf': 0}\n", |
| "\n", |
| "Passenger Information: {'Fare': 9.6875, 'Age': 62.0, 'Pclass': 2.0, 'Sex': 1.0}\n", |
| "Prediction: {'prediction_svm': 0, 'prediction_rf': 0}\n", |
| "\n", |
| "Passenger Information: {'Fare': 8.6624999999999996, 'Age': 27.0, 'Pclass': 3.0, 'Sex': 1.0}\n", |
| "Prediction: {'prediction_svm': 0, 'prediction_rf': 0}\n", |
| "\n", |
| "Passenger Information: {'Fare': 12.2875, 'Age': 22.0, 'Pclass': 3.0, 'Sex': 0.0}\n", |
| "Prediction: {'prediction_svm': 1, 'prediction_rf': 1}\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Take all of the entries in the test dataset and make predictions for them\n", |
| "passengers = marvin_dataset[\"X_test\"].to_dict(orient='records')\n", |
| "for passenger in passengers[0:5]:\n", |
| " \n", |
| " # Prediction Preparator\n", |
| " key_order = {\"Age\":0, \"Pclass\":1, \"Sex\":2, \"Fare\":3}\n", |
| " input_message = [passenger[i] for i in sorted(passenger, key=key_order.__getitem__)]\n", |
| " \n", |
| " final_prediction = {\n", |
| " \"prediction_rf\": marvin_model['rf'].predict([input_message])[0],\n", |
| " \"prediction_svm\": marvin_model['svm'].predict([input_message])[0]\n", |
| " }\n", |
| "\n", |
| " print(\"Passenger Information: {0}\\nPrediction: {1}\\n\".format(passenger, final_prediction))" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.12" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |