blob: 0322611f5ad583b5167532fd52e083aff85c8bff [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Documentation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sample"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import marvin_titanic_engine\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/vagrant/projects/titanic-engine/notebooks\n"
]
}
],
"source": [
"# Show the directory that relative paths are resolved against.\n",
"current_dir = os.getcwd()\n",
"print(current_dir)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#from marvin_python_toolbox.common.data import MarvinData\n",
"import pandas as pd\n",
"\n",
"# Load the raw CSV files; both paths are relative to the current working directory.\n",
"train_df = pd.read_csv(\n",
"    'marvin_titanic_engine/data_files/train.csv'\n",
")\n",
"# Previously this read train.csv a second time, which made the 'test' entry a\n",
"# silent duplicate of the training data.\n",
"# NOTE(review): assumes test.csv sits next to train.csv in data_files - confirm.\n",
"test_df = pd.read_csv(\n",
"    'marvin_titanic_engine/data_files/test.csv'\n",
")\n",
"\n",
"# Initial dataset artifact: raw frames keyed by split name.\n",
"marvin_initial_dataset = {\n",
"    'train': train_df,\n",
"    'test': test_df\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Preparator"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length: 1046\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV\n",
"\n",
"pred_cols = ['Age', 'Pclass', 'Sex']\n",
"dep_var = 'Survived'\n",
"\n",
"# Keep only the modelling columns and drop every row with a missing value in any of them.\n",
"train_no_na = marvin_initial_dataset['train'][\n",
"    pred_cols + [dep_var]\n",
"].dropna()\n",
"print(\"Length: {}\".format(len(train_no_na)))\n",
"\n",
"# Feature Engineering\n",
"# Take an explicit copy before encoding 'Sex': assigning into a frame that was\n",
"# sliced from train_no_na triggers pandas' SettingWithCopyWarning.\n",
"data_X = train_no_na[pred_cols].copy()\n",
"data_X['Sex'] = data_X['Sex'].map({'male': 1, 'female': 0})\n",
"data_y = train_no_na[dep_var]\n",
"\n",
"# Prepare for Stratified Shuffle Split\n",
"sss = StratifiedShuffleSplit(n_splits=5, test_size=.6, random_state=0)\n",
"\n",
"# Only one partition is used downstream: the last of the n_splits generated\n",
"# (the original loop overwrote the split on every iteration, ending on this one).\n",
"train_index, test_index = list(sss.split(data_X, data_y))[-1]\n",
"X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]\n",
"y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]\n",
"\n",
"# Prepared dataset artifact handed to the trainer and the evaluation cells.\n",
"marvin_dataset = {\n",
"    'X_train': X_train,\n",
"    'y_train': y_train,\n",
"    'X_test': X_test,\n",
"    'y_test': y_test,\n",
"    'sss': sss\n",
"}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Trainer"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Best score for training data:', 0.76315789473684215)\n",
"('Best `C`:', 100)\n",
"('Best kernel:', 'rbf')\n",
"('Best `gamma`:', 0.001)\n"
]
}
],
"source": [
"from sklearn import svm, neighbors, tree\n",
"from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import StandardScaler, scale\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"\n",
"# SVM search space. `gamma` is listed for the RBF kernel only: the linear kernel\n",
"# does not use it, so sweeping it there just repeats identical fits.\n",
"svm_parameter_candidates = [\n",
"    {'C': [1, 10, 100], 'kernel': ['linear']},\n",
"    {'C': [1, 10, 100], 'gamma': [0.01, 0.001], 'kernel': ['rbf']},\n",
"]\n",
"\n",
"# Cross-validated grid search over the SVM candidates, run on all available cores.\n",
"svm_grid = GridSearchCV(estimator=svm.SVC(), param_grid=svm_parameter_candidates, n_jobs=-1)\n",
"\n",
"# Train the classifier on training data\n",
"svm_grid.fit(\n",
"    marvin_dataset['X_train'],\n",
"    marvin_dataset['y_train']\n",
")\n",
"\n",
"# Random forest search space: a full grid over all listed parameters.\n",
"# `random_state` has a single candidate, so every forest is built with seed 0.\n",
"rf_parameter_candidates = {\n",
"    \"max_depth\": [3, None],\n",
"    \"random_state\": [0],\n",
"    \"min_samples_split\": [2, 3, 10],\n",
"    \"min_samples_leaf\": [1, 3, 10],\n",
"    \"n_estimators\": [20, 50],\n",
"    \"bootstrap\": [True, False],\n",
"    \"criterion\": [\"gini\", \"entropy\"]\n",
"}\n",
"\n",
"# run grid search (parallelised like the SVM search above; this grid is the larger one)\n",
"rf_grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_parameter_candidates, n_jobs=-1)\n",
"rf_grid.fit(\n",
"    marvin_dataset['X_train'],\n",
"    marvin_dataset['y_train']\n",
")\n",
"\n",
"# Model artifact: the fitted grid searches keyed by model type.\n",
"marvin_model = {\n",
"    'svm': svm_grid,\n",
"    'rf': rf_grid\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Type: rf\n",
"{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 20, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'entropy', 'random_state': 0, 'min_impurity_split': 1e-07, 'max_features': 'auto', 'max_depth': 3, 'class_weight': None}\n",
"Accuracy Score: 0.7703%\n",
"Classification Report:\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.89 0.81 0.85 409\n",
" 1.0 0.70 0.82 0.75 219\n",
"\n",
"avg / total 0.83 0.81 0.82 628\n",
"\n",
"Confusion Matrix:\n",
"\n",
"[[332 77]\n",
" [ 40 179]]\n",
"\n",
"\n",
"\n",
"Model Type: svm\n",
"{'kernel': 'rbf', 'C': 100, 'verbose': False, 'probability': False, 'degree': 3, 'shrinking': True, 'max_iter': -1, 'decision_function_shape': None, 'random_state': None, 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.001, 'class_weight': None}\n",
"Accuracy Score: 0.7632%\n",
"Classification Report:\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0.0 0.83 0.84 0.83 371\n",
" 1.0 0.76 0.76 0.76 257\n",
"\n",
"avg / total 0.80 0.80 0.80 628\n",
"\n",
"Confusion Matrix:\n",
"\n",
"[[310 61]\n",
" [ 62 195]]\n",
"\n",
"\n",
"\n"
]
}
],
"source": [
"from sklearn import metrics\n",
"\n",
"# .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).\n",
"for model_type, fitted_model in marvin_model.items():\n",
"    print(\"Model Type: {0}\\n{1}\".format(model_type, fitted_model.best_estimator_.get_params()))\n",
"    # best_score_ is the grid search's best mean cross-validation score, a\n",
"    # fraction; '{:.2%}' scales it by 100 so the '%' sign is truthful.\n",
"    print(\"Accuracy Score: {:.2%}\".format(fitted_model.best_score_))\n",
"\n",
"    # Predict once and reuse the result for both reports.\n",
"    y_true = marvin_dataset['y_test']\n",
"    y_pred = fitted_model.predict(marvin_dataset['X_test'])\n",
"\n",
"    # scikit-learn metrics take (y_true, y_pred) in that order; swapping them\n",
"    # exchanges precision with recall and transposes the confusion matrix.\n",
"    print(\"Classification Report:\\n\")\n",
"    print(metrics.classification_report(y_true, y_pred))\n",
"\n",
"    # Print the confusion matrix (rows: true labels, columns: predicted labels)\n",
"    print(\"Confusion Matrix:\\n\")\n",
"    print(metrics.confusion_matrix(y_true, y_pred))\n",
"    print(\"\\n\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature ranking:\n",
"1. feature sex (0.486342)\n",
"2. feature pclass (0.265882)\n",
"3. feature age (0.247776)\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7fe437993a10>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"# Read everything from the pipeline artifacts rather than loose notebook globals,\n",
"# so the names always match the columns the forest was trained on.\n",
"feature_names = list(marvin_dataset['X_train'].columns)\n",
"importances = marvin_model['rf'].best_estimator_.feature_importances_\n",
"\n",
"# Feature positions ordered from most to least important.\n",
"indices = np.argsort(importances)[::-1]\n",
"\n",
"# Print the feature ranking\n",
"print(\"Feature ranking:\")\n",
"\n",
"for rank, idx in enumerate(indices, start=1):\n",
"    print(\"%d. feature %s (%f)\" % (rank, feature_names[idx], importances[idx]))\n",
"\n",
"# Plot the feature importances of the forest\n",
"positions = range(len(indices))\n",
"stats_order = [feature_names[idx] for idx in indices]\n",
"plt.figure(figsize=(10,5))\n",
"plt.title(\"Feature importances\")\n",
"plt.bar(positions, importances[indices],\n",
"        color=\"r\", align=\"center\")\n",
"plt.xticks(positions, stats_order, rotation='vertical')\n",
"plt.xlim([-1, len(indices)])\n",
"plt.ylabel(\"Importance\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predictor"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'prediction1': 0.0, 'prediction2': 1.0}"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One passenger per row; columns follow pred_cols: Age, Pclass, Sex\n",
"# (Sex uses the training encoding: male -> 1, female -> 0).\n",
"input_message = [[50, 3, 0]]\n",
"\n",
"# Ask each fitted grid search for its prediction on the single input row.\n",
"rf_prediction = marvin_model['rf'].predict(input_message)[0]\n",
"svm_prediction = marvin_model['svm'].predict(input_message)[0]\n",
"\n",
"final_result = dict(\n",
"    prediction1=rf_prediction,\n",
"    prediction2=svm_prediction\n",
")\n",
"\n",
"final_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}