public-engines/iris-species-engine/notebooks/sample.ipynb - incubator-marvin - Git at Google

 {
  "cells": [
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Iris from Kaggle to Marvin engine template"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Data Handler"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Acquisitor and Cleaning Action"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from marvin_python_toolbox.common.data import MarvinData\n",
     "\n",
     "# Download the Iris CSV through Marvin's helper; the returned value is what read_csv opens below.\n",
     "file_path = MarvinData.download_file(url=\"https://s3.amazonaws.com/marvin-engines-data/Iris.csv\")\n",
     "\n",
     "# Load the CSV and discard the 'Id' column in one non-mutating chain.\n",
     "iris = pd.read_csv(file_path).drop('Id', axis=1)\n",
     "\n",
     "# Name under which the following cells read the cleaned frame.\n",
     "initial_dataset = iris"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Training Preparator Action"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
     "# Split settings shared by all three dataset variants; the fixed seed keeps every\n",
     "# split (and therefore every downstream metric) reproducible on a fresh run.\n",
     "TEST_SIZE = 0.3\n",
     "SEED = 0\n",
     "\n",
     "# Feature columns per variant. The petal and sepal models are fitted with the width\n",
     "# column first, so prediction inputs have to follow the same order.\n",
     "JOINED_FEATURES = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']\n",
     "PETAL_FEATURES = ['PetalWidthCm', 'PetalLengthCm']\n",
     "SEPAL_FEATURES = ['SepalWidthCm', 'SepalLengthCm']\n",
     "\n",
     "# Joined dataset: all four measurements\n",
     "train, test = train_test_split(initial_dataset, test_size=TEST_SIZE, random_state=SEED)\n",
     "\n",
     "train_X = train[JOINED_FEATURES]\n",
     "train_y = train.Species\n",
     "\n",
     "test_X = test[JOINED_FEATURES]\n",
     "test_y = test.Species\n",
     "\n",
     "# Separated datasets: petal-only and sepal-only measurements\n",
     "petal = initial_dataset[['PetalLengthCm', 'PetalWidthCm', 'Species']]\n",
     "sepal = initial_dataset[['SepalLengthCm', 'SepalWidthCm', 'Species']]\n",
     "\n",
     "train_p, test_p = train_test_split(petal, test_size=TEST_SIZE, random_state=SEED)\n",
     "train_x_p = train_p[PETAL_FEATURES]\n",
     "train_y_p = train_p.Species\n",
     "test_x_p = test_p[PETAL_FEATURES]\n",
     "test_y_p = test_p.Species\n",
     "\n",
     "train_s, test_s = train_test_split(sepal, test_size=TEST_SIZE, random_state=SEED)\n",
     "train_x_s = train_s[SEPAL_FEATURES]\n",
     "train_y_s = train_s.Species\n",
     "test_x_s = test_s[SEPAL_FEATURES]\n",
     "test_y_s = test_s.Species\n",
     "\n",
     "# Train/test splits keyed by variant name; later cells look variants up by these keys.\n",
     "dataset = {\n",
     "    'petals' : {'train_X': train_x_p, 'train_y': train_y_p, 'test_X': test_x_p, 'test_y': test_y_p},\n",
     "    'sepals' : {'train_X': train_x_s, 'train_y': train_y_s, 'test_X': test_x_s, 'test_y': test_y_s},\n",
     "    'joined' : {'train_X': train_X, 'train_y': train_y, 'test_X': test_X, 'test_y': test_y}\n",
     "}"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Training Action"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn import svm\n",
     "from sklearn.linear_model import LogisticRegression\n",
     "from sklearn.tree import DecisionTreeClassifier\n",
     "from sklearn.neighbors import KNeighborsClassifier\n",
     "\n",
     "# Estimator classes keyed by the short prefix used in the model names.\n",
     "algorithms = {\n",
     "    'svm': svm.SVC,\n",
     "    'lr': LogisticRegression,\n",
     "    'dt': DecisionTreeClassifier,\n",
     "    'knn': KNeighborsClassifier\n",
     "}\n",
     "\n",
     "# Fit one default-configured estimator per (algorithm, dataset variant) pair and\n",
     "# store it under '<algorithm>_<variant>'.\n",
     "_model = {}\n",
     "for name, algorithm in algorithms.items():\n",
     "    for variant in ('petals', 'sepals', 'joined'):\n",
     "        split = dataset[variant]\n",
     "        _model[name + '_' + variant] = algorithm().fit(split['train_X'], split['train_y'])\n",
     "\n",
     "model = _model"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Evaluator action"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn import metrics as sk_metrics\n",
     "\n",
     "# Accuracy of every fitted model on the test split of the dataset variant it was\n",
     "# trained on; the variant is the part of the model name after the last underscore.\n",
     "_metrics = {}\n",
     "\n",
     "for m, estimator in model.items():\n",
     "    dataset_key = m.rsplit(\"_\", 1)[-1]\n",
     "\n",
     "    _test_X = dataset[dataset_key]['test_X']\n",
     "    _test_y = dataset[dataset_key]['test_y']\n",
     "\n",
     "    # Predict once and score with the true labels first, as accuracy_score(y_true, y_pred) expects.\n",
     "    prediction = estimator.predict(_test_X)\n",
     "    _metrics[m] = sk_metrics.accuracy_score(_test_y, prediction)\n",
     "\n",
     "# Rank by accuracy, highest first; ties fall back to the model name (also descending).\n",
     "# items() plus an indexing key keeps this line valid on both Python 2 and Python 3.\n",
     "_metrics = sorted(_metrics.items(), key=lambda item: (item[1], item[0]), reverse=True)\n",
     "\n",
     "# best_model is the top (name, accuracy) pair; all_metrics is the full ranked list.\n",
     "metrics = {\n",
     "    \"best_model\" : _metrics[0],\n",
     "    \"all_metrics\": _metrics\n",
     "}"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Prediction Preparator (not used in this example)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Prediction"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "{'svm_petals_prediction': 'Iris-virginica',\n",
        " 'svm_sepals_prediction': 'Iris-virginica'}"
       ]
      },
      "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "input_message = ['12', '34']\n",
     "\n",
     "# Estimators expect a 2-D (n_samples, n_features) array, so the single sample is\n",
     "# wrapped in an outer list and its values are converted to floats. The values are\n",
     "# read in the order the petal/sepal models were fitted with: width, then length.\n",
     "features = [[float(value) for value in input_message]]\n",
     "\n",
     "final_result = {\n",
     "    \"svm_petals_prediction\": model['svm_petals'].predict(features)[0],\n",
     "    \"svm_sepals_prediction\": model['svm_sepals'].predict(features)[0]\n",
     "}\n",
     "\n",
     "final_result"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 2",
    "language": "python",
    "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
     "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
    "version": "2.7.12"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Iris from Kaggle to Marvin engine template"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Data Handler"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Acquisitor and Cleaning Action"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"from marvin_python_toolbox.common.data import MarvinData\n",
	"\n",
	"# Download the Iris CSV through Marvin's helper; the returned value is what read_csv opens below.\n",
	"file_path = MarvinData.download_file(url=\"https://s3.amazonaws.com/marvin-engines-data/Iris.csv\")\n",
	"\n",
	"# Load the CSV and discard the 'Id' column in one non-mutating chain.\n",
	"iris = pd.read_csv(file_path).drop('Id', axis=1)\n",
	"\n",
	"# Name under which the following cells read the cleaned frame.\n",
	"initial_dataset = iris"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Training Preparator Action"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"# Split settings shared by all three dataset variants; the fixed seed keeps every\n",
	"# split (and therefore every downstream metric) reproducible on a fresh run.\n",
	"TEST_SIZE = 0.3\n",
	"SEED = 0\n",
	"\n",
	"# Feature columns per variant. The petal and sepal models are fitted with the width\n",
	"# column first, so prediction inputs have to follow the same order.\n",
	"JOINED_FEATURES = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']\n",
	"PETAL_FEATURES = ['PetalWidthCm', 'PetalLengthCm']\n",
	"SEPAL_FEATURES = ['SepalWidthCm', 'SepalLengthCm']\n",
	"\n",
	"# Joined dataset: all four measurements\n",
	"train, test = train_test_split(initial_dataset, test_size=TEST_SIZE, random_state=SEED)\n",
	"\n",
	"train_X = train[JOINED_FEATURES]\n",
	"train_y = train.Species\n",
	"\n",
	"test_X = test[JOINED_FEATURES]\n",
	"test_y = test.Species\n",
	"\n",
	"# Separated datasets: petal-only and sepal-only measurements\n",
	"petal = initial_dataset[['PetalLengthCm', 'PetalWidthCm', 'Species']]\n",
	"sepal = initial_dataset[['SepalLengthCm', 'SepalWidthCm', 'Species']]\n",
	"\n",
	"train_p, test_p = train_test_split(petal, test_size=TEST_SIZE, random_state=SEED)\n",
	"train_x_p = train_p[PETAL_FEATURES]\n",
	"train_y_p = train_p.Species\n",
	"test_x_p = test_p[PETAL_FEATURES]\n",
	"test_y_p = test_p.Species\n",
	"\n",
	"train_s, test_s = train_test_split(sepal, test_size=TEST_SIZE, random_state=SEED)\n",
	"train_x_s = train_s[SEPAL_FEATURES]\n",
	"train_y_s = train_s.Species\n",
	"test_x_s = test_s[SEPAL_FEATURES]\n",
	"test_y_s = test_s.Species\n",
	"\n",
	"# Train/test splits keyed by variant name; later cells look variants up by these keys.\n",
	"dataset = {\n",
	"    'petals' : {'train_X': train_x_p, 'train_y': train_y_p, 'test_X': test_x_p, 'test_y': test_y_p},\n",
	"    'sepals' : {'train_X': train_x_s, 'train_y': train_y_s, 'test_X': test_x_s, 'test_y': test_y_s},\n",
	"    'joined' : {'train_X': train_X, 'train_y': train_y, 'test_X': test_X, 'test_y': test_y}\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Training Action"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn import svm\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.tree import DecisionTreeClassifier\n",
	"from sklearn.neighbors import KNeighborsClassifier\n",
	"\n",
	"# Estimator classes keyed by the short prefix used in the model names.\n",
	"algorithms = {\n",
	"    'svm': svm.SVC,\n",
	"    'lr': LogisticRegression,\n",
	"    'dt': DecisionTreeClassifier,\n",
	"    'knn': KNeighborsClassifier\n",
	"}\n",
	"\n",
	"# Fit one default-configured estimator per (algorithm, dataset variant) pair and\n",
	"# store it under '<algorithm>_<variant>'.\n",
	"_model = {}\n",
	"for name, algorithm in algorithms.items():\n",
	"    for variant in ('petals', 'sepals', 'joined'):\n",
	"        split = dataset[variant]\n",
	"        _model[name + '_' + variant] = algorithm().fit(split['train_X'], split['train_y'])\n",
	"\n",
	"model = _model"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Evaluator action"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn import metrics as sk_metrics\n",
	"\n",
	"# Accuracy of every fitted model on the test split of the dataset variant it was\n",
	"# trained on; the variant is the part of the model name after the last underscore.\n",
	"_metrics = {}\n",
	"\n",
	"for m, estimator in model.items():\n",
	"    dataset_key = m.rsplit(\"_\", 1)[-1]\n",
	"\n",
	"    _test_X = dataset[dataset_key]['test_X']\n",
	"    _test_y = dataset[dataset_key]['test_y']\n",
	"\n",
	"    # Predict once and score with the true labels first, as accuracy_score(y_true, y_pred) expects.\n",
	"    prediction = estimator.predict(_test_X)\n",
	"    _metrics[m] = sk_metrics.accuracy_score(_test_y, prediction)\n",
	"\n",
	"# Rank by accuracy, highest first; ties fall back to the model name (also descending).\n",
	"# items() plus an indexing key keeps this line valid on both Python 2 and Python 3.\n",
	"_metrics = sorted(_metrics.items(), key=lambda item: (item[1], item[0]), reverse=True)\n",
	"\n",
	"# best_model is the top (name, accuracy) pair; all_metrics is the full ranked list.\n",
	"metrics = {\n",
	"    \"best_model\" : _metrics[0],\n",
	"    \"all_metrics\": _metrics\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Prediction Preparator (not used in this example)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Prediction"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'svm_petals_prediction': 'Iris-virginica',\n",
	" 'svm_sepals_prediction': 'Iris-virginica'}"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"input_message = ['12', '34']\n",
	"\n",
	"# Estimators expect a 2-D (n_samples, n_features) array, so the single sample is\n",
	"# wrapped in an outer list and its values are converted to floats. The values are\n",
	"# read in the order the petal/sepal models were fitted with: width, then length.\n",
	"features = [[float(value) for value in input_message]]\n",
	"\n",
	"final_result = {\n",
	"    \"svm_petals_prediction\": model['svm_petals'].predict(features)[0],\n",
	"    \"svm_sepals_prediction\": model['svm_sepals'].predict(features)[0]\n",
	"}\n",
	"\n",
	"final_result"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}