public-engines/sms-spam-engine/notebooks/.ipynb_checkpoints/marvin_solution-checkpoint.ipynb - incubator-marvin - Git at Google

 {
  "cells": [
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Acquisitor and Cleaner"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "- Download data file\n",
     "- Drop unused columns\n",
     "- Rename text (feature) and label columns\n",
     "- Map label values to 0 (ham) and 1 (spam) in a new `label_num` column"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "from marvin_python_toolbox.common.data import MarvinData\n",
     "import pandas as pd"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
     "marvin_cell": "acquisitor"
    },
    "outputs": [],
    "source": [
     "data_file = MarvinData.download_file(\"https://s3.amazonaws.com/marvin-engines-data/spam.csv\")\n",
     "data = pd.read_csv(data_file, encoding='latin-1')\n",
     "data = data.drop([\"Unnamed: 2\", \"Unnamed: 3\", \"Unnamed: 4\"], axis=1)\n",
     "data = data.rename(columns={\"v1\": \"label\", \"v2\": \"text\"})\n",
     "data['label_num'] = data.label.map({'ham': 0, 'spam': 1})\n",
     "\n",
     "marvin_initial_dataset = data"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Training Preparator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "- Split the text and label data into train and test sets\n",
     "- Transform the text using the sklearn.feature_extraction library\n",
     "- Learn a vocabulary dictionary of all tokens in the training documents"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.feature_extraction.text import CountVectorizer\n",
     "from sklearn.model_selection import train_test_split"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "params = {\"test_size\": 0.3, \"random_state\": 10}"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {
     "marvin_cell": "tpreparator"
    },
    "outputs": [],
    "source": [
     "X_train, X_test, y_train, y_test = train_test_split(\n",
     "            marvin_initial_dataset[\"text\"], marvin_initial_dataset[\"label\"],\n",
     "            test_size=params[\"test_size\"], random_state=params[\"random_state\"])\n",
     "\n",
     "vect = CountVectorizer()\n",
     "vect.fit(X_train)\n",
     "\n",
     "marvin_dataset = {\n",
     "    \"X_train\": vect.transform(X_train),\n",
     "    \"X_test\": vect.transform(X_test),\n",
     "    \"y_train\": y_train,\n",
     "    \"y_test\": y_test,\n",
     "    \"vect\": vect\n",
     "    }"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Trainer"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Create the classifier.\n",
     "Multinomial Naive Bayes performs well on text data."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.naive_bayes import MultinomialNB"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {
     "marvin_cell": "trainer"
    },
    "outputs": [],
    "source": [
     "clf = MultinomialNB()\n",
     "clf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n",
     "\n",
     "marvin_model = {\n",
     "    \"clf\": clf,\n",
     "    \"vect\": marvin_dataset[\"vect\"]\n",
     "}"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Metrics Evaluator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Find prediction accuracy using the sklearn.metrics library"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.metrics import accuracy_score"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {
     "marvin_cell": "evaluator"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Prediction accuracy: 0.989234449761\n"
      ]
     }
    ],
    "source": [
     "prediction = marvin_model[\"clf\"].predict(marvin_dataset[\"X_test\"])\n",
     "metrics = accuracy_score(prediction, marvin_dataset[\"y_test\"])\n",
     "\n",
     "marvin_metrics = metrics\n",
     "\n",
     "print(\"Prediction accuracy: \" + str(metrics))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Prediction Preparator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Input message is processed by CountVectorizer before going to predictor"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
     "input_message = [\"This is me.....\"]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 17,
    "metadata": {
     "marvin_cell": "ppreparator"
    },
    "outputs": [],
    "source": [
     "input_message = marvin_model[\"vect\"].transform(input_message)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Predictor"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Do prediction"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 18,
    "metadata": {
     "marvin_cell": "predictor"
    },
    "outputs": [],
    "source": [
     "final_prediction = marvin_model[\"clf\"].predict(input_message)[0]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Predicted value: ham\n"
      ]
     }
    ],
    "source": [
     "print(\"Predicted value: \" + final_prediction)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 2",
    "language": "python",
    "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
     "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
    "version": "2.7.12"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Acquisitor and Cleaner"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- Download data file\n",
	"- Drop unused columns\n",
	"- Rename text (feature) and label columns\n",
	"- Map label values to 0 (ham) and 1 (spam) in a new `label_num` column"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"from marvin_python_toolbox.common.data import MarvinData\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"marvin_cell": "acquisitor"
	},
	"outputs": [],
	"source": [
	"data_file = MarvinData.download_file(\"https://s3.amazonaws.com/marvin-engines-data/spam.csv\")\n",
	"data = pd.read_csv(data_file, encoding='latin-1')\n",
	"data = data.drop([\"Unnamed: 2\", \"Unnamed: 3\", \"Unnamed: 4\"], axis=1)\n",
	"data = data.rename(columns={\"v1\": \"label\", \"v2\": \"text\"})\n",
	"data['label_num'] = data.label.map({'ham': 0, 'spam': 1})\n",
	"\n",
	"marvin_initial_dataset = data"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Training Preparator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- Split the text and label data into train and test sets\n",
	"- Transform the text using the sklearn.feature_extraction library\n",
	"- Learn a vocabulary dictionary of all tokens in the training documents"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.feature_extraction.text import CountVectorizer\n",
	"from sklearn.model_selection import train_test_split"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {\"test_size\": 0.3, \"random_state\": 10}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"marvin_cell": "tpreparator"
	},
	"outputs": [],
	"source": [
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	" marvin_initial_dataset[\"text\"], marvin_initial_dataset[\"label\"],\n",
	" test_size=params[\"test_size\"], random_state=params[\"random_state\"])\n",
	"\n",
	"vect = CountVectorizer()\n",
	"vect.fit(X_train)\n",
	"\n",
	"marvin_dataset = {\n",
	" \"X_train\": vect.transform(X_train),\n",
	" \"X_test\": vect.transform(X_test),\n",
	" \"y_train\": y_train,\n",
	" \"y_test\": y_test,\n",
	" \"vect\": vect\n",
	" }"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Trainer"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create the classifier.\n",
	"Multinomial Naive Bayes performs well on text data."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.naive_bayes import MultinomialNB"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"marvin_cell": "trainer"
	},
	"outputs": [],
	"source": [
	"clf = MultinomialNB()\n",
	"clf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n",
	"\n",
	"marvin_model = {\n",
	" \"clf\": clf,\n",
	" \"vect\": marvin_dataset[\"vect\"]\n",
	"}"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Metrics Evaluator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Find prediction accuracy using the sklearn.metrics library"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.metrics import accuracy_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"marvin_cell": "evaluator"
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Prediction accuracy: 0.989234449761\n"
	]
	}
	],
	"source": [
	"prediction = marvin_model[\"clf\"].predict(marvin_dataset[\"X_test\"])\n",
	"metrics = accuracy_score(prediction, marvin_dataset[\"y_test\"])\n",
	"\n",
	"marvin_metrics = metrics\n",
	"\n",
	"print(\"Prediction accuracy: \" + str(metrics))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Prediction Preparator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Input message is processed by CountVectorizer before going to predictor"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"input_message = [\"This is me.....\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"marvin_cell": "ppreparator"
	},
	"outputs": [],
	"source": [
	"input_message = marvin_model[\"vect\"].transform(input_message)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Predictor"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Do prediction"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"marvin_cell": "predictor"
	},
	"outputs": [],
	"source": [
	"final_prediction = marvin_model[\"clf\"].predict(input_message)[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Predicted value: ham\n"
	]
	}
	],
	"source": [
	"print(\"Predicted value: \" + final_prediction)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}