blob: 1ea8bb4fe6ab4caba83be199b05b302567de6cff [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Acquisitor and Cleaner"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Download the data file\n",
"- Drop unused columns\n",
"- Rename the text (feature) and label columns\n",
"- Add a numeric `label_num` column (ham = 0, spam = 1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from marvin_python_toolbox.common.data import MarvinData\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"marvin_cell": "acquisitor"
},
"outputs": [],
"source": [
"# Fetch the spam CSV through MarvinData's download helper.\n",
"spam_csv_path = MarvinData.download_file(\"https://s3.amazonaws.com/marvin-engines-data/spam.csv\")\n",
"\n",
"# Load the file, discard the three unnamed columns and give the two remaining\n",
"# columns readable names.\n",
"messages = (\n",
"    pd.read_csv(spam_csv_path, encoding='latin-1')\n",
"    .drop([\"Unnamed: 2\", \"Unnamed: 3\", \"Unnamed: 4\"], axis=1)\n",
"    .rename(columns={\"v1\": \"label\", \"v2\": \"text\"})\n",
")\n",
"\n",
"# Add a numeric copy of the label: ham -> 0, spam -> 1.\n",
"messages['label_num'] = messages[\"label\"].map({'ham': 0, 'spam': 1})\n",
"\n",
"marvin_initial_dataset = messages"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training Preparator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Split the text and label data into train and test sets\n",
"- Transform the text using the sklearn.feature_extraction library\n",
"- Learn a vocabulary dictionary of all tokens in the raw training documents"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Settings forwarded to train_test_split: the test-set fraction and the seed\n",
"# that keeps the split repeatable.\n",
"params = {\n",
"    \"test_size\": 0.3,\n",
"    \"random_state\": 10,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"marvin_cell": "tpreparator"
},
"outputs": [],
"source": [
"# Hold out params[\"test_size\"] of the rows for evaluation; the fixed\n",
"# random_state makes the split repeatable.\n",
"texts = marvin_initial_dataset[\"text\"]\n",
"labels = marvin_initial_dataset[\"label\"]\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    texts, labels,\n",
"    test_size=params[\"test_size\"],\n",
"    random_state=params[\"random_state\"])\n",
"\n",
"# Learn the vocabulary from the training texts only, then encode both splits\n",
"# with that same fitted vectorizer.\n",
"vect = CountVectorizer().fit(X_train)\n",
"X_train_counts = vect.transform(X_train)\n",
"X_test_counts = vect.transform(X_test)\n",
"\n",
"marvin_dataset = {\n",
"    \"X_train\": X_train_counts,\n",
"    \"X_test\": X_test_counts,\n",
"    \"y_train\": y_train,\n",
"    \"y_test\": y_test,\n",
"    \"vect\": vect,\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Trainer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Create the classifier\n",
"- Multinomial Naive Bayes performs well on text data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"marvin_cell": "trainer"
},
"outputs": [],
"source": [
"# Fit a multinomial Naive Bayes classifier on the vectorized training split.\n",
"train_features = marvin_dataset['X_train']\n",
"train_labels = marvin_dataset['y_train']\n",
"clf = MultinomialNB().fit(train_features, train_labels)\n",
"\n",
"# Keep the fitted vectorizer next to the classifier so both travel together\n",
"# in the model object.\n",
"marvin_model = {\"clf\": clf, \"vect\": marvin_dataset[\"vect\"]}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metrics Evaluator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Find the prediction accuracy using the sklearn.metrics library"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"marvin_cell": "evaluator"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction accuracy: 0.989234449761\n"
]
}
],
"source": [
"# Score the classifier on the held-out split.\n",
"prediction = marvin_model[\"clf\"].predict(marvin_dataset[\"X_test\"])\n",
"\n",
"# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the\n",
"# held-out labels first and the model's predictions second.\n",
"metrics = accuracy_score(marvin_dataset[\"y_test\"], prediction)\n",
"\n",
"marvin_metrics = metrics\n",
"\n",
"print(\"Prediction accuracy: \" + str(metrics))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prediction Preparator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Input message is processed by CountVectorizer before going to predictor"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Example raw message, wrapped in a list because the vectorizer's transform()\n",
"# takes an iterable of documents rather than a bare string.\n",
"input_message = [\n",
"    \"This is me.....\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"marvin_cell": "ppreparator"
},
"outputs": [],
"source": [
"# Encode the raw message with the vectorizer stored in the model object.\n",
"vectorizer = marvin_model[\"vect\"]\n",
"input_message = vectorizer.transform(input_message)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predictor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Do prediction"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"marvin_cell": "predictor"
},
"outputs": [],
"source": [
"# predict() returns one label per input row; keep the first entry as the\n",
"# final prediction.\n",
"predicted_labels = marvin_model[\"clf\"].predict(input_message)\n",
"final_prediction = predicted_labels[0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted value: ham\n"
]
}
],
"source": [
"# str() keeps the concatenation valid even when the predicted label is not a\n",
"# string (e.g. a numeric class), matching how the accuracy is printed.\n",
"print(\"Predicted value: \" + str(final_prediction))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}