public-engines/nlp-ner-engine/notebooks/NLP_Marvin_Solution.ipynb - incubator-marvin - Git at Google

 {
  "cells": [
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Acquisitor and Cleaner"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Download the dataset and store it in `marvin_initial_dataset`."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import nltk\n",
     "import os"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {
     "marvin_cell": "acquisitor"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "[nltk_data] Downloading package conll2002 to /home/zhang/nltk_data...\n",
       "[nltk_data]   Package conll2002 is already up-to-date!\n"
      ]
     }
    ],
    "source": [
     "# Fetch the CoNLL-2002 corpus through NLTK.\n",
     "nltk.download('conll2002')\n",
     "\n",
     "# Materialise the Spanish train / test splits as lists of tagged sentences.\n",
     "corpus_reader = nltk.corpus.conll2002\n",
     "train_sents = list(corpus_reader.iob_sents('esp.train'))\n",
     "test_sents = list(corpus_reader.iob_sents('esp.testb'))\n",
     "\n",
     "marvin_initial_dataset = dict(\n",
     "    train_sents=train_sents,\n",
     "    test_sents=test_sents,\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "[[(u'Melbourne', u'NP', u'B-LOC'), (u'(', u'Fpa', u'O'), (u'Australia', u'NP', u'B-LOC'), (u')', u'Fpt', u'O'), (u',', u'Fc', u'O'), (u'25', u'Z', u'O'), (u'may', u'NC', u'O'), (u'(', u'Fpa', u'O'), (u'EFE', u'NC', u'B-ORG'), (u')', u'Fpt', u'O'), (u'.', u'Fp', u'O')], [(u'-', u'Fg', u'O')]]\n"
      ]
     }
    ],
    "source": [
     "# Peek at the first two training sentences.\n",
     "print(train_sents[:2])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Training Preparator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Feature engineering: the initial datasets are split into feature datasets and label datasets."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
     "marvin_cell": "tpreparator"
    },
    "outputs": [],
    "source": [
     "def word2features(sent, i):\n",
     "    \"\"\"Build the feature dict for the token at index ``i`` of ``sent``.\n",
     "\n",
     "    Every item of ``sent`` is read as ``(word, POS tag, ...)``. The dict describes\n",
     "    the token itself and its direct neighbours; ``'BOS'`` / ``'EOS'`` are set\n",
     "    instead when the left / right neighbour does not exist.\n",
     "    \"\"\"\n",
     "    token, tag = sent[i][0], sent[i][1]\n",
     "\n",
     "    features = {\n",
     "        'bias': 1.0,\n",
     "        'word.lower()': token.lower(),\n",
     "        'word[-3:]': token[-3:],\n",
     "        'word[-2:]': token[-2:],\n",
     "        'word.isupper()': token.isupper(),\n",
     "        'word.istitle()': token.istitle(),\n",
     "        'word.isdigit()': token.isdigit(),\n",
     "        'postag': tag,\n",
     "        'postag[:2]': tag[:2],\n",
     "    }\n",
     "\n",
     "    # Left context: flag the sentence start, otherwise describe the previous token.\n",
     "    if i <= 0:\n",
     "        features['BOS'] = True\n",
     "    else:\n",
     "        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n",
     "        features['-1:word.lower()'] = prev_token.lower()\n",
     "        features['-1:word.istitle()'] = prev_token.istitle()\n",
     "        features['-1:word.isupper()'] = prev_token.isupper()\n",
     "        features['-1:postag'] = prev_tag\n",
     "        features['-1:postag[:2]'] = prev_tag[:2]\n",
     "\n",
     "    # Right context: flag the sentence end, otherwise describe the next token.\n",
     "    if i >= len(sent) - 1:\n",
     "        features['EOS'] = True\n",
     "    else:\n",
     "        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n",
     "        features['+1:word.lower()'] = next_token.lower()\n",
     "        features['+1:word.istitle()'] = next_token.istitle()\n",
     "        features['+1:word.isupper()'] = next_token.isupper()\n",
     "        features['+1:postag'] = next_tag\n",
     "        features['+1:postag[:2]'] = next_tag[:2]\n",
     "\n",
     "    return features\n",
     "\n",
     "\n",
     "def sent2features(sent):\n",
     "    \"\"\"Return one feature dict per token of ``sent``, in token order.\"\"\"\n",
     "    per_token = []\n",
     "    for position in range(len(sent)):\n",
     "        per_token.append(word2features(sent, position))\n",
     "    return per_token\n",
     "\n",
     "def sent2labels(sent):\n",
     "    \"\"\"Return the third field (the label) of every 3-tuple in ``sent``.\"\"\"\n",
     "    tags = []\n",
     "    for _word, _pos, tag in sent:\n",
     "        tags.append(tag)\n",
     "    return tags\n",
     "\n",
     "# Build parallel feature / label sequences for each split.\n",
     "train_split = marvin_initial_dataset['train_sents']\n",
     "test_split = marvin_initial_dataset['test_sents']\n",
     "\n",
     "X_train = [sent2features(s) for s in train_split]\n",
     "y_train = [sent2labels(s) for s in train_split]\n",
     "\n",
     "X_test = [sent2features(s) for s in test_split]\n",
     "y_test = [sent2labels(s) for s in test_split]\n",
     "\n",
     "marvin_dataset = dict(\n",
     "    X_train=X_train,\n",
     "    y_train=y_train,\n",
     "    X_test=X_test,\n",
     "    y_test=y_test,\n",
     ")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Trainer"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Model training."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "import sklearn_crfsuite"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {
     "marvin_cell": "trainer"
    },
    "outputs": [],
    "source": [
     "# Hyper-parameters are kept together so they are easy to spot and tune.\n",
     "crf_settings = {\n",
     "    'algorithm': 'lbfgs',\n",
     "    'c1': 0.10789964607864502,\n",
     "    'c2': 0.082422264927260847,\n",
     "    'max_iterations': 100,\n",
     "    'all_possible_transitions': True,\n",
     "}\n",
     "crf = sklearn_crfsuite.CRF(**crf_settings)\n",
     "crf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n",
     "\n",
     "marvin_model = dict(crf=crf)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Metrics Evaluator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Creating evaluation metrics for trained model."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn_crfsuite import scorers\n",
     "from sklearn_crfsuite import metrics"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {
     "marvin_cell": "evaluator"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Balanced F-score: 0.797607625209\n",
       "\n",
       "Classification Report: \n",
       "             precision    recall  f1-score   support\n",
       "\n",
       "      B-LOC      0.806     0.784     0.795      1084\n",
       "      I-LOC      0.697     0.631     0.662       325\n",
       "     B-MISC      0.749     0.555     0.637       339\n",
       "     I-MISC      0.743     0.582     0.653       557\n",
       "      B-ORG      0.807     0.835     0.821      1400\n",
       "      I-ORG      0.841     0.800     0.820      1104\n",
       "      B-PER      0.845     0.887     0.865       735\n",
       "      I-PER      0.894     0.940     0.916       634\n",
       "\n",
       "avg / total      0.812     0.788     0.798      6178\n",
       "\n"
      ]
     }
    ],
    "source": [
     "crf_model = marvin_model['crf']\n",
     "\n",
     "# Score every class the model knows except the 'O' tag.\n",
     "labels = list(crf_model.classes_)\n",
     "labels.remove('O')\n",
     "y_pred = crf_model.predict(marvin_dataset['X_test'])\n",
     "\n",
     "score = metrics.flat_f1_score(\n",
     "    marvin_dataset['y_test'], y_pred, average='weighted', labels=labels\n",
     ")\n",
     "\n",
     "# Order the report rows by the tag text after its first character, then by that\n",
     "# first character, so e.g. 'B-LOC' and 'I-LOC' end up next to each other.\n",
     "sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))\n",
     "report = metrics.flat_classification_report(\n",
     "    marvin_dataset['y_test'], y_pred, labels=sorted_labels, digits=3\n",
     ")\n",
     "\n",
     "marvin_metrics = dict(score=score, report=report)\n",
     "\n",
     "print('Balanced F-score: ' + str(score))\n",
     "print('\\nClassification Report: \\n' + str(report))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Prediction Preparator"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Apply the feature engineering method to `input_message`, preparing it for prediction."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
     "# One sample sentence as (word, POS tag, label) triples.\n",
     "input_message = [\n",
     "    (u'Melbourne', u'NP', u'B-LOC'),\n",
     "    (u'(', u'Fpa', u'O'),\n",
     "    (u'Australia', u'NP', u'B-LOC'),\n",
     "    (u')', u'Fpt', u'O'),\n",
     "    (u',', u'Fc', u'O'),\n",
     "    (u'25', u'Z', u'O'),\n",
     "    (u'may', u'NC', u'O'),\n",
     "    (u'(', u'Fpa', u'O'),\n",
     "    (u'EFE', u'NC', u'B-ORG'),\n",
     "    (u')', u'Fpt', u'O'),\n",
     "    (u'.', u'Fp', u'O'),\n",
     "]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "[u'B-LOC', u'O', u'B-LOC', u'O', u'O', u'O', u'O', u'O', u'B-ORG', u'O', u'O']\n"
      ]
     }
    ],
    "source": [
     "# Labels of the sample sentence (third field of each tuple).\n",
     "input_label = sent2labels(input_message)\n",
     "print(input_label)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "The feature engineering methods are implemented again here in the Prediction Preparator because, in the Marvin code structure, each action is separated."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {
     "marvin_cell": "ppreparator"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "{'+1:word.isupper()': False, 'word.isupper()': False, 'BOS': True, 'word[-2:]': u'ne', '+1:postag': u'Fpa', 'word.isdigit()': False, 'postag': u'NP', 'bias': 1.0, 'postag[:2]': u'NP', '+1:word.lower()': u'(', '+1:word.istitle()': False, 'word.istitle()': True, 'word.lower()': u'melbourne', 'word[-3:]': u'rne', '+1:postag[:2]': u'Fp'}\n"
      ]
     }
    ],
    "source": [
     "def word2features(sent, i):\n",
     "    \"\"\"Build the feature dict for the token at index ``i`` of ``sent``.\n",
     "\n",
     "    Every item of ``sent`` is read as ``(word, POS tag, ...)``. The dict describes\n",
     "    the token itself and its direct neighbours; ``'BOS'`` / ``'EOS'`` are set\n",
     "    instead when the left / right neighbour does not exist.\n",
     "    \"\"\"\n",
     "    token, tag = sent[i][0], sent[i][1]\n",
     "\n",
     "    features = {\n",
     "        'bias': 1.0,\n",
     "        'word.lower()': token.lower(),\n",
     "        'word[-3:]': token[-3:],\n",
     "        'word[-2:]': token[-2:],\n",
     "        'word.isupper()': token.isupper(),\n",
     "        'word.istitle()': token.istitle(),\n",
     "        'word.isdigit()': token.isdigit(),\n",
     "        'postag': tag,\n",
     "        'postag[:2]': tag[:2],\n",
     "    }\n",
     "\n",
     "    # Left context: flag the sentence start, otherwise describe the previous token.\n",
     "    if i <= 0:\n",
     "        features['BOS'] = True\n",
     "    else:\n",
     "        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n",
     "        features['-1:word.lower()'] = prev_token.lower()\n",
     "        features['-1:word.istitle()'] = prev_token.istitle()\n",
     "        features['-1:word.isupper()'] = prev_token.isupper()\n",
     "        features['-1:postag'] = prev_tag\n",
     "        features['-1:postag[:2]'] = prev_tag[:2]\n",
     "\n",
     "    # Right context: flag the sentence end, otherwise describe the next token.\n",
     "    if i >= len(sent) - 1:\n",
     "        features['EOS'] = True\n",
     "    else:\n",
     "        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n",
     "        features['+1:word.lower()'] = next_token.lower()\n",
     "        features['+1:word.istitle()'] = next_token.istitle()\n",
     "        features['+1:word.isupper()'] = next_token.isupper()\n",
     "        features['+1:postag'] = next_tag\n",
     "        features['+1:postag[:2]'] = next_tag[:2]\n",
     "\n",
     "    return features\n",
     "\n",
     "\n",
     "def sent2features(sent):\n",
     "    \"\"\"Return one feature dict per token of ``sent``, in token order.\"\"\"\n",
     "    per_token = []\n",
     "    for position in range(len(sent)):\n",
     "        per_token.append(word2features(sent, position))\n",
     "    return per_token\n",
     "\n",
     "# Rebinds `input_message` from the raw tuples to per-token feature dicts.\n",
     "# NOTE(review): not idempotent - re-running this cell alone feeds feature dicts\n",
     "# back into sent2features; re-run the cell defining the raw sentence first.\n",
     "input_message = sent2features(input_message)\n",
     "print(input_message[0])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "{'entities_found': {'B-ORG': u'melbourne', 'I-ORG': u'( australia ) , 25 may ( efe ) .'}, 'sentence': u'melbourne ( australia ) , 25 may ( efe ) .'}\n"
      ]
     }
    ],
    "source": [
     "# `input_message` holds the feature dicts of ONE sentence, while `predict`\n",
     "# expects a list of sentences. Wrap it in a one-element batch, predict once\n",
     "# (not once per token) and keep the label sequence of that single sentence.\n",
     "predicted_labels = marvin_model['crf'].predict([input_message])[0]\n",
     "\n",
     "sentence = []\n",
     "entities = {}\n",
     "for token, label in zip(input_message, predicted_labels):\n",
     "    word = token[\"word.lower()\"]\n",
     "    sentence.append(word)\n",
     "\n",
     "    # Group the words by predicted tag, skipping tokens tagged 'O'.\n",
     "    if label != \"O\":\n",
     "        entities.setdefault(label, []).append(word)\n",
     "\n",
     "example_of_prediction = {\n",
     "    \"sentence\": ' '.join(sentence),\n",
     "    \"entities_found\": {tag: ' '.join(words) for tag, words in entities.items()},\n",
     "}\n",
     "\n",
     "print(example_of_prediction)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Predictor"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Do prediction."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {
     "marvin_cell": "predictor"
    },
    "outputs": [],
    "source": [
     "# `predict` takes a list of sentences; `input_message` is a single sentence\n",
     "# (a list of per-token feature dicts), so pass it as a one-element batch.\n",
     "final_prediction = marvin_model['crf'].predict([input_message])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "[['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']]\n"
      ]
     }
    ],
    "source": [
     "# Display the value produced by the predictor cell.\n",
     "print(final_prediction)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 2",
    "language": "python",
    "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
     "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
    "version": "2.7.12"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Acquisitor and Cleaner"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download the dataset and store it in `marvin_initial_dataset`."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import nltk\n",
	"import os"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"marvin_cell": "acquisitor"
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package conll2002 to /home/zhang/nltk_data...\n",
	"[nltk_data] Package conll2002 is already up-to-date!\n"
	]
	}
	],
	"source": [
	"# Fetch the CoNLL-2002 corpus through NLTK.\n",
	"nltk.download('conll2002')\n",
	"\n",
	"# Materialise the Spanish train / test splits as lists of tagged sentences.\n",
	"corpus_reader = nltk.corpus.conll2002\n",
	"train_sents = list(corpus_reader.iob_sents('esp.train'))\n",
	"test_sents = list(corpus_reader.iob_sents('esp.testb'))\n",
	"\n",
	"marvin_initial_dataset = dict(\n",
	"    train_sents=train_sents,\n",
	"    test_sents=test_sents,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[[(u'Melbourne', u'NP', u'B-LOC'), (u'(', u'Fpa', u'O'), (u'Australia', u'NP', u'B-LOC'), (u')', u'Fpt', u'O'), (u',', u'Fc', u'O'), (u'25', u'Z', u'O'), (u'may', u'NC', u'O'), (u'(', u'Fpa', u'O'), (u'EFE', u'NC', u'B-ORG'), (u')', u'Fpt', u'O'), (u'.', u'Fp', u'O')], [(u'-', u'Fg', u'O')]]\n"
	]
	}
	],
	"source": [
	"# Peek at the first two training sentences.\n",
	"print(train_sents[:2])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Training Preparator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Feature engineering: the initial datasets are split into feature datasets and label datasets."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"marvin_cell": "tpreparator"
	},
	"outputs": [],
	"source": [
	"def word2features(sent, i):\n",
	"    \"\"\"Build the feature dict for the token at index ``i`` of ``sent``.\n",
	"\n",
	"    Every item of ``sent`` is read as ``(word, POS tag, ...)``. The dict describes\n",
	"    the token itself and its direct neighbours; ``'BOS'`` / ``'EOS'`` are set\n",
	"    instead when the left / right neighbour does not exist.\n",
	"    \"\"\"\n",
	"    token, tag = sent[i][0], sent[i][1]\n",
	"\n",
	"    features = {\n",
	"        'bias': 1.0,\n",
	"        'word.lower()': token.lower(),\n",
	"        'word[-3:]': token[-3:],\n",
	"        'word[-2:]': token[-2:],\n",
	"        'word.isupper()': token.isupper(),\n",
	"        'word.istitle()': token.istitle(),\n",
	"        'word.isdigit()': token.isdigit(),\n",
	"        'postag': tag,\n",
	"        'postag[:2]': tag[:2],\n",
	"    }\n",
	"\n",
	"    # Left context: flag the sentence start, otherwise describe the previous token.\n",
	"    if i <= 0:\n",
	"        features['BOS'] = True\n",
	"    else:\n",
	"        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n",
	"        features['-1:word.lower()'] = prev_token.lower()\n",
	"        features['-1:word.istitle()'] = prev_token.istitle()\n",
	"        features['-1:word.isupper()'] = prev_token.isupper()\n",
	"        features['-1:postag'] = prev_tag\n",
	"        features['-1:postag[:2]'] = prev_tag[:2]\n",
	"\n",
	"    # Right context: flag the sentence end, otherwise describe the next token.\n",
	"    if i >= len(sent) - 1:\n",
	"        features['EOS'] = True\n",
	"    else:\n",
	"        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n",
	"        features['+1:word.lower()'] = next_token.lower()\n",
	"        features['+1:word.istitle()'] = next_token.istitle()\n",
	"        features['+1:word.isupper()'] = next_token.isupper()\n",
	"        features['+1:postag'] = next_tag\n",
	"        features['+1:postag[:2]'] = next_tag[:2]\n",
	"\n",
	"    return features\n",
	"\n",
	"\n",
	"def sent2features(sent):\n",
	"    \"\"\"Return one feature dict per token of ``sent``, in token order.\"\"\"\n",
	"    return [word2features(sent, i) for i in range(len(sent))]\n",
	"\n",
	"def sent2labels(sent):\n",
	"    \"\"\"Return the third field (the label) of every 3-tuple in ``sent``.\"\"\"\n",
	"    return [label for token, postag, label in sent]\n",
	"\n",
	"# Build parallel feature / label sequences for each split.\n",
	"train_split = marvin_initial_dataset['train_sents']\n",
	"test_split = marvin_initial_dataset['test_sents']\n",
	"\n",
	"X_train = [sent2features(s) for s in train_split]\n",
	"y_train = [sent2labels(s) for s in train_split]\n",
	"\n",
	"X_test = [sent2features(s) for s in test_split]\n",
	"y_test = [sent2labels(s) for s in test_split]\n",
	"\n",
	"marvin_dataset = dict(\n",
	"    X_train=X_train,\n",
	"    y_train=y_train,\n",
	"    X_test=X_test,\n",
	"    y_test=y_test,\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Trainer"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Model training."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"import sklearn_crfsuite"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"marvin_cell": "trainer"
	},
	"outputs": [],
	"source": [
	"# Hyper-parameters are kept together so they are easy to spot and tune.\n",
	"crf_settings = {\n",
	"    'algorithm': 'lbfgs',\n",
	"    'c1': 0.10789964607864502,\n",
	"    'c2': 0.082422264927260847,\n",
	"    'max_iterations': 100,\n",
	"    'all_possible_transitions': True,\n",
	"}\n",
	"crf = sklearn_crfsuite.CRF(**crf_settings)\n",
	"crf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n",
	"\n",
	"marvin_model = dict(crf=crf)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Metrics Evaluator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Creating evaluation metrics for trained model."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn_crfsuite import scorers\n",
	"from sklearn_crfsuite import metrics"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"marvin_cell": "evaluator"
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Balanced F-score: 0.797607625209\n",
	"\n",
	"Classification Report: \n",
	" precision recall f1-score support\n",
	"\n",
	" B-LOC 0.806 0.784 0.795 1084\n",
	" I-LOC 0.697 0.631 0.662 325\n",
	" B-MISC 0.749 0.555 0.637 339\n",
	" I-MISC 0.743 0.582 0.653 557\n",
	" B-ORG 0.807 0.835 0.821 1400\n",
	" I-ORG 0.841 0.800 0.820 1104\n",
	" B-PER 0.845 0.887 0.865 735\n",
	" I-PER 0.894 0.940 0.916 634\n",
	"\n",
	"avg / total 0.812 0.788 0.798 6178\n",
	"\n"
	]
	}
	],
	"source": [
	"crf_model = marvin_model['crf']\n",
	"\n",
	"# Score every class the model knows except the 'O' tag.\n",
	"labels = list(crf_model.classes_)\n",
	"labels.remove('O')\n",
	"y_pred = crf_model.predict(marvin_dataset['X_test'])\n",
	"\n",
	"score = metrics.flat_f1_score(\n",
	"    marvin_dataset['y_test'], y_pred, average='weighted', labels=labels\n",
	")\n",
	"\n",
	"# Order the report rows by the tag text after its first character, then by that\n",
	"# first character, so e.g. 'B-LOC' and 'I-LOC' end up next to each other.\n",
	"sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))\n",
	"report = metrics.flat_classification_report(\n",
	"    marvin_dataset['y_test'], y_pred, labels=sorted_labels, digits=3\n",
	")\n",
	"\n",
	"marvin_metrics = dict(score=score, report=report)\n",
	"\n",
	"print('Balanced F-score: ' + str(score))\n",
	"print('\\nClassification Report: \\n' + str(report))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Prediction Preparator"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Apply the feature engineering method to `input_message`, preparing it for prediction."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# One sample sentence as (word, POS tag, label) triples.\n",
	"input_message = [\n",
	"    (u'Melbourne', u'NP', u'B-LOC'),\n",
	"    (u'(', u'Fpa', u'O'),\n",
	"    (u'Australia', u'NP', u'B-LOC'),\n",
	"    (u')', u'Fpt', u'O'),\n",
	"    (u',', u'Fc', u'O'),\n",
	"    (u'25', u'Z', u'O'),\n",
	"    (u'may', u'NC', u'O'),\n",
	"    (u'(', u'Fpa', u'O'),\n",
	"    (u'EFE', u'NC', u'B-ORG'),\n",
	"    (u')', u'Fpt', u'O'),\n",
	"    (u'.', u'Fp', u'O'),\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[u'B-LOC', u'O', u'B-LOC', u'O', u'O', u'O', u'O', u'O', u'B-ORG', u'O', u'O']\n"
	]
	}
	],
	"source": [
	"# Labels of the sample sentence (third field of each tuple).\n",
	"input_label = sent2labels(input_message)\n",
	"print(input_label)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The feature engineering methods are implemented again here in the Prediction Preparator because, in the Marvin code structure, each action is separated."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"marvin_cell": "ppreparator"
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'+1:word.isupper()': False, 'word.isupper()': False, 'BOS': True, 'word[-2:]': u'ne', '+1:postag': u'Fpa', 'word.isdigit()': False, 'postag': u'NP', 'bias': 1.0, 'postag[:2]': u'NP', '+1:word.lower()': u'(', '+1:word.istitle()': False, 'word.istitle()': True, 'word.lower()': u'melbourne', 'word[-3:]': u'rne', '+1:postag[:2]': u'Fp'}\n"
	]
	}
	],
	"source": [
	"def word2features(sent, i):\n",
	"    \"\"\"Build the feature dict for the token at index ``i`` of ``sent``.\n",
	"\n",
	"    Every item of ``sent`` is read as ``(word, POS tag, ...)``. The dict describes\n",
	"    the token itself and its direct neighbours; ``'BOS'`` / ``'EOS'`` are set\n",
	"    instead when the left / right neighbour does not exist.\n",
	"    \"\"\"\n",
	"    token, tag = sent[i][0], sent[i][1]\n",
	"\n",
	"    features = {\n",
	"        'bias': 1.0,\n",
	"        'word.lower()': token.lower(),\n",
	"        'word[-3:]': token[-3:],\n",
	"        'word[-2:]': token[-2:],\n",
	"        'word.isupper()': token.isupper(),\n",
	"        'word.istitle()': token.istitle(),\n",
	"        'word.isdigit()': token.isdigit(),\n",
	"        'postag': tag,\n",
	"        'postag[:2]': tag[:2],\n",
	"    }\n",
	"\n",
	"    # Left context: flag the sentence start, otherwise describe the previous token.\n",
	"    if i <= 0:\n",
	"        features['BOS'] = True\n",
	"    else:\n",
	"        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n",
	"        features['-1:word.lower()'] = prev_token.lower()\n",
	"        features['-1:word.istitle()'] = prev_token.istitle()\n",
	"        features['-1:word.isupper()'] = prev_token.isupper()\n",
	"        features['-1:postag'] = prev_tag\n",
	"        features['-1:postag[:2]'] = prev_tag[:2]\n",
	"\n",
	"    # Right context: flag the sentence end, otherwise describe the next token.\n",
	"    if i >= len(sent) - 1:\n",
	"        features['EOS'] = True\n",
	"    else:\n",
	"        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n",
	"        features['+1:word.lower()'] = next_token.lower()\n",
	"        features['+1:word.istitle()'] = next_token.istitle()\n",
	"        features['+1:word.isupper()'] = next_token.isupper()\n",
	"        features['+1:postag'] = next_tag\n",
	"        features['+1:postag[:2]'] = next_tag[:2]\n",
	"\n",
	"    return features\n",
	"\n",
	"\n",
	"def sent2features(sent):\n",
	"    \"\"\"Return one feature dict per token of ``sent``, in token order.\"\"\"\n",
	"    return [word2features(sent, i) for i in range(len(sent))]\n",
	"\n",
	"# Rebinds `input_message` from the raw tuples to per-token feature dicts.\n",
	"# NOTE(review): not idempotent - re-running this cell alone feeds feature dicts\n",
	"# back into sent2features; re-run the cell defining the raw sentence first.\n",
	"input_message = sent2features(input_message)\n",
	"print(input_message[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'entities_found': {'B-ORG': u'melbourne', 'I-ORG': u'( australia ) , 25 may ( efe ) .'}, 'sentence': u'melbourne ( australia ) , 25 may ( efe ) .'}\n"
	]
	}
	],
	"source": [
	"# `input_message` holds the feature dicts of ONE sentence, while `predict`\n",
	"# expects a list of sentences. Wrap it in a one-element batch, predict once\n",
	"# (not once per token) and keep the label sequence of that single sentence.\n",
	"predicted_labels = marvin_model['crf'].predict([input_message])[0]\n",
	"\n",
	"sentence = []\n",
	"entities = {}\n",
	"for token, label in zip(input_message, predicted_labels):\n",
	"    word = token[\"word.lower()\"]\n",
	"    sentence.append(word)\n",
	"\n",
	"    # Group the words by predicted tag, skipping tokens tagged 'O'.\n",
	"    if label != \"O\":\n",
	"        entities.setdefault(label, []).append(word)\n",
	"\n",
	"example_of_prediction = {\n",
	"    \"sentence\": ' '.join(sentence),\n",
	"    \"entities_found\": {tag: ' '.join(words) for tag, words in entities.items()},\n",
	"}\n",
	"\n",
	"print(example_of_prediction)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Predictor"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Do prediction."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"marvin_cell": "predictor"
	},
	"outputs": [],
	"source": [
	"# `predict` takes a list of sentences; `input_message` is a single sentence\n",
	"# (a list of per-token feature dicts), so pass it as a one-element batch.\n",
	"final_prediction = marvin_model['crf'].predict([input_message])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']]\n"
	]
	}
	],
	"source": [
	"# Display the value produced by the predictor cell.\n",
	"print(final_prediction)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}