| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Acquisitor and Cleaner" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Download the dataset and store it in `marvin_initial_dataset`." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import nltk\n", |
| "import os" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "metadata": { |
| "marvin_cell": "acquisitor" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "[nltk_data] Downloading package conll2002 to /home/zhang/nltk_data...\n", |
| "[nltk_data] Package conll2002 is already up-to-date!\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Make sure the conll2002 corpus is available through nltk.\n", |
| "nltk.download('conll2002')\n", |
| "\n", |
| "# Read the 'esp.train' and 'esp.testb' files as lists of IOB-tagged sentences.\n", |
| "conll2002 = nltk.corpus.conll2002\n", |
| "train_sents = list(conll2002.iob_sents('esp.train'))\n", |
| "test_sents = list(conll2002.iob_sents('esp.testb'))\n", |
| "\n", |
| "marvin_initial_dataset = dict(train_sents=train_sents, test_sents=test_sents)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "[[(u'Melbourne', u'NP', u'B-LOC'), (u'(', u'Fpa', u'O'), (u'Australia', u'NP', u'B-LOC'), (u')', u'Fpt', u'O'), (u',', u'Fc', u'O'), (u'25', u'Z', u'O'), (u'may', u'NC', u'O'), (u'(', u'Fpa', u'O'), (u'EFE', u'NC', u'B-ORG'), (u')', u'Fpt', u'O'), (u'.', u'Fp', u'O')], [(u'-', u'Fg', u'O')]]\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Peek at the first two training sentences.\n", |
| "print(train_sents[:2])" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Training Preparator" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Feature engineering: the initial datasets are split into feature datasets and label datasets." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "metadata": { |
| "marvin_cell": "tpreparator" |
| }, |
| "outputs": [], |
| "source": [ |
| "def word2features(sent, i):\n", |
| "    \"\"\"Return the feature dict describing the token at index ``i`` of ``sent``.\n", |
| "\n", |
| "    Each element of ``sent`` is indexed as ``(word, postag, ...)``. The dict holds\n", |
| "    features of the token itself plus, where a neighbour exists, the same kind of\n", |
| "    features for the previous ('-1:') and next ('+1:') token. 'BOS' / 'EOS' are set\n", |
| "    instead when there is no previous / next token.\n", |
| "    \"\"\"\n", |
| "    token, tag = sent[i][0], sent[i][1]\n", |
| "\n", |
| "    # Features of the token itself.\n", |
| "    feats = {'bias': 1.0}\n", |
| "    feats['word.lower()'] = token.lower()\n", |
| "    feats['word[-3:]'] = token[-3:]\n", |
| "    feats['word[-2:]'] = token[-2:]\n", |
| "    feats['word.isupper()'] = token.isupper()\n", |
| "    feats['word.istitle()'] = token.istitle()\n", |
| "    feats['word.isdigit()'] = token.isdigit()\n", |
| "    feats['postag'] = tag\n", |
| "    feats['postag[:2]'] = tag[:2]\n", |
| "\n", |
| "    # Left context, or the beginning-of-sentence flag when there is none.\n", |
| "    if i <= 0:\n", |
| "        feats['BOS'] = True\n", |
| "    else:\n", |
| "        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n", |
| "        feats['-1:word.lower()'] = prev_token.lower()\n", |
| "        feats['-1:word.istitle()'] = prev_token.istitle()\n", |
| "        feats['-1:word.isupper()'] = prev_token.isupper()\n", |
| "        feats['-1:postag'] = prev_tag\n", |
| "        feats['-1:postag[:2]'] = prev_tag[:2]\n", |
| "\n", |
| "    # Right context, or the end-of-sentence flag when there is none.\n", |
| "    if i >= len(sent) - 1:\n", |
| "        feats['EOS'] = True\n", |
| "    else:\n", |
| "        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n", |
| "        feats['+1:word.lower()'] = next_token.lower()\n", |
| "        feats['+1:word.istitle()'] = next_token.istitle()\n", |
| "        feats['+1:word.isupper()'] = next_token.isupper()\n", |
| "        feats['+1:postag'] = next_tag\n", |
| "        feats['+1:postag[:2]'] = next_tag[:2]\n", |
| "\n", |
| "    return feats\n", |
| "\n", |
| "\n", |
| "def sent2features(sent):\n", |
| "    \"\"\"Return one feature dict per token of ``sent``, in sentence order.\"\"\"\n", |
| "    features = []\n", |
| "    for position in range(len(sent)):\n", |
| "        features.append(word2features(sent, position))\n", |
| "    return features\n", |
| "\n", |
| "def sent2labels(sent):\n", |
| "    \"\"\"Return the label (third item) of every (token, postag, label) triple in ``sent``.\"\"\"\n", |
| "    labels = []\n", |
| "    for _token, _postag, label in sent:\n", |
| "        labels.append(label)\n", |
| "    return labels\n", |
| "\n", |
| "# Turn every sentence of each split into per-token feature dicts (X) and labels (y).\n", |
| "X_train = list(map(sent2features, marvin_initial_dataset['train_sents']))\n", |
| "y_train = list(map(sent2labels, marvin_initial_dataset['train_sents']))\n", |
| "\n", |
| "X_test = list(map(sent2features, marvin_initial_dataset['test_sents']))\n", |
| "y_test = list(map(sent2labels, marvin_initial_dataset['test_sents']))\n", |
| "\n", |
| "marvin_dataset = dict(\n", |
| "    X_train=X_train,\n", |
| "    y_train=y_train,\n", |
| "    X_test=X_test,\n", |
| "    y_test=y_test,\n", |
| ")" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Trainer" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Model training." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import sklearn_crfsuite" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": { |
| "marvin_cell": "trainer" |
| }, |
| "outputs": [], |
| "source": [ |
| "# Hyperparameters of the CRF; per the sklearn-crfsuite docs, c1 and c2 are the\n", |
| "# L1 and L2 regularization coefficients.\n", |
| "crf_params = {\n", |
| "    'algorithm': 'lbfgs',\n", |
| "    'c1': 0.10789964607864502,\n", |
| "    'c2': 0.082422264927260847,\n", |
| "    'max_iterations': 100,\n", |
| "    'all_possible_transitions': True,\n", |
| "}\n", |
| "crf = sklearn_crfsuite.CRF(**crf_params)\n", |
| "crf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n", |
| "\n", |
| "marvin_model = dict(crf=crf)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Metrics Evaluator" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Creating evaluation metrics for trained model." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 7, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "from sklearn_crfsuite import scorers\n", |
| "from sklearn_crfsuite import metrics" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 8, |
| "metadata": { |
| "marvin_cell": "evaluator" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Balanced F-score: 0.797607625209\n", |
| "\n", |
| "Classification Report: \n", |
| " precision recall f1-score support\n", |
| "\n", |
| " B-LOC 0.806 0.784 0.795 1084\n", |
| " I-LOC 0.697 0.631 0.662 325\n", |
| " B-MISC 0.749 0.555 0.637 339\n", |
| " I-MISC 0.743 0.582 0.653 557\n", |
| " B-ORG 0.807 0.835 0.821 1400\n", |
| " I-ORG 0.841 0.800 0.820 1104\n", |
| " B-PER 0.845 0.887 0.865 735\n", |
| " I-PER 0.894 0.940 0.916 634\n", |
| "\n", |
| "avg / total 0.812 0.788 0.798 6178\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "model = marvin_model['crf']\n", |
| "y_true = marvin_dataset['y_test']\n", |
| "y_pred = model.predict(marvin_dataset['X_test'])\n", |
| "\n", |
| "# Evaluate on every label the model knows except 'O'.\n", |
| "labels = list(model.classes_)\n", |
| "labels.remove('O')\n", |
| "\n", |
| "score = metrics.flat_f1_score(y_true, y_pred, average='weighted', labels=labels)\n", |
| "\n", |
| "# Sort by the part after the first character, then by the first character, so\n", |
| "# labels such as B-LOC and I-LOC end up next to each other in the report.\n", |
| "sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))\n", |
| "report = metrics.flat_classification_report(\n", |
| "    y_true, y_pred, labels=sorted_labels, digits=3\n", |
| ")\n", |
| "\n", |
| "marvin_metrics = dict(score=score, report=report)\n", |
| "\n", |
| "print('Balanced F-score: ' + str(score))\n", |
| "print('\\nClassification Report: \\n' + str(report))" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Prediction Preparator" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Apply the feature engineering method to `input_message` to prepare it for prediction." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 9, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Example input: one sentence given as (word, POS tag, label) tuples.\n", |
| "input_message = [(u'Melbourne', u'NP', u'B-LOC'),\n", |
| " (u'(', u'Fpa', u'O'),\n", |
| " (u'Australia', u'NP', u'B-LOC'),\n", |
| " (u')', u'Fpt', u'O'),\n", |
| " (u',', u'Fc', u'O'),\n", |
| " (u'25', u'Z', u'O'),\n", |
| " (u'may', u'NC', u'O'),\n", |
| " (u'(', u'Fpa', u'O'),\n", |
| " (u'EFE', u'NC', u'B-ORG'),\n", |
| " (u')', u'Fpt', u'O'),\n", |
| " (u'.', u'Fp', u'O')]" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 10, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "[u'B-LOC', u'O', u'B-LOC', u'O', u'O', u'O', u'O', u'O', u'B-ORG', u'O', u'O']\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Labels carried by the raw example sentence (third item of each tuple).\n", |
| "input_label = sent2labels(input_message)\n", |
| "print(input_label)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "The feature engineering methods are implemented again here in the Prediction Preparator because, in the Marvin code structure, each action is separate." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 11, |
| "metadata": { |
| "marvin_cell": "ppreparator" |
| }, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "{'+1:word.isupper()': False, 'word.isupper()': False, 'BOS': True, 'word[-2:]': u'ne', '+1:postag': u'Fpa', 'word.isdigit()': False, 'postag': u'NP', 'bias': 1.0, 'postag[:2]': u'NP', '+1:word.lower()': u'(', '+1:word.istitle()': False, 'word.istitle()': True, 'word.lower()': u'melbourne', 'word[-3:]': u'rne', '+1:postag[:2]': u'Fp'}\n" |
| ] |
| } |
| ], |
| "source": [ |
| "def word2features(sent, i):\n", |
| "    \"\"\"Return the feature dict describing the token at index ``i`` of ``sent``.\n", |
| "\n", |
| "    Each element of ``sent`` is indexed as ``(word, postag, ...)``. The dict holds\n", |
| "    features of the token itself plus, where a neighbour exists, the same kind of\n", |
| "    features for the previous ('-1:') and next ('+1:') token. 'BOS' / 'EOS' are set\n", |
| "    instead when there is no previous / next token.\n", |
| "    \"\"\"\n", |
| "    token, tag = sent[i][0], sent[i][1]\n", |
| "\n", |
| "    # Features of the token itself.\n", |
| "    feats = {'bias': 1.0}\n", |
| "    feats['word.lower()'] = token.lower()\n", |
| "    feats['word[-3:]'] = token[-3:]\n", |
| "    feats['word[-2:]'] = token[-2:]\n", |
| "    feats['word.isupper()'] = token.isupper()\n", |
| "    feats['word.istitle()'] = token.istitle()\n", |
| "    feats['word.isdigit()'] = token.isdigit()\n", |
| "    feats['postag'] = tag\n", |
| "    feats['postag[:2]'] = tag[:2]\n", |
| "\n", |
| "    # Left context, or the beginning-of-sentence flag when there is none.\n", |
| "    if i <= 0:\n", |
| "        feats['BOS'] = True\n", |
| "    else:\n", |
| "        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]\n", |
| "        feats['-1:word.lower()'] = prev_token.lower()\n", |
| "        feats['-1:word.istitle()'] = prev_token.istitle()\n", |
| "        feats['-1:word.isupper()'] = prev_token.isupper()\n", |
| "        feats['-1:postag'] = prev_tag\n", |
| "        feats['-1:postag[:2]'] = prev_tag[:2]\n", |
| "\n", |
| "    # Right context, or the end-of-sentence flag when there is none.\n", |
| "    if i >= len(sent) - 1:\n", |
| "        feats['EOS'] = True\n", |
| "    else:\n", |
| "        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]\n", |
| "        feats['+1:word.lower()'] = next_token.lower()\n", |
| "        feats['+1:word.istitle()'] = next_token.istitle()\n", |
| "        feats['+1:word.isupper()'] = next_token.isupper()\n", |
| "        feats['+1:postag'] = next_tag\n", |
| "        feats['+1:postag[:2]'] = next_tag[:2]\n", |
| "\n", |
| "    return feats\n", |
| "\n", |
| "\n", |
| "def sent2features(sent):\n", |
| "    \"\"\"Return one feature dict per token of ``sent``, in sentence order.\"\"\"\n", |
| "    features = []\n", |
| "    for position in range(len(sent)):\n", |
| "        features.append(word2features(sent, position))\n", |
| "    return features\n", |
| "\n", |
| "# Rebind input_message from the raw sentence to its per-token feature dicts.\n", |
| "# NOTE: after this, input_message no longer holds tuples, so re-running this cell\n", |
| "# without first re-creating the raw input_message will fail in word2features.\n", |
| "input_message = sent2features(input_message)\n", |
| "print(input_message[0])" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 12, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "{'entities_found': {'B-ORG': u'melbourne', 'I-ORG': u'( australia ) , 25 may ( efe ) .'}, 'sentence': u'melbourne ( australia ) , 25 may ( efe ) .'}\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# predict() expects a list of sentences, so the single example sentence is wrapped\n", |
| "# in a list and its label sequence unwrapped again. Passing input_message directly\n", |
| "# would tag each token's feature dict as if it were a sentence of its own.\n", |
| "# The call is made once, outside the loop.\n", |
| "predicted_labels = marvin_model['crf'].predict([input_message])[0]\n", |
| "\n", |
| "sentence = []\n", |
| "entities = {}\n", |
| "\n", |
| "for token, label in zip(input_message, predicted_labels):\n", |
| "    word = token[\"word.lower()\"]\n", |
| "    sentence.append(word)\n", |
| "\n", |
| "    # Collect the words of every label other than \"O\" under that label.\n", |
| "    if label != \"O\":\n", |
| "        entities.setdefault(label, []).append(word)\n", |
| "\n", |
| "example_of_prediction = {\n", |
| "    \"sentence\": ' '.join(sentence),\n", |
| "    \"entities_found\": {k: ' '.join(v) for k, v in entities.items()},\n", |
| "}\n", |
| "\n", |
| "print(example_of_prediction)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Predictor" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Do prediction." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 13, |
| "metadata": { |
| "marvin_cell": "predictor" |
| }, |
| "outputs": [], |
| "source": [ |
| "# predict() takes a list of sentences; input_message holds the feature dicts of a\n", |
| "# single sentence, so wrap it to get back one label sequence for that sentence.\n", |
| "final_prediction = marvin_model['crf'].predict([input_message])" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 14, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "[['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']]\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Show the label sequences returned by the predictor cell.\n", |
| "print(final_prediction)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.12" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 2 |
| } |