blob: d9f3344da771fa268e6ec671a3a66f2af3cd5a67 [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Acquisitor and Cleaner"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download the dataset, store in marvin_initial_dataset."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"marvin_cell": "acquisitor"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package conll2002 to /home/zhang/nltk_data...\n",
"[nltk_data] Package conll2002 is already up-to-date!\n"
]
}
],
"source": [
"nltk.download('conll2002')\n",
"train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))\n",
"test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))\n",
"\n",
"marvin_initial_dataset = {\n",
" 'train_sents': train_sents,\n",
" 'test_sents': test_sents\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(u'Melbourne', u'NP', u'B-LOC'), (u'(', u'Fpa', u'O'), (u'Australia', u'NP', u'B-LOC'), (u')', u'Fpt', u'O'), (u',', u'Fc', u'O'), (u'25', u'Z', u'O'), (u'may', u'NC', u'O'), (u'(', u'Fpa', u'O'), (u'EFE', u'NC', u'B-ORG'), (u')', u'Fpt', u'O'), (u'.', u'Fp', u'O')], [(u'-', u'Fg', u'O')]]\n"
]
}
],
"source": [
"print(train_sents[0:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training Preparator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feature engineering, the initial datasets are splitted into feature datasets and label datasets."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"marvin_cell": "tpreparator"
},
"outputs": [],
"source": [
"def word2features(sent, i):\n",
" word = sent[i][0]\n",
" postag = sent[i][1]\n",
" \n",
" features = {\n",
" 'bias': 1.0,\n",
" 'word.lower()': word.lower(),\n",
" 'word[-3:]': word[-3:],\n",
" 'word[-2:]': word[-2:],\n",
" 'word.isupper()': word.isupper(),\n",
" 'word.istitle()': word.istitle(),\n",
" 'word.isdigit()': word.isdigit(),\n",
" 'postag': postag,\n",
" 'postag[:2]': postag[:2], \n",
" }\n",
" if i > 0:\n",
" word1 = sent[i-1][0]\n",
" postag1 = sent[i-1][1]\n",
" features.update({\n",
" '-1:word.lower()': word1.lower(),\n",
" '-1:word.istitle()': word1.istitle(),\n",
" '-1:word.isupper()': word1.isupper(),\n",
" '-1:postag': postag1,\n",
" '-1:postag[:2]': postag1[:2],\n",
" })\n",
" else:\n",
" features['BOS'] = True\n",
" \n",
" if i < len(sent)-1:\n",
" word1 = sent[i+1][0]\n",
" postag1 = sent[i+1][1]\n",
" features.update({\n",
" '+1:word.lower()': word1.lower(),\n",
" '+1:word.istitle()': word1.istitle(),\n",
" '+1:word.isupper()': word1.isupper(),\n",
" '+1:postag': postag1,\n",
" '+1:postag[:2]': postag1[:2],\n",
" })\n",
" else:\n",
" features['EOS'] = True\n",
" \n",
" return features\n",
"\n",
"\n",
"def sent2features(sent):\n",
" return [word2features(sent, i) for i in range(len(sent))]\n",
"\n",
"def sent2labels(sent):\n",
" return [label for token, postag, label in sent]\n",
"\n",
"X_train = [sent2features(s) for s in marvin_initial_dataset['train_sents']]\n",
"y_train = [sent2labels(s) for s in marvin_initial_dataset['train_sents']]\n",
"\n",
"X_test = [sent2features(s) for s in marvin_initial_dataset['test_sents']]\n",
"y_test = [sent2labels(s) for s in marvin_initial_dataset['test_sents']]\n",
"\n",
"marvin_dataset = {\n",
" 'X_train': X_train,\n",
" 'y_train': y_train,\n",
" 'X_test': X_test,\n",
" 'y_test': y_test\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Trainer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Model training."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sklearn_crfsuite"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"marvin_cell": "trainer"
},
"outputs": [],
"source": [
"crf = sklearn_crfsuite.CRF(\n",
" algorithm='lbfgs', \n",
" c1=0.10789964607864502, \n",
" c2=0.082422264927260847, \n",
" max_iterations=100, \n",
" all_possible_transitions=True\n",
")\n",
"crf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])\n",
"\n",
"marvin_model = {\n",
" 'crf': crf\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metrics Evaluator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating evaluation metrics for trained model."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from sklearn_crfsuite import scorers\n",
"from sklearn_crfsuite import metrics"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"marvin_cell": "evaluator"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Balanced F-score: 0.797607625209\n",
"\n",
"Classification Report: \n",
" precision recall f1-score support\n",
"\n",
" B-LOC 0.806 0.784 0.795 1084\n",
" I-LOC 0.697 0.631 0.662 325\n",
" B-MISC 0.749 0.555 0.637 339\n",
" I-MISC 0.743 0.582 0.653 557\n",
" B-ORG 0.807 0.835 0.821 1400\n",
" I-ORG 0.841 0.800 0.820 1104\n",
" B-PER 0.845 0.887 0.865 735\n",
" I-PER 0.894 0.940 0.916 634\n",
"\n",
"avg / total 0.812 0.788 0.798 6178\n",
"\n"
]
}
],
"source": [
"labels = list(marvin_model['crf'].classes_)\n",
"labels.remove('O')\n",
"y_pred = marvin_model['crf'].predict(marvin_dataset['X_test'])\n",
"\n",
"score = metrics.flat_f1_score(marvin_dataset['y_test'], y_pred, average='weighted', labels=labels)\n",
"\n",
"sorted_labels = sorted(\n",
" labels, \n",
" key=lambda name: (name[1:], name[0])\n",
")\n",
"report = metrics.flat_classification_report(\n",
" marvin_dataset['y_test'], y_pred, labels=sorted_labels, digits=3\n",
")\n",
"\n",
"marvin_metrics = {\n",
" 'score': score,\n",
" 'report': report\n",
"}\n",
"\n",
"print('Balanced F-score: ' + str(score))\n",
"print('\\nClassification Report: \\n' + str(report))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prediction Preparator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Applying feature engineering method in input_message, preparing it for prediction."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"input_message = [(u'Melbourne', u'NP', u'B-LOC'),\n",
" (u'(', u'Fpa', u'O'),\n",
" (u'Australia', u'NP', u'B-LOC'),\n",
" (u')', u'Fpt', u'O'),\n",
" (u',', u'Fc', u'O'),\n",
" (u'25', u'Z', u'O'),\n",
" (u'may', u'NC', u'O'),\n",
" (u'(', u'Fpa', u'O'),\n",
" (u'EFE', u'NC', u'B-ORG'),\n",
" (u')', u'Fpt', u'O'),\n",
" (u'.', u'Fp', u'O')]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[u'B-LOC', u'O', u'B-LOC', u'O', u'O', u'O', u'O', u'O', u'B-ORG', u'O', u'O']\n"
]
}
],
"source": [
"input_label = sent2labels(input_message)\n",
"print(input_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feature engineering methods is implemented again here in Prediction Preparator, because in Marvin code structure, each action is separated."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"marvin_cell": "ppreparator"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'+1:word.isupper()': False, 'word.isupper()': False, 'BOS': True, 'word[-2:]': u'ne', '+1:postag': u'Fpa', 'word.isdigit()': False, 'postag': u'NP', 'bias': 1.0, 'postag[:2]': u'NP', '+1:word.lower()': u'(', '+1:word.istitle()': False, 'word.istitle()': True, 'word.lower()': u'melbourne', 'word[-3:]': u'rne', '+1:postag[:2]': u'Fp'}\n"
]
}
],
"source": [
"def word2features(sent, i):\n",
" word = sent[i][0]\n",
" postag = sent[i][1]\n",
" \n",
" features = {\n",
" 'bias': 1.0,\n",
" 'word.lower()': word.lower(),\n",
" 'word[-3:]': word[-3:],\n",
" 'word[-2:]': word[-2:],\n",
" 'word.isupper()': word.isupper(),\n",
" 'word.istitle()': word.istitle(),\n",
" 'word.isdigit()': word.isdigit(),\n",
" 'postag': postag,\n",
" 'postag[:2]': postag[:2], \n",
" }\n",
" if i > 0:\n",
" word1 = sent[i-1][0]\n",
" postag1 = sent[i-1][1]\n",
" features.update({\n",
" '-1:word.lower()': word1.lower(),\n",
" '-1:word.istitle()': word1.istitle(),\n",
" '-1:word.isupper()': word1.isupper(),\n",
" '-1:postag': postag1,\n",
" '-1:postag[:2]': postag1[:2],\n",
" })\n",
" else:\n",
" features['BOS'] = True\n",
" \n",
" if i < len(sent)-1:\n",
" word1 = sent[i+1][0]\n",
" postag1 = sent[i+1][1]\n",
" features.update({\n",
" '+1:word.lower()': word1.lower(),\n",
" '+1:word.istitle()': word1.istitle(),\n",
" '+1:word.isupper()': word1.isupper(),\n",
" '+1:postag': postag1,\n",
" '+1:postag[:2]': postag1[:2],\n",
" })\n",
" else:\n",
" features['EOS'] = True\n",
" \n",
" return features\n",
"\n",
"\n",
"def sent2features(sent):\n",
" return [word2features(sent, i) for i in range(len(sent))]\n",
"\n",
"input_message = sent2features(input_message)\n",
"print(input_message[0])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'entities_found': {'B-ORG': u'melbourne', 'I-ORG': u'( australia ) , 25 may ( efe ) .'}, 'sentence': u'melbourne ( australia ) , 25 may ( efe ) .'}\n"
]
}
],
"source": [
"sentence = []\n",
"entities = {}\n",
" \n",
"for i, token in enumerate(input_message):\n",
" word = token[\"word.lower()\"]\n",
" sentence.append(word)\n",
" \n",
" label = marvin_model['crf'].predict(input_message)[0][i]\n",
" if label != \"O\":\n",
" if label in entities:\n",
" entities[label].append(word)\n",
" else:\n",
" entities[label] = [word]\n",
"example_of_prediction = {}\n",
"example_of_prediction[\"sentence\"] = ' '.join(sentence)\n",
"example_of_prediction[\"entities_found\"] = {}\n",
"for k, v in entities.items():\n",
" example_of_prediction[\"entities_found\"][k] = ' '.join(v)\n",
"\n",
"print(example_of_prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predictor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Do prediction."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"marvin_cell": "predictor"
},
"outputs": [],
"source": [
"final_prediction = marvin_model['crf'].predict(input_message)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']]\n"
]
}
],
"source": [
"print(final_prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}