Fix missing unk vector (#35)
diff --git a/tf-ner-poc/src/main/python/namefinder/namefinder.py b/tf-ner-poc/src/main/python/namefinder/namefinder.py
index e7fe436..548a3a9 100644
--- a/tf-ner-poc/src/main/python/namefinder/namefinder.py
+++ b/tf-ner-poc/src/main/python/namefinder/namefinder.py
@@ -29,6 +29,10 @@
import os
from tempfile import TemporaryDirectory
+# Vocabulary keys for the unknown-word and number placeholder tokens
+__UNK__ = '__UNK__'
+__NUM__ = '__NUM__'
+
# Parse the OpenNLP Name Finder format into begin, end, type triples
class NameSample:
@@ -87,7 +91,7 @@
if word_dict.get(token) is not None:
vector = word_dict[token]
else:
- vector = word_dict['__UNK__']
+ vector = word_dict[__UNK__]
sentence.append(vector)
@@ -103,7 +107,7 @@
labels.append(label)
for label_string in label:
- if not label_string in self.label_dict:
+ if label_string not in self.label_dict:
self.label_dict[label_string] = len(self.label_dict)
return sentences, labels, chars_set
@@ -350,12 +354,20 @@
vector_size = len(parts) - 1
if len(parts) != vector_size + 1:
- # print("Bad Vector: ",len(line),len(parts), line)
raise VectorException("Bad Vector in line: {}, size: {} vector: {}".format(len(line), len(parts), line))
continue
word_dict[parts[0]] = len(word_dict)
embeddings.append(np.array(parts[1:], dtype=np.float32))
+ # Add the unknown-word and number symbols (with embeddings) if the loaded vectors lack them
+ if __UNK__ not in word_dict:
+ word_dict[__UNK__] = len(word_dict)
+ unk_random = 0.08 * np.random.random_sample(vector_size) - 0.04
+ embeddings.append(unk_random.astype(np.float32))
+ if __NUM__ not in word_dict:
+ word_dict[__NUM__] = len(word_dict)
+ embeddings.append(np.zeros(vector_size, dtype=np.float32))
+
# Create a reverse word dict
rev_word_dict = {}
for word, id in word_dict.items():