Adjust WordIndexer settings (lowerCase and allowUnk off, digit-to-NUM replacement disabled) to match namefinder.py trainer
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
index 3954092..738a952 100644
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
@@ -17,8 +17,6 @@
package org.apache.opennlp.namefinder;
-import opennlp.tools.util.StringUtil;
-
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@@ -28,6 +26,8 @@
import java.util.Map;
import java.util.regex.Pattern;
+import opennlp.tools.util.StringUtil;
+
public class WordIndexer {
private final Map<Character, Integer> char2idx;
@@ -35,11 +35,9 @@
public static String UNK = "$UNK$";
public static String NUM = "$NUM$";
- public static String NONE = "O";
- //private boolean useChars = true;
- private boolean lowerCase = true;
- private boolean allowUnk = true;
+ private boolean lowerCase = false;
+ private boolean allowUnk = false;
private Pattern digitPattern = Pattern.compile("\\d+(,\\d+)*(\\.\\d+)?");
@@ -114,8 +112,9 @@
if (lowerCase) {
word = StringUtil.toLowerCase(word);
}
- if (digitPattern.matcher(word).find())
- word = NUM;
+
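+ // NOTE(review): digit-to-NUM replacement disabled, presumably to match the namefinder.py trainer -- confirm:
+ // if (digitPattern.matcher(word).find()) word = NUM;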
// 2. get id of word
Integer wordId;
@@ -140,7 +139,6 @@
return tokenIds;
}
-
public class Ids {
private int[] chars;
@@ -162,5 +160,4 @@
this.word = word;
}
}
-
}