Merge pull request #3 from kojisekig/OPENNLP-1221
OPENNLP-1221: FeatureGeneratorUtil.tokenFeature() is too specific for…
diff --git a/japanese-addon/build.xml b/japanese-addon/build.xml
index bc13015..925f888 100644
--- a/japanese-addon/build.xml
+++ b/japanese-addon/build.xml
@@ -22,6 +22,7 @@
<property name="src.dir" value="src"/>
<property name="cls.dir" value="classes"/>
<property name="lib.dir" value="lib"/>
+ <property name="test.result.dir" value="test-result"/>
<property name="product.jar" value="opennlp-japanese-addon-1.0-SNAPSHOT.jar"/>
<target name="compile" description="compile source and test code">
@@ -32,14 +33,35 @@
<fileset dir="${lib.dir}" includes="**/*.jar"/>
</classpath>
</javac>
-<!--
<javac srcdir="${src.dir}/test/java" destdir="${cls.dir}" debug="on" includes="**/*Test.java">
<classpath path="${cls.dir}"/>
<classpath>
<fileset dir="${lib.dir}" includes="**/*.jar"/>
</classpath>
</javac>
--->
+ </target>
+
+ <!-- Runs the JUnit tests found under ${src.dir}/test/java with assertions enabled (-ea).
+      By default every *Test.java class is run; when the "testcase" property is set, only the
+      class with that name is run. Plain-text reports are written to ${test.result.dir}.
+      haltonfailure is "no", so all selected tests run first and the build is failed at the
+      end if an error or failure set the tests.failed property. -->
+ <target name="test" depends="compile" description="run all tests">
+ <mkdir dir="${test.result.dir}"/>
+ <junit printsummary="on"
+ haltonfailure="no"
+ errorProperty="tests.failed"
+ failureProperty="tests.failed"
+ dir="${src.dir}/test/resources">
+ <classpath path="${cls.dir}"/>
+ <classpath>
+ <fileset dir="${lib.dir}" includes="**/*.jar"/>
+ </classpath>
+ <jvmarg value="-ea"/>
+ <formatter type="plain"/>
+ <batchtest fork="yes" todir="${test.result.dir}" unless="testcase">
+ <fileset dir="${src.dir}/test/java" includes="**/*Test.java"/>
+ </batchtest>
+ <batchtest fork="yes" todir="${test.result.dir}" if="testcase">
+ <fileset dir="${src.dir}/test/java" includes="**/${testcase}.java"/>
+ </batchtest>
+ </junit>
+ <fail if="tests.failed">***** Tests failed! *****</fail>
</target>
<target name="jar" depends="compile" description="create a jar file">
@@ -48,6 +70,7 @@
<target name="clean" description="clean all">
<delete dir="${cls.dir}"/>
+ <delete dir="${test.result.dir}"/>
<delete>
<fileset dir="." includes="*.jar"/>
</delete>
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGenerator.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGenerator.java
new file mode 100644
index 0000000..d1a6305
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGenerator.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.List;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/**
+ * Adds bigram features that pair the current token with its immediate neighbours,
+ * once using the raw tokens and once using their token classes as computed by
+ * {@link FeatureGeneratorUtil#tokenFeature(String)}.
+ */
+public class BigramNameFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  public void createFeatures(List<String> features, String[] tokens, int index,
+      String[] previousOutcomes) {
+    final String current = tokens[index];
+    final String currentClass = FeatureGeneratorUtil.tokenFeature(current);
+
+    // (previous, current) pair - skipped for the first token
+    final boolean hasPrevious = index > 0;
+    if (hasPrevious) {
+      final String previous = tokens[index - 1];
+      features.add("pw,w=" + previous + "," + current);
+      features.add("pwc,wc=" + FeatureGeneratorUtil.tokenFeature(previous) + "," + currentClass);
+    }
+
+    // (current, next) pair - skipped for the last token
+    final boolean hasNext = index + 1 < tokens.length;
+    if (hasNext) {
+      final String next = tokens[index + 1];
+      features.add("w,nw=" + current + "," + next);
+      features.add("wc,nc=" + currentClass + "," + FeatureGeneratorUtil.tokenFeature(next));
+    }
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorFactory.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorFactory.java
new file mode 100644
index 0000000..50e2dfa
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+/**
+ * Factory that creates {@link BigramNameFeatureGenerator} instances.
+ *
+ * @see BigramNameFeatureGenerator
+ */
+public class BigramNameFeatureGeneratorFactory
+    extends GeneratorFactory.AbstractXmlFeatureGeneratorFactory {
+
+  // The implicit public no-argument constructor is used; it only calls super().
+
+  /**
+   * @return a new {@link BigramNameFeatureGenerator}; no settings are read.
+   */
+  @Override
+  public AdaptiveFeatureGenerator create() throws InvalidFormatException {
+    final AdaptiveFeatureGenerator generator = new BigramNameFeatureGenerator();
+    return generator;
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownClusterTokenClassFeatureGeneratorFactory.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownClusterTokenClassFeatureGeneratorFactory.java
new file mode 100644
index 0000000..6d005d3
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownClusterTokenClassFeatureGeneratorFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.BrownCluster;
+import opennlp.tools.util.featuregen.GeneratorFactory;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+/**
+ * Factory for {@link BrownTokenClassFeatureGenerator}, which generates Brown clustering
+ * features for token classes. The cluster resource is looked up under the key stored
+ * in the "dict" setting.
+ */
+public class BrownClusterTokenClassFeatureGeneratorFactory
+    extends GeneratorFactory.AbstractXmlFeatureGeneratorFactory {
+
+  // The implicit public no-argument constructor is used; it only calls super().
+
+  /**
+   * @return a generator backed by the {@link BrownCluster} registered under the "dict" key,
+   *     or {@code null} when no resource manager is set.
+   * @throws InvalidFormatException if the resource under that key is not a {@link BrownCluster}.
+   */
+  @Override
+  public AdaptiveFeatureGenerator create() throws InvalidFormatException {
+    // if resourceManager is null, we don't instantiate
+    if (resourceManager == null) {
+      return null;
+    }
+
+    final String key = getStr("dict");
+    final Object resource = resourceManager.getResource(key);
+    if (resource instanceof BrownCluster) {
+      return new BrownTokenClassFeatureGenerator((BrownCluster) resource);
+    }
+    throw new InvalidFormatException("Not a BrownLexicon resource for key: " + key);
+  }
+
+  /**
+   * @return a single-entry mapping from the "dict" setting's value to a
+   *     {@link BrownCluster.BrownClusterSerializer}.
+   */
+  @Override
+  public Map<String, ArtifactSerializer<?>> getArtifactSerializerMapping() throws InvalidFormatException {
+    final Map<String, ArtifactSerializer<?>> serializers = new HashMap<>();
+    final String key = getStr("dict");
+    serializers.put(key, new BrownCluster.BrownClusterSerializer());
+    return serializers;
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownTokenClassFeatureGenerator.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownTokenClassFeatureGenerator.java
new file mode 100644
index 0000000..341f46a
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/BrownTokenClassFeatureGenerator.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.List;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.BrownCluster;
+import opennlp.tools.util.featuregen.BrownTokenClasses;
+
+/**
+ * Generates Brown cluster features that combine the token class of the current token
+ * with each of the word classes returned for it by {@link BrownTokenClasses}.
+ */
+public class BrownTokenClassFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  private final BrownCluster brownLexicon;
+
+  /**
+   * @param dict the Brown cluster used to look up word classes.
+   */
+  public BrownTokenClassFeatureGenerator(BrownCluster dict) {
+    brownLexicon = dict;
+  }
+
+  public void createFeatures(List<String> features, String[] tokens, int index,
+      String[] previousOutcomes) {
+    final String token = tokens[index];
+    final String tokenClass = FeatureGeneratorUtil.tokenFeature(token);
+
+    // one feature per word class, each prefixed with the token class
+    for (String wordClass : BrownTokenClasses.getWordClasses(token, brownLexicon)) {
+      features.add("c,browncluster=" + tokenClass + "," + wordClass);
+    }
+  }
+
+}
+
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtil.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtil.java
new file mode 100644
index 0000000..a6c603a
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtil.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.Objects;
+
+/**
+ * This class provides common utilities for feature generation.
+ */
+public class FeatureGeneratorUtil {
+
+  /**
+   * Generates a class name for the specified token.
+   * The first character selects the candidate class and the remaining characters must
+   * confirm it, otherwise the token is classified as "other":
+   * <ul>
+   * <li>digit - every character is a digit</li>
+   * <li>hira - starts with a Japanese Hiragana character; each later character is
+   * Hiragana or one of the marks '・', 'ー', '〜'</li>
+   * <li>kata - starts with a Japanese Katakana character; each later character is
+   * Katakana or one of the marks '・', 'ー', '〜'</li>
+   * <li>alpha - starts with an alphabetic character outside the CJK unified ideographs
+   * block; each later character is alphabetic</li>
+   * <li>other - anything else, including the empty token</li>
+   * </ul>
+   * Characters are examined one UTF-16 {@code char} at a time.
+   *
+   * @param token A token or word.
+   * @return The class name that the specified token belongs in.
+   * @throws NullPointerException if {@code token} is {@code null}.
+   */
+  public static String tokenFeature(String token) {
+
+    Objects.requireNonNull(token, "token must be not null!");
+
+    if (token.length() == 0) return "other";
+
+    // scan token only once
+    char first = token.charAt(0);
+    if (Character.isDigit(first)) {
+      for (int i = 1; i < token.length(); i++) {
+        if (!Character.isDigit(token.charAt(i))) return "other";
+      }
+      return "digit";
+    }
+
+    // UnicodeBlock.of() returns null for a character that is not a member of any defined
+    // block, so blocks are compared by identity instead of calling equals() on the result.
+    Character.UnicodeBlock block = Character.UnicodeBlock.of(first);
+    if (block == Character.UnicodeBlock.HIRAGANA) {
+      return restIsKana(token, Character.UnicodeBlock.HIRAGANA) ? "hira" : "other";
+    }
+    if (block == Character.UnicodeBlock.KATAKANA) {
+      return restIsKana(token, Character.UnicodeBlock.KATAKANA) ? "kata" : "other";
+    }
+    if (Character.isAlphabetic(first)
+        && block != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
+      // NOTE(review): only the first character is checked against CJK_UNIFIED_IDEOGRAPHS;
+      // later characters are accepted on isAlphabetic() alone - confirm this is intended.
+      for (int i = 1; i < token.length(); i++) {
+        if (!Character.isAlphabetic(token.charAt(i))) return "other";
+      }
+      return "alpha";
+    }
+
+    return "other";
+  }
+
+  /**
+   * Checks whether every character after the first is in the given kana block or is one
+   * of the marks '・', 'ー', '〜'.
+   */
+  private static boolean restIsKana(String token, Character.UnicodeBlock kana) {
+    for (int i = 1; i < token.length(); i++) {
+      char c = token.charAt(i);
+      if (c != '・' && c != 'ー' && c != '〜' && Character.UnicodeBlock.of(c) != kana) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGenerator.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGenerator.java
new file mode 100644
index 0000000..14cff33
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGenerator.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.List;
+
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+
+/**
+ * Generates a feature for the class of the current token and, optionally, a second
+ * feature that combines the lower-cased token with its class.
+ */
+public class TokenClassFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  private static final String TOKEN_CLASS_PREFIX = "wc";
+  private static final String TOKEN_AND_CLASS_PREFIX = "w&c";
+
+  private boolean generateWordAndClassFeature;
+
+  /**
+   * Creates a generator that emits the token class feature only.
+   */
+  public TokenClassFeatureGenerator() {
+    this(false);
+  }
+
+  /**
+   * @param generateWordAndClassFeature whether the combined token-and-class feature
+   *     is emitted in addition to the token class feature.
+   */
+  public TokenClassFeatureGenerator(boolean generateWordAndClassFeature) {
+    this.generateWordAndClassFeature = generateWordAndClassFeature;
+  }
+
+  public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
+    final String token = tokens[index];
+    final String tokenClass = FeatureGeneratorUtil.tokenFeature(token);
+    features.add(TOKEN_CLASS_PREFIX + "=" + tokenClass);
+
+    if (!generateWordAndClassFeature) {
+      return;
+    }
+    final String lowerCased = StringUtil.toLowerCase(token);
+    features.add(TOKEN_AND_CLASS_PREFIX + "=" + lowerCased + "," + tokenClass);
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorFactory.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorFactory.java
new file mode 100644
index 0000000..dd7c0ed
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorFactory.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+/**
+ * Factory that creates {@link TokenClassFeatureGenerator} instances.
+ *
+ * @see TokenClassFeatureGenerator
+ */
+public class TokenClassFeatureGeneratorFactory
+    extends GeneratorFactory.AbstractXmlFeatureGeneratorFactory {
+
+  // The implicit public no-argument constructor is used; it only calls super().
+
+  /**
+   * @return a new generator configured with the value of {@code getBool("wordAndClass", true)}.
+   */
+  @Override
+  public AdaptiveFeatureGenerator create() throws InvalidFormatException {
+    // presumably the second argument is the default used when the setting is absent
+    final boolean wordAndClass = getBool("wordAndClass", true);
+    return new TokenClassFeatureGenerator(wordAndClass);
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGenerator.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGenerator.java
new file mode 100644
index 0000000..7771ed7
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGenerator.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/**
+ * Partitions tokens into sub-tokens using a supporting tokenizer and generates
+ * class features for each of the sub-tokens and combinations of those sub-tokens.
+ */
+public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  private Pattern noLetters = Pattern.compile("[^a-zA-Z]");
+  private Tokenizer tokenizer;
+
+  /**
+   * Initializes a new instance.
+   * For tokenization the {@link SimpleTokenizer} is used.
+   */
+  public TokenPatternFeatureGenerator() {
+    this(SimpleTokenizer.INSTANCE);
+  }
+
+  /**
+   * Initializes a new instance.
+   *
+   * @param supportTokenizer the tokenizer used to split each token into sub-tokens.
+   */
+  public TokenPatternFeatureGenerator(Tokenizer supportTokenizer) {
+    tokenizer = supportTokenizer;
+  }
+
+  public void createFeatures(List<String> feats, String[] toks, int index, String[] preds) {
+
+    final String[] parts = tokenizer.tokenize(toks[index]);
+    final int count = parts.length;
+
+    // A token that is not split any further only yields its lower-cased form.
+    if (count == 1) {
+      feats.add("st=" + StringUtil.toLowerCase(toks[index]));
+      return;
+    }
+
+    feats.add("stn=" + count);
+
+    final StringBuilder allClasses = new StringBuilder();
+
+    for (int pos = 0; pos < count; pos++) {
+      final String part = parts[pos];
+      final String partClass = FeatureGeneratorUtil.tokenFeature(part);
+
+      if (pos + 1 < count) {
+        final String nextClass = FeatureGeneratorUtil.tokenFeature(parts[pos + 1]);
+        feats.add("pt2=" + partClass + nextClass);
+
+        if (pos + 2 < count) {
+          final String afterNextClass = FeatureGeneratorUtil.tokenFeature(parts[pos + 2]);
+          feats.add("pt3=" + partClass + nextClass + afterNextClass);
+        }
+      }
+
+      allClasses.append(partClass);
+
+      // only sub-tokens made up entirely of the letters a-z / A-Z get their own feature
+      final boolean hasNonLetter = noLetters.matcher(part).find();
+      if (!hasNonLetter) {
+        feats.add("st=" + StringUtil.toLowerCase(part));
+      }
+    }
+
+    feats.add("pta=" + allClasses.toString());
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorFactory.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorFactory.java
new file mode 100644
index 0000000..35ccc08
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorFactory.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+/**
+ * Factory that creates {@link TokenPatternFeatureGenerator} instances.
+ *
+ * @see TokenPatternFeatureGenerator
+ */
+public class TokenPatternFeatureGeneratorFactory
+    extends GeneratorFactory.AbstractXmlFeatureGeneratorFactory {
+
+  // The implicit public no-argument constructor is used; it only calls super().
+
+  /**
+   * @return a new {@link TokenPatternFeatureGenerator} built with its no-argument constructor.
+   */
+  @Override
+  public AdaptiveFeatureGenerator create() throws InvalidFormatException {
+    final AdaptiveFeatureGenerator generator = new TokenPatternFeatureGenerator();
+    return generator;
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGenerator.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGenerator.java
new file mode 100644
index 0000000..10a6af8
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGenerator.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.List;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/**
+ * Adds trigram features based on tokens and token classes: one pair of features for the
+ * two preceding tokens plus the current one, and one pair for the current token plus the
+ * two following ones.
+ */
+public class TrigramNameFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  public void createFeatures(List<String> features, String[] tokens, int index,
+      String[] previousOutcomes) {
+    final String current = tokens[index];
+    final String currentClass = FeatureGeneratorUtil.tokenFeature(current);
+
+    // backward trigram - needs two tokens before the current one
+    final boolean hasTwoBefore = index > 1;
+    if (hasTwoBefore) {
+      final String previous = tokens[index - 1];
+      final String beforePrevious = tokens[index - 2];
+      features.add("ppw,pw,w=" + beforePrevious + "," + previous + "," + current);
+      final String previousClass = FeatureGeneratorUtil.tokenFeature(previous);
+      final String beforePreviousClass = FeatureGeneratorUtil.tokenFeature(beforePrevious);
+      features.add("ppwc,pwc,wc=" + beforePreviousClass + "," + previousClass + "," + currentClass);
+    }
+
+    // forward trigram - needs two tokens after the current one
+    final boolean hasTwoAfter = index + 2 < tokens.length;
+    if (hasTwoAfter) {
+      final String next = tokens[index + 1];
+      final String afterNext = tokens[index + 2];
+      features.add("w,nw,nnw=" + current + "," + next + "," + afterNext);
+      final String nextClass = FeatureGeneratorUtil.tokenFeature(next);
+      final String afterNextClass = FeatureGeneratorUtil.tokenFeature(afterNext);
+      features.add("wc,nwc,nnwc=" + currentClass + "," + nextClass + "," + afterNextClass);
+    }
+  }
+}
diff --git a/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorFactory.java b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorFactory.java
new file mode 100644
index 0000000..5b007b0
--- /dev/null
+++ b/japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+import opennlp.tools.util.featuregen.GeneratorFactory;
+
+/**
+ * Factory that creates {@link TrigramNameFeatureGenerator} instances.
+ *
+ * @see TrigramNameFeatureGenerator
+ */
+public class TrigramNameFeatureGeneratorFactory
+    extends GeneratorFactory.AbstractXmlFeatureGeneratorFactory {
+
+  // The implicit public no-argument constructor is used; it only calls super().
+
+  /**
+   * @return a new {@link TrigramNameFeatureGenerator}; no settings are read.
+   */
+  @Override
+  public AdaptiveFeatureGenerator create() throws InvalidFormatException {
+    final AdaptiveFeatureGenerator generator = new TrigramNameFeatureGenerator();
+    return generator;
+  }
+}
diff --git a/japanese-addon/src/main/resources/opennlp/tools/namefind/ner-auxinfo-features.xml b/japanese-addon/src/main/resources/opennlp/tools/namefind/ner-auxinfo-features.xml
index 9344997..1a1c272 100644
--- a/japanese-addon/src/main/resources/opennlp/tools/namefind/ner-auxinfo-features.xml
+++ b/japanese-addon/src/main/resources/opennlp/tools/namefind/ner-auxinfo-features.xml
@@ -23,7 +23,7 @@
<generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
<int name="prevLength">2</int>
<int name="nextLength">2</int>
- <generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
+ <generator class="opennlp.tools.util.featuregen.lang.jpn.TokenClassFeatureGeneratorFactory"/>
</generator>
</generator>
<generator class="opennlp.tools.util.featuregen.AuxiliaryInfoAwareDelegateFeatureGeneratorFactory">
@@ -48,7 +48,7 @@
<generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
</generator>
<generator class="opennlp.tools.util.featuregen.AuxiliaryInfoAwareDelegateFeatureGeneratorFactory">
- <generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
+ <generator class="opennlp.tools.util.featuregen.lang.jpn.BigramNameFeatureGeneratorFactory"/>
</generator>
<generator class="opennlp.tools.util.featuregen.AuxiliaryInfoAwareDelegateFeatureGeneratorFactory">
<generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
diff --git a/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorTest.java b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorTest.java
new file mode 100644
index 0000000..ddfa024
--- /dev/null
+++ b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/BigramNameFeatureGeneratorTest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/**
+ * Tests {@link BigramNameFeatureGenerator} at the start, middle and end of a sentence
+ * and on a single-token sentence.
+ */
+public class BigramNameFeatureGeneratorTest {
+
+  private List<String> features;
+  static String[] testSentence = new String[] {"This", "is", "an", "example", "sentence"};
+
+  @Before
+  public void setUp() throws Exception {
+    features = new ArrayList<>();
+  }
+
+  /**
+   * Runs a fresh generator on the token at the given position and collects the
+   * generated features into {@link #features}.
+   */
+  private void generateAt(String[] sentence, int tokenIndex) {
+    AdaptiveFeatureGenerator generator = new BigramNameFeatureGenerator();
+    generator.createFeatures(features, sentence, tokenIndex, null);
+  }
+
+  @Test
+  public void testBegin() {
+    // first token: only the (current, next) features are expected
+    generateAt(testSentence, 0);
+
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("w,nw=This,is", features.get(0));
+    Assert.assertEquals("wc,nc=alpha,alpha", features.get(1));
+  }
+
+  @Test
+  public void testMiddle() {
+    // inner token: both the backward and the forward features are expected
+    generateAt(testSentence, 2);
+
+    Assert.assertEquals(4, features.size());
+    Assert.assertEquals("pw,w=is,an", features.get(0));
+    Assert.assertEquals("pwc,wc=alpha,alpha", features.get(1));
+    Assert.assertEquals("w,nw=an,example", features.get(2));
+    Assert.assertEquals("wc,nc=alpha,alpha", features.get(3));
+  }
+
+  @Test
+  public void testEnd() {
+    // last token: only the (previous, current) features are expected
+    generateAt(testSentence, 4);
+
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("pw,w=example,sentence", features.get(0));
+    Assert.assertEquals("pwc,wc=alpha,alpha", features.get(1));
+  }
+
+  @Test
+  public void testShort() {
+    // a single-token sentence has no neighbours, so nothing is generated
+    generateAt(new String[] {"word"}, 0);
+
+    Assert.assertEquals(0, features.size());
+  }
+}
diff --git a/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtilTest.java b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtilTest.java
new file mode 100644
index 0000000..ce5816f
--- /dev/null
+++ b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtilTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class FeatureGeneratorUtilTest {
+
+  /**
+   * Asserts, in the given order, that every token is mapped to the expected class.
+   * @param expected the value {@code tokenFeature} must return for each token
+   * @param tokens the tokens to classify
+   */
+  private static void assertTokenClass(String expected, String... tokens) {
+    for (String token : tokens) {
+      Assert.assertEquals(expected, FeatureGeneratorUtil.tokenFeature(token));
+    }
+  }
+
+  /** Digit-only samples must be "digit", letter-only samples "alpha", all others "other". */
+  @Test
+  public void test() {
+    // digits
+    assertTokenClass("digit", "12", "1234");
+    assertTokenClass("other",
+        "abcd234", "1234-56", "4/6/2017", "1,234,567", "12.34567", "123(456)7890");
+
+    // letters
+    assertTokenClass("alpha", "opennlp", "O", "OPENNLP");
+    assertTokenClass("other", "A.");
+    assertTokenClass("alpha", "Mike", "somethingStupid");
+
+    // symbols
+    assertTokenClass("other", ",", ".", "?", "!", "#", "%", "&");
+  }
+
+  /**
+   * Hiragana and katakana samples must be "hira" and "kata"; samples that also contain
+   * kanji must be "other".
+   */
+  @Test
+  public void testJapanese() {
+    // Hiragana
+    assertTokenClass("hira", "そういえば", "おーぷん・そ〜す・そふとうぇあ");
+    assertTokenClass("other", "あぱっち・そふとうぇあ財団");
+
+    // Katakana
+    assertTokenClass("kata", "ジャパン", "オープン・ソ〜ス・ソフトウェア");
+    assertTokenClass("other", "アパッチ・ソフトウェア財団");
+  }
+}
diff --git a/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorTest.java b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorTest.java
new file mode 100644
index 0000000..be9359f
--- /dev/null
+++ b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenClassFeatureGeneratorTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+public class TokenClassFeatureGeneratorTest {
+
+  // Shared fixture; final so that no test can reassign it. Index 3 is the capitalized "Example".
+  static final String[] testSentence = {"This", "is", "an", "Example", "sentence"};
+  private List<String> features;
+
+  /** Gives every test its own empty list to collect features into. */
+  @Before
+  public void setUp() throws Exception {
+    features = new ArrayList<>();
+  }
+
+  /** With flag {@code true}, expects {@code wc=alpha} and a lower-cased {@code w&c} feature. */
+  @Test
+  public void testGenWAC() {
+    final int exampleIndex = 3;
+    AdaptiveFeatureGenerator generator = new TokenClassFeatureGenerator(true);
+
+    generator.createFeatures(features, testSentence, exampleIndex, null);
+
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("wc=alpha", features.get(0));
+    Assert.assertEquals("w&c=example,alpha", features.get(1));
+  }
+
+  /** With flag {@code false}, expects only the {@code wc=alpha} feature. */
+  @Test
+  public void testNoWAC() {
+    final int exampleIndex = 3;
+    AdaptiveFeatureGenerator generator = new TokenClassFeatureGenerator(false);
+
+    generator.createFeatures(features, testSentence, exampleIndex, null);
+
+    Assert.assertEquals(1, features.size());
+    Assert.assertEquals("wc=alpha", features.get(0));
+  }
+}
diff --git a/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorTest.java b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorTest.java
new file mode 100644
index 0000000..d74051e
--- /dev/null
+++ b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGeneratorTest.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+public class TokenPatternFeatureGeneratorTest {
+
+  private List<String> features;
+
+  /** Starts every test with a fresh, empty feature list. */
+  @Before
+  public void setUp() throws Exception {
+    features = new ArrayList<>();
+  }
+
+  /** The token at index 3 is expected to yield the single feature {@code st=example}. */
+  @Test
+  public void testSingleToken() {
+    String[] tokens = {"This", "is", "an", "example", "sentence"};
+    final int tokenIndex = 3;
+    AdaptiveFeatureGenerator patternGenerator = new TokenPatternFeatureGenerator();
+
+    patternGenerator.createFeatures(features, tokens, tokenIndex, null);
+
+    Assert.assertEquals(1, features.size());
+    Assert.assertEquals("st=example", features.get(0));
+  }
+
+  /** A single token holding five space-separated words is expected to yield 14 features. */
+  @Test
+  public void testSentence() {
+    String[] tokens = {"This is an example sentence"};
+    final int tokenIndex = 0;
+    // Expected features, in the exact order they must appear in the list.
+    String[] expected = {
+        "stn=5",
+        "pt2=alphaalpha", "pt3=alphaalphaalpha", "st=this",
+        "pt2=alphaalpha", "pt3=alphaalphaalpha", "st=is",
+        "pt2=alphaalpha", "pt3=alphaalphaalpha", "st=an",
+        "pt2=alphaalpha", "st=example",
+        "st=sentence",
+        "pta=alphaalphaalphaalphaalpha"
+    };
+    AdaptiveFeatureGenerator patternGenerator = new TokenPatternFeatureGenerator();
+
+    patternGenerator.createFeatures(features, tokens, tokenIndex, null);
+
+    Assert.assertEquals(expected.length, features.size());
+    for (int i = 0; i < expected.length; i++) {
+      Assert.assertEquals(expected[i], features.get(i));
+    }
+  }
+}
diff --git a/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorTest.java b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorTest.java
new file mode 100644
index 0000000..546a0bd
--- /dev/null
+++ b/japanese-addon/src/test/java/opennlp/tools/util/featuregen/lang/jpn/TrigramNameFeatureGeneratorTest.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen.lang.jpn;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+public class TrigramNameFeatureGeneratorTest {
+
+  // Shared five-token fixture; final so that no test can reassign it.
+  static final String[] testSentence = {"This", "is", "an", "example", "sentence"};
+  private List<String> features;
+
+  /** Gives every test its own empty list to collect features into. */
+  @Before
+  public void setUp() throws Exception {
+    features = new ArrayList<>();
+  }
+
+  /** At index 0, expects only the {@code w,nw,nnw} and {@code wc,nwc,nnwc} features. */
+  @Test
+  public void testBegin() {
+    final int firstTokenIndex = 0;
+    AdaptiveFeatureGenerator trigramGenerator = new TrigramNameFeatureGenerator();
+
+    trigramGenerator.createFeatures(features, testSentence, firstTokenIndex, null);
+
+    // The token trigram comes first, then the token-class trigram.
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("w,nw,nnw=This,is,an", features.get(0));
+    Assert.assertEquals("wc,nwc,nnwc=alpha,alpha,alpha", features.get(1));
+  }
+
+  /** At index 1, still expects only the {@code w,nw,nnw} and {@code wc,nwc,nnwc} features. */
+  @Test
+  public void testNextOfBegin() {
+    final int secondTokenIndex = 1;
+    AdaptiveFeatureGenerator trigramGenerator = new TrigramNameFeatureGenerator();
+
+    trigramGenerator.createFeatures(features, testSentence, secondTokenIndex, null);
+
+    // The token trigram comes first, then the token-class trigram.
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("w,nw,nnw=is,an,example", features.get(0));
+    Assert.assertEquals("wc,nwc,nnwc=alpha,alpha,alpha", features.get(1));
+  }
+
+  /** At index 2, expects the {@code ppw,pw,w} features followed by the {@code w,nw,nnw} ones. */
+  @Test
+  public void testMiddle() {
+    final int middleTokenIndex = 2;
+    AdaptiveFeatureGenerator trigramGenerator = new TrigramNameFeatureGenerator();
+
+    trigramGenerator.createFeatures(features, testSentence, middleTokenIndex, null);
+
+    // Each token trigram is immediately followed by its token-class trigram.
+    Assert.assertEquals(4, features.size());
+    Assert.assertEquals("ppw,pw,w=This,is,an", features.get(0));
+    Assert.assertEquals("ppwc,pwc,wc=alpha,alpha,alpha", features.get(1));
+    Assert.assertEquals("w,nw,nnw=an,example,sentence", features.get(2));
+    Assert.assertEquals("wc,nwc,nnwc=alpha,alpha,alpha", features.get(3));
+  }
+
+  /** At the last index (4), expects only the {@code ppw,pw,w} and {@code ppwc,pwc,wc} features. */
+  @Test
+  public void testEnd() {
+    final int lastTokenIndex = 4;
+    AdaptiveFeatureGenerator trigramGenerator = new TrigramNameFeatureGenerator();
+
+    trigramGenerator.createFeatures(features, testSentence, lastTokenIndex, null);
+
+    // The token trigram comes first, then the token-class trigram.
+    Assert.assertEquals(2, features.size());
+    Assert.assertEquals("ppw,pw,w=an,example,sentence", features.get(0));
+    Assert.assertEquals("ppwc,pwc,wc=alpha,alpha,alpha", features.get(1));
+  }
+
+  /** For the middle token of a three-token sentence, expects no features at all. */
+  @Test
+  public void testShort() {
+    String[] shortSentence = {"I", "know", "it"};
+    final int middleTokenIndex = 1;
+    AdaptiveFeatureGenerator trigramGenerator = new TrigramNameFeatureGenerator();
+
+    trigramGenerator.createFeatures(features, shortSentence, middleTokenIndex, null);
+
+    Assert.assertEquals(0, features.size());
+  }
+}