OPENNLP-844: The n-gram feature range in doccat is now configurable via constructor parameters
git-svn-id: https://svn.apache.org/repos/asf/opennlp/trunk@1741643 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 1c94411..49e1736 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
@@ -17,22 +17,69 @@
package opennlp.tools.doccat;
+import opennlp.tools.util.InvalidFormatException;
+
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
+ * Generates ngram features for a document.
* n-gram {@link FeatureGenerator}
*/
public class NGramFeatureGenerator implements FeatureGenerator {
+ //default values for bigrams
+ private int minGram = 2;
+ private int maxGram = 2;
+
+ /**
+ * Constructs a generator whose n-gram features contain between minGram and maxGram tokens (both inclusive).
+ *
+ * @param minGram minimum number of tokens in an n-gram feature; must be at least 1 and not greater than maxGram
+ * @param maxGram maximum number of tokens in an n-gram feature; must be at least 1
+ * @throws InvalidFormatException if minGram or maxGram is less than 1, or if minGram is greater than maxGram
+ */
+ public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException {
+ if (minGram > 0 && maxGram > 0) {
+ if (minGram <= maxGram) {
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ } else {
+ throw new InvalidFormatException("Minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!");
+ }
+ } else {
+ throw new InvalidFormatException("Both minimum range value (minGram) & maximum range value (maxGram) should be greater than or equal to 1!");
+ }
+ }
+
+ /**
+ * Default constructor; keeps the field defaults minGram = 2 and maxGram = 2, so only bigram features are generated.
+ */
+ public NGramFeatureGenerator() {
+ }
+
+ /**
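+ * Extracts n-gram features from the given text fragments.
+ *
+ * @param text the text fragments to extract features from
+ * @param extraInfo optional extra information
+ * @return a collection of n-gram features, each built as "ng=" followed by ":" plus the token for every token in the n-gram (e.g. "ng=:a:b" for a bigram)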
+ */
public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
List<String> features = new ArrayList<String>();
- for (int i = 0; i < text.length - 1; i++) {
- features.add("ng=" + text[i] + ":" + text[i + 1]);
+ for (int i = 0; i <= text.length - minGram; i++) {
+ String feature = "ng=";
+ for (int y = 0; y < maxGram && i + y < text.length; y++) {
+ feature = feature + ":" + text[i + y];
+ int gramCount = y + 1;
+ if (maxGram >= gramCount && gramCount >= minGram) {
+ features.add(feature);
+ }
+ }
}
return features;
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
index 5cd3aaf..786e708 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
@@ -84,7 +84,7 @@
@Test
public void testCustom() throws IOException {
FeatureGenerator[] featureGenerators = { new BagOfWordsFeatureGenerator(),
- new NGramFeatureGenerator() };
+ new NGramFeatureGenerator(), new NGramFeatureGenerator(2,3) };
DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
featureGenerators);
@@ -102,11 +102,12 @@
assertNotNull(factory);
- assertEquals(2, factory.getFeatureGenerators().length);
+ assertEquals(3, factory.getFeatureGenerators().length);
assertEquals(BagOfWordsFeatureGenerator.class,
factory.getFeatureGenerators()[0].getClass());
assertEquals(NGramFeatureGenerator.class,
factory.getFeatureGenerators()[1].getClass());
+ assertEquals(NGramFeatureGenerator.class,factory.getFeatureGenerators()[2].getClass());
assertEquals(SimpleTokenizer.INSTANCE.getClass(), factory.getTokenizer()
.getClass());