OPENNLP-844: the n-gram feature range in doccat is now configurable as a parameter

git-svn-id: https://svn.apache.org/repos/asf/opennlp/trunk@1741643 13f79535-47bb-0310-9956-ffa450edef68

commit: 164331477b1cea0942dcf6f07714fd50d8e2687e [log] [tgz]
author: Rodrigo Agerri <ragerri@apache.org> Fri Apr 29 15:12:56 2016 +0000
committer: Rodrigo Agerri <ragerri@apache.org> Fri Apr 29 15:12:56 2016 +0000
tree: 259a6da579f2c77e8207b02f663bd8910282746f
parent: e35eb556174312a12e9be9efd46569f663a04810 [diff]
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 1c94411..49e1736 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java

@@ -17,22 +17,69 @@
 
 package opennlp.tools.doccat;
 
+import opennlp.tools.util.InvalidFormatException;
+
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 
 /**
+ * Generates ngram features for a document.
  * n-gram {@link FeatureGenerator}
  */
 public class NGramFeatureGenerator implements FeatureGenerator {
 
+  //default values for bigrams
+  private int minGram = 2;
+  private int maxGram = 2;
+
+  /**
+   * Constructor for ngrams.
+   *
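+   * @param minGram the minimum number of words in an n-gram feature; must be at least 1 and not greater than maxGram
+   * @param maxGram the maximum number of words in an n-gram feature; must be at least 1 and not less than minGram
+   * @throws InvalidFormatException if minGram or maxGram is less than 1, or if minGram is greater than maxGram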
+   */
+  public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException {
+    if (minGram > 0 && maxGram > 0) {
+      if (minGram <= maxGram) {
+        this.minGram = minGram;
+        this.maxGram = maxGram;
+      } else {
+        throw new InvalidFormatException("Minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!");
+      }
+    } else {
+      throw new InvalidFormatException("Both minimum range value (minGram) & maximum range value (maxGram) should be greater than or equal to 1!");
+    }
+  }
+
+  /**
+   * Default constructor; generates bigram features (minGram and maxGram both default to 2).
+   */
+  public NGramFeatureGenerator() {
+  }
+
+  /**
+   * Extracts n-gram features from the given text fragments.
+   *
+   * @param text      the text fragments to extract features from
+   * @param extraInfo optional extra information
+   * @return a collection of n-gram features
+   */
   public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
 
     List<String> features = new ArrayList<String>();
 
-    for (int i = 0; i < text.length - 1; i++) {
-      features.add("ng=" + text[i] + ":" + text[i + 1]);
+    for (int i = 0; i <= text.length - minGram; i++) {
+      String feature = "ng=";
+      for (int y = 0; y < maxGram && i + y < text.length; y++) {
+        feature = feature + ":" + text[i + y];
+        int gramCount = y + 1;
+        if (maxGram >= gramCount && gramCount >= minGram) {
+          features.add(feature);
+        }
+      }
     }
 
     return features;

diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
index 5cd3aaf..786e708 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java

@@ -84,7 +84,7 @@
   @Test
   public void testCustom() throws IOException {
     FeatureGenerator[] featureGenerators = { new BagOfWordsFeatureGenerator(),
-        new NGramFeatureGenerator() };
+        new NGramFeatureGenerator(), new NGramFeatureGenerator(2,3) };
     DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
         featureGenerators);
 
@@ -102,11 +102,12 @@
 
     assertNotNull(factory);
 
-    assertEquals(2, factory.getFeatureGenerators().length);
+    assertEquals(3, factory.getFeatureGenerators().length);
     assertEquals(BagOfWordsFeatureGenerator.class,
         factory.getFeatureGenerators()[0].getClass());
     assertEquals(NGramFeatureGenerator.class,
         factory.getFeatureGenerators()[1].getClass());
+    assertEquals(NGramFeatureGenerator.class,factory.getFeatureGenerators()[2].getClass());
 
     assertEquals(SimpleTokenizer.INSTANCE.getClass(), factory.getTokenizer()
         .getClass());
commit	164331477b1cea0942dcf6f07714fd50d8e2687e	[log] [tgz]
author	Rodrigo Agerri <ragerri@apache.org>	Fri Apr 29 15:12:56 2016 +0000
committer	Rodrigo Agerri <ragerri@apache.org>	Fri Apr 29 15:12:56 2016 +0000
tree	259a6da579f2c77e8207b02f663bd8910282746f
parent	e35eb556174312a12e9be9efd46569f663a04810 [diff]