OPENNLP-713 - fixed some javadocs, used generics in the ngram utils, added more tests for the cfg and language modeling packages

commit: 531ccb884335644713e69877bbd069859660951b [log] [tgz]
author: Tommaso Teofili <tommaso@apache.org> Tue Sep 16 08:57:11 2014 +0000
committer: Tommaso Teofili <tommaso@apache.org> Tue Sep 16 08:57:11 2014 +0000
tree: aff4c61f7ba63fcce54092443d2442822a8b9a0e
parent: 627d985fb32a1288bd0a623354bbc5df5bb9ed7b [diff]
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
index e9bd6a0..7d3b33e 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java

@@ -61,7 +61,8 @@
     }
 
     public ContextFreeGrammar build() {
-        assert nonTerminalSymbols != null && terminalSymbols != null && rules != null && startSymbol != null;
+      assert nonTerminalSymbols != null && terminalSymbols != null && rules != null && startSymbol != null :
+              "missing definitions { V : " + nonTerminalSymbols + ", ∑ : " + terminalSymbols + ", R : " + rules + ", S : " + startSymbol + "}";
         return new ContextFreeGrammar(nonTerminalSymbols, terminalSymbols, rules, startSymbol, randomExpansion);
     }
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
index de64d80..9687d36 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java

@@ -28,6 +28,7 @@
  * A context free grammar
  */
 public class ContextFreeGrammar {
+
   private final Collection<String> nonTerminalSymbols;
   private final Collection<String> terminalSymbols;
   private final Collection<Rule> rules;

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
index 62d6bc7..1a670e6 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java

@@ -23,5 +23,5 @@
  */
 public interface NaiveBayesClassifier<I, O> {
 
-  public O calculateClass(I inputDocument) throws Exception;
+  O calculateClass(I inputDocument) throws Exception;
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
index 26b6ef8..e84f48c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java

@@ -21,18 +21,20 @@
 import java.util.Collection;
 
 /**
- * A language model calculate the probability <i>p</i> (between 0 and 1) of a
+ * A language model can calculate the probability <i>p</i> (between 0 and 1) of a
  * certain set of <code>T</code> objects, given a vocabulary.
+ * <code>T</code> is usually an {@link java.lang.Iterable} or an array as language models are very commonly used for
+ * sentences, so that T is e.g. an array of <code>String</code>s.
  */
 public interface LanguageModel<T> {
 
   /**
-   * Calculate the probability of a sentence given a vocabulary
+   * Calculate the probability of a sample, given a vocabulary
    *
    * @param vocabulary a {@link Collection} of objects of type <code>T</code>
    * @param sample     the sample to evaluate the probability for
    * @return a <code>double</code> between <code>0</code> and <code>1</code>
    */
-  public double calculateProbability(Collection<T> vocabulary, T sample);
+  double calculateProbability(Collection<T> vocabulary, T sample);
 
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
index 1d44e23..abadc23 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java

@@ -26,8 +26,10 @@
  * a sentence over the no. of sentences in the vocabulary.
  */
 public class NaiveSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
   @Override
   public double calculateProbability(Collection<T[]> vocabulary, T[] sentence) {
-    return Collections.frequency(vocabulary, sentence) / vocabulary.size();
+    return vocabulary.isEmpty() ? 0 : Collections.frequency(vocabulary, sentence) / vocabulary.size();
   }
+
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
index 1c02ec8..fd2b81d 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java

@@ -44,6 +44,6 @@
 
   public abstract Double calculatePrior(String word);
 
-  public abstract Double calculateLikelihood(String mispelledWord, String word);
+  public abstract Double calculateLikelihood(String misspelledWord, String word);
 
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
index 3dc6152..a916cd3 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java

@@ -21,70 +21,77 @@
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;
-
 import org.apache.opennlp.utils.ngram.NGramUtils;
 
 /**
  * A simple trigram language model for sentences made of <code>String</code> arrays
  */
-public class TrigramSentenceLanguageModel implements LanguageModel<String[]> {
+public class TrigramSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
   @Override
-  public double calculateProbability(Collection<String[]> vocabulary, String[] sample) {
-    double probability = 1d;
-    for (Trigram trigram : getTrigrams(sample)) {
-      if (trigram.getX0() != null && trigram.getX1() != null) {
-        // default
-        probability *= NGramUtils.calculateTrigramMLProbability(trigram.getX0(), trigram.getX1(), trigram.getX2(), vocabulary);
-      } else if (trigram.getX0() == null && trigram.getX1() != null) {
-        // bigram
-        probability *= NGramUtils.calculateBigramMLProbability(trigram.getX2(), trigram.getX1(), vocabulary);
-      } else if (trigram.getX0() == null && trigram.getX1() == null) {
-        // unigram
-        probability *= NGramUtils.calculateUnigramMLProbability(trigram.getX2(), vocabulary);
-      } else {
-        // unexpected
+  public double calculateProbability(Collection<T[]> vocabulary, T[] sample) {
+    double probability = 0d;
+    if (!vocabulary.isEmpty()) {
+      for (Trigram trigram : getTrigrams(sample)) {
+        if (trigram.getX0() != null && trigram.getX1() != null) {
+          // default
+          probability += Math.log(NGramUtils.calculateTrigramMLProbability(trigram.getX0(), trigram.getX1(), trigram.getX2(), vocabulary));
+        } else if (trigram.getX0() == null && trigram.getX1() != null) {
+          // bigram
+          probability += Math.log(NGramUtils.calculateBigramMLProbability(trigram.getX2(), trigram.getX1(), vocabulary));
+        } else if (trigram.getX0() == null) {
+          // unigram
+          probability += Math.log(NGramUtils.calculateUnigramMLProbability(trigram.getX2(), vocabulary));
+        } else {
+          throw new RuntimeException("unexpected");
+        }
+      }
+      if (!Double.isNaN(probability)) {
+        probability = Math.exp(probability);
       }
     }
     return probability;
   }
 
-  private Set<Trigram> getTrigrams(String[] sample) {
+  private Set<Trigram> getTrigrams(T[] sample) {
     Set<Trigram> trigrams = new HashSet<Trigram>();
-    for (int i = 0; i < sample.length - 2; i++) {
-      String x0 = null;
-      String x1 = null;
-      String x2 = sample[i];
-      if (i > 1) {
+    for (int i = 0; i < sample.length; i++) {
+      T x0 = null;
+      T x1 = null;
+      T x2 = sample[i];
+      if (i > 0) {
         x1 = sample[i - 1];
       }
-      if (i > 2) {
+      if (i > 1) {
         x0 = sample[i - 2];
       }
-      trigrams.add(new Trigram(x0, x1, x2));
+      if (x0 != null && x1 != null && x2 != null) {
+        trigrams.add(new Trigram(x0, x1, x2));
+      }
     }
     return trigrams;
   }
 
   private class Trigram {
-    private final String x0;
-    private final String x1;
-    private final String x2;
+    private final T x0;
+    private final T x1;
+    private final T x2;
 
-    private Trigram(String x0, String x1, String x2) {
+    private Trigram(T x0, T x1, T x2) {
       this.x0 = x0;
       this.x1 = x1;
       this.x2 = x2;
     }
 
-    public String getX0() {
+    public T getX0() {
       return x0;
     }
 
-    public String getX1() {
+    public T getX1() {
       return x1;
     }
 
-    public String getX2() {
+    public T getX2() {
       return x2;
     }
   }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
index 2f3d1ac..88eb721 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java

@@ -27,9 +27,9 @@
  */
 public class NGramUtils {
 
-  private static Double count(String x0, String x1, String x2, Collection<String[]> sentences) {
+  private static <T> Double count(T x0, T x1, T x2, Collection<T[]> sentences) {
     Double count = 0d;
-    for (String[] sentence : sentences) {
+    for (T[] sentence : sentences) {
       int idx0 = contains(sentence, x0);
       if (idx0 >= 0) {
         if (idx0 + 2 < sentence.length && x1.equals(sentence[idx0+1]) && x2.equals(sentence[idx0+2])) {
@@ -40,7 +40,7 @@
     return count;
   }
 
-  private static int contains(String[] sentence, String word) {
+  private static <T> int contains(T[] sentence, T word) {
     for (int i = 0; i < sentence.length; i++) {
       if (word.equals(sentence[i])){
         return i;
@@ -49,11 +49,11 @@
     return -1;
   }
 
-  private static Double count(String sequentWord, String precedingWord, Collection<String[]> set) {
+  private static <T> Double count(T sequentWord, T precedingWord, Collection<T[]> set) {
     Double result = 0d;
     boolean foundPreceding = false;
-    for (String[] sentence : set) {
-      for (String w : sentence) {
+    for (T[] sentence : set) {
+      for (T w : sentence) {
         if (precedingWord.equals(w)) {
           foundPreceding = true;
           continue;
@@ -69,10 +69,10 @@
     return result;
   }
 
-  private static Double count(String word, Collection<String[]> set) {
+  private static <T> Double count(T word, Collection<T[]> set) {
     Double result = 0d;
-    for (String[] sentence : set) {
-      for (String w : sentence) {
+    for (T[] sentence : set) {
+      for (T w : sentence) {
         if (word.equals(w))
           result++;
       }
@@ -80,15 +80,15 @@
     return result;
   }
 
-  public static Double calculateLaplaceSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+  public static <T> Double calculateLaplaceSmoothingProbability(T sequentWord, T precedingWord, Collection<T[]> set, Double k) {
     return (count(sequentWord, precedingWord, set) + k) / (count(precedingWord, set) + k * set.size());
   }
 
-  public static Double calculateBigramMLProbability(String sequentWord, String precedingWord, Collection<String[]> set) {
+  public static <T> Double calculateBigramMLProbability(T sequentWord, T precedingWord, Collection<T[]> set) {
     return count(sequentWord, precedingWord, set)/ count(precedingWord, set);
   }
 
-  public static Double calculateTrigramMLProbability(String x0, String x1, String x2, Collection<String[]> sentences) {
+  public static <T> Double calculateTrigramMLProbability(T x0, T x1, T x2, Collection<T[]> sentences) {
     return count(x0, x1, x2, sentences)/ count(x1, x0, sentences);
   }
 
@@ -96,18 +96,18 @@
     return (count(sequentWord, precedingWord, set) + k * calculateUnigramMLProbability(sequentWord, set)) / (count(precedingWord, set) + k * set.size());
   }
 
-  public static Double calculateUnigramMLProbability(String word, Collection<String[]> set) {
+  public static <T> Double calculateUnigramMLProbability(T word, Collection<T[]> set) {
     double vocSize = 0d;
-    for (String[] s : set) {
+    for (T[] s : set) {
       vocSize+= s.length;
     }
     return count(word, set) / vocSize;
   }
 
-  public static Double calculateLinearInterpolationProbability(String x0, String x1, String x2, Collection<String[]> sentences,
+  public static <T> Double calculateLinearInterpolationProbability(T x0, T x1, T x2, Collection<T[]> sentences,
                                                                Double lambda1, Double lambda2, Double lambda3) {
     assert lambda1 + lambda2 + lambda3 == 1 : "lambdas sum should be equals to 1";
-    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should be greater than 0";
+    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should all be greater than 0";
 
     return  lambda1 * calculateTrigramMLProbability(x0, x1, x2, sentences) +
             lambda2 * calculateBigramMLProbability(x2, x1, sentences) +
@@ -115,18 +115,18 @@
 
   }
 
-  private static Collection<String> flatSet(Collection<String[]> set) {
-    Collection<String> flatSet = new HashSet<String>();
-    for (String[] sentence : set){
+  private static <T> Collection<T> flatSet(Collection<T[]> set) {
+    Collection<T> flatSet = new HashSet<T>();
+    for (T[] sentence : set){
       flatSet.addAll(Arrays.asList(sentence));
     }
     return flatSet;
   }
 
-  public static Double calculateMissingBigramProbabilityMass(String x1, Double discount, Collection<String[]> set) {
+  public static <T> Double calculateMissingBigramProbabilityMass(T x1, Double discount, Collection<T[]> set) {
     Double missingMass = 0d;
     Double countWord = count(x1, set);
-    for (String word : flatSet(set)) {
+    for (T word : flatSet(set)) {
       missingMass += (count(word, x1, set) - discount)/ countWord;
     }
     return 1 - missingMass;

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
index 25920f6..9f66c7c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java

@@ -29,12 +29,12 @@
    * @param inputs an array of inputs as <code>double</code>
    * @return a <code>double</code> representing the output
    */
-  public double calculateOutput(double[] inputs);
+  double calculateOutput(double[] inputs);
 
   /**
    * update the internal model's parameters.
    *
    * @param parameters an array of <code>double</code> containing the updated parameters
    */
-  public void updateParameters(double[] parameters);
+  void updateParameters(double[] parameters);
 }

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
index 2c52876..1d97b5b 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java

@@ -19,7 +19,7 @@
 package org.apache.opennlp.utils.regression;
 
 /**
- * Simplest {@link Hypothesis} which just linear combines inputs with weights
+ * Simplest {@link Hypothesis} which just linearly combines inputs with weights
  */
 public class LinearCombinationHypothesis implements Hypothesis {
   private double[] weights;

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
index 1281ea4..5d4b84f 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java

@@ -18,8 +18,9 @@
  */
 package org.apache.opennlp.utils;
 
-import org.apache.opennlp.utils.TrainingExample;
-import org.apache.opennlp.utils.TrainingSet;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Random;
 import org.junit.Ignore;
 
 /**
@@ -28,6 +29,8 @@
 @Ignore
 public class TestUtils {
 
+  private static Random r = new Random();
+
   public static void fillTrainingSet(TrainingSet trainingSet, int size, int dimension) {
     for (int i = 0; i < size; i++) {
       double[] inputs = new double[dimension];
@@ -38,4 +41,24 @@
       trainingSet.add(new TrainingExample(inputs, out));
     }
   }
+
+  public static Collection<String[]> generateRandomVocabulary() {
+    int size = r.nextInt(1000);
+    Collection<String[]> vocabulary = new ArrayList<String[]>(size);
+    for (int i = 0; i < size; i++) {
+      String[] sentence = generateRandomSentence();
+      vocabulary.add(sentence);
+    }
+    return vocabulary;
+  }
+
+  public static String[] generateRandomSentence() {
+    int dimension = r.nextInt(10);
+    String[] sentence = new String[dimension];
+    for (int j = 0; j < dimension; j++) {
+      char c = (char) r.nextInt(10);
+      sentence[j] = c + "-" + c + "-" + c;
+    }
+    return sentence;
+  }
 }

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
index 04399b7..365cb83 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java

@@ -31,8 +31,9 @@
  * Testcase for {@link org.apache.opennlp.utils.anomalydetection.AnomalyDetectionUtils}
  */
 public class AnomalyDetectionUtilsTest {
+
   @Test
-  public void testGaussianDistributionProbability() throws Exception {
+  public void testGaussianDistributionProbabilityFromFitParameters() throws Exception {
     TrainingSet trainingSet = new TrainingSet();
     TestUtils.fillTrainingSet(trainingSet, 100, 5);
     double[] mus = AnomalyDetectionUtils.fitMus(trainingSet);
@@ -46,7 +47,7 @@
   }
 
   @Test
-  public void testGaussianDistributionProbability2() throws Exception {
+  public void testGaussianDistributionProbabilityFromTrainingSet() throws Exception {
     TrainingSet trainingSet = new TrainingSet();
     TestUtils.fillTrainingSet(trainingSet, 100, 5);
     TrainingExample newInput = new TrainingExample(new double[]{1d, 2d, 1000d, 123d, 0.1d}, 0d);

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java
new file mode 100644
index 0000000..323ef6a
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java

@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Arrays;
+import java.util.Collections;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.cfg.CFGBuilder}
+ */
+public class CFGBuilderTest {
+
+  @Test
+  public void testVoidBuild() throws Exception {
+    CFGBuilder builder = CFGBuilder.createCFG();
+    assertNotNull(builder);
+    try {
+      builder.build();
+      fail("cannot build a grammar without V, ∑, R and S");
+    } catch (AssertionError e) {
+      // expected to fail
+    }
+  }
+
+  @Test
+  public void testBuildWithEmptySets() throws Exception {
+    CFGBuilder builder = CFGBuilder.createCFG().
+            withNonTerminals(Collections.<String>emptyList()).
+            withTerminals(Collections.<String>emptyList()).
+            withRules(Collections.<Rule>emptyList()).
+            withStartSymbol("");
+    try {
+      assertNotNull(builder.build());
+      fail("cannot build a grammar whose start symbol doesn't belong to the non terminals symbols set");
+    } catch (AssertionError e) {
+      // expected to fail
+    }
+  }
+
+  @Test
+  public void testBuildWithMinimalGrammarSettings() throws Exception {
+    CFGBuilder builder = CFGBuilder.createCFG().
+            withNonTerminals(Arrays.asList("")).
+            withTerminals(Collections.<String>emptyList()).
+            withRules(Collections.<Rule>emptyList()).
+            withStartSymbol("");
+    assertNotNull(builder.build());
+  }
+
+  @Test
+  public void testBuildWithMinimalGrammarSettingsAndRandomExpansion() throws Exception {
+    CFGBuilder builder = CFGBuilder.createCFG().
+            withNonTerminals(Arrays.asList("")).
+            withTerminals(Collections.<String>emptyList()).
+            withRules(Collections.<Rule>emptyList()).
+            withRandomExpansion(true).
+            withStartSymbol("");
+    assertNotNull(builder.build());
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
index 7f7971f..1534027 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java

@@ -18,13 +18,8 @@
  */
 package org.apache.opennlp.utils.cfg;
 
-import java.util.Arrays;
 import java.util.Collection;
-import java.util.HashSet;
 import java.util.LinkedList;
-import java.util.Set;
-import java.util.TreeSet;
-import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java
new file mode 100644
index 0000000..10e0fac
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collections;
+import org.apache.opennlp.utils.TestUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.languagemodel.NaiveSentenceLanguageModel}
+ */
+public class NaiveSentenceLanguageModelTest {
+
+  @Test
+  public void testEmptyVocabularyProbability() throws Exception {
+    NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>();
+    assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+            new String[0]), 0d);
+    assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+            new String[]{"1", "2", "3"}), 0d);
+  }
+
+  @Test
+  public void testRandomVocabularyAndSentence() throws Exception {
+    NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>();
+    double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence());
+    assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1);
+  }
+
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java
new file mode 100644
index 0000000..b2d6d51
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collections;
+import org.apache.opennlp.utils.TestUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.languagemodel.TrigramSentenceLanguageModel}
+ */
+public class TrigramSentenceLanguageModelTest {
+
+  @Test
+  public void testEmptyVocabularyProbability() throws Exception {
+    TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>();
+    assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+            new String[0]), 0d);
+    assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+            new String[]{"1", "2", "3"}), 0d);
+  }
+
+  @Test
+  public void testRandomVocabularyAndSentence() throws Exception {
+    TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>();
+    double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence());
+    assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1);
+  }
+
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
index 75dbe16..8da4947 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java

@@ -33,41 +33,40 @@
   @Test
   public void testBigram() {
     Collection<String[]> set = new LinkedList<String[]>();
-    set.add(new String[]{"<s>","I","am","Sam","</s>"});
-    set.add(new String[]{"<s>","Sam","I","am","</s>"});
-    set.add(new String[]{"<s>","I","do","not","like","green","eggs","and","ham","</s>"});
+    set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+    set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+    set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
     set.add(new String[]{});
     Double d = NGramUtils.calculateBigramMLProbability("I", "<s>", set);
-    assertTrue(d>0);
-    assertEquals(Double.valueOf(0.6666666666666666d),d);
+    assertTrue(d > 0);
+    assertEquals(Double.valueOf(0.6666666666666666d), d);
     d = NGramUtils.calculateBigramMLProbability("</s>", "Sam", set);
-    assertEquals(Double.valueOf(0.5d),d);
+    assertEquals(Double.valueOf(0.5d), d);
     d = NGramUtils.calculateBigramMLProbability("Sam", "<s>", set);
-    assertEquals(Double.valueOf(0.3333333333333333d),d);
+    assertEquals(Double.valueOf(0.3333333333333333d), d);
   }
 
   @Test
   public void testTrigram() {
     Collection<String[]> set = new LinkedList<String[]>();
-    set.add(new String[]{"<s>","I","am","Sam","</s>"});
-    set.add(new String[]{"<s>","Sam","I","am","</s>"});
-    set.add(new String[]{"<s>","I","do","not","like","green","eggs","and","ham","</s>"});
+    set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+    set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+    set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
     set.add(new String[]{});
-    Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam",set);
-    assertTrue(d>0);
-    assertEquals(Double.valueOf(0.5),d);
-    d = NGramUtils.calculateTrigramMLProbability("Sam","I", "am", set);
-    assertEquals(Double.valueOf(1d),d);
+    Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", set);
+    assertEquals(Double.valueOf(0.5), d);
+    d = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", set);
+    assertEquals(Double.valueOf(1d), d);
   }
 
   @Test
   public void testLinearInterpolation() throws Exception {
     Collection<String[]> set = new LinkedList<String[]>();
-    set.add(new String[]{"the","green","book","STOP"});
-    set.add(new String[]{"my","blue","book","STOP"});
-    set.add(new String[]{"his","green","house","STOP"});
-    set.add(new String[]{"book","STOP"});
-    Double lambda = 1d/3d;
+    set.add(new String[]{"the", "green", "book", "STOP"});
+    set.add(new String[]{"my", "blue", "book", "STOP"});
+    set.add(new String[]{"his", "green", "house", "STOP"});
+    set.add(new String[]{"book", "STOP"});
+    Double lambda = 1d / 3d;
     Double d = NGramUtils.calculateLinearInterpolationProbability("the", "green", "book", set, lambda, lambda, lambda);
     assertNotNull(d);
     assertTrue(d > 0);
@@ -77,9 +76,9 @@
   @Test
   public void testLinearInterpolation2() throws Exception {
     Collection<String[]> set = new LinkedList<String[]>();
-    set.add(new String[]{"D","N","V","STOP"});
-    set.add(new String[]{"D","N","V","STOP"});
-    Double lambda = 1d/3d;
+    set.add(new String[]{"D", "N", "V", "STOP"});
+    set.add(new String[]{"D", "N", "V", "STOP"});
+    Double lambda = 1d / 3d;
     Double d = NGramUtils.calculateLinearInterpolationProbability("N", "V", "STOP", set, lambda, lambda, lambda);
     assertNotNull(d);
     assertTrue(d > 0);
commit	531ccb884335644713e69877bbd069859660951b	[log] [tgz]
author	Tommaso Teofili <tommaso@apache.org>	Tue Sep 16 08:57:11 2014 +0000
committer	Tommaso Teofili <tommaso@apache.org>	Tue Sep 16 08:57:11 2014 +0000
tree	aff4c61f7ba63fcce54092443d2442822a8b9a0e
parent	627d985fb32a1288bd0a623354bbc5df5bb9ed7b [diff]