OPENNLP-713 - fixed some javadocs, using generics in ngrams utils, added more tests to cfg and language modeling packages
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
index e9bd6a0..7d3b33e 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
@@ -61,7 +61,8 @@
}
public ContextFreeGrammar build() {
- assert nonTerminalSymbols != null && terminalSymbols != null && rules != null && startSymbol != null;
+ assert nonTerminalSymbols != null && terminalSymbols != null && rules != null && startSymbol != null :
+ "missing definitions { V : " + nonTerminalSymbols + ", ∑ : " + terminalSymbols + ", R : " + rules + ", S : " + startSymbol + "}";
return new ContextFreeGrammar(nonTerminalSymbols, terminalSymbols, rules, startSymbol, randomExpansion);
}
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
index de64d80..9687d36 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
@@ -28,6 +28,7 @@
* A context free grammar
*/
public class ContextFreeGrammar {
+
private final Collection<String> nonTerminalSymbols;
private final Collection<String> terminalSymbols;
private final Collection<Rule> rules;
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
index 62d6bc7..1a670e6 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
@@ -23,5 +23,5 @@
*/
public interface NaiveBayesClassifier<I, O> {
- public O calculateClass(I inputDocument) throws Exception;
+ O calculateClass(I inputDocument) throws Exception;
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
index 26b6ef8..e84f48c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
@@ -21,18 +21,20 @@
import java.util.Collection;
/**
- * A language model calculate the probability <i>p</i> (between 0 and 1) of a
+ * A language model can calculate the probability <i>p</i> (between 0 and 1) of a
* certain set of <code>T</code> objects, given a vocabulary.
+ * <code>T</code> is usually an {@link java.lang.Iterable} or an array as language models are very commonly used for
+ * sentences, so that T is e.g. an array of <code>String</code>s.
*/
public interface LanguageModel<T> {
/**
- * Calculate the probability of a sentence given a vocabulary
+ * Calculate the probability of a sample, given a vocabulary
*
* @param vocabulary a {@link Collection} of objects of type <code>T</code>
* @param sample the sample to evaluate the probability for
* @return a <code>double</code> between <code>0</code> and <code>1</code>
*/
- public double calculateProbability(Collection<T> vocabulary, T sample);
+ double calculateProbability(Collection<T> vocabulary, T sample);
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
index 1d44e23..abadc23 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
@@ -26,8 +26,21 @@
* a sentence over the no. of sentences in the vocabulary.
*/
public class NaiveSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
@Override
public double calculateProbability(Collection<T[]> vocabulary, T[] sentence) {
- return Collections.frequency(vocabulary, sentence) / vocabulary.size();
+ if (vocabulary.isEmpty()) {
+ return 0d;
+ }
+ // arrays inherit identity-based equals(), so Collections.frequency would only count the very same instance
+ int occurrences = 0;
+ for (T[] candidate : vocabulary) {
+ if (java.util.Arrays.equals(candidate, sentence)) {
+ occurrences++;
+ }
+ }
+ // divide as double: int / int would truncate every probability below 1 to 0
+ return (double) occurrences / vocabulary.size();
}
+
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
index 1c02ec8..fd2b81d 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
@@ -44,6 +44,6 @@
public abstract Double calculatePrior(String word);
- public abstract Double calculateLikelihood(String mispelledWord, String word);
+ public abstract Double calculateLikelihood(String misspelledWord, String word);
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
index 3dc6152..a916cd3 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
@@ -21,70 +21,90 @@
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
-
import org.apache.opennlp.utils.ngram.NGramUtils;
/**
* A simple trigram language model for sentences made of <code>String</code> arrays
*/
-public class TrigramSentenceLanguageModel implements LanguageModel<String[]> {
+public class TrigramSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
+ /**
+ * Scores a sample as the product, accumulated in log space, of the maximum likelihood
+ * estimates of its trigrams.
+ *
+ * @param vocabulary the sentences the n-gram counts are taken from
+ * @param sample the sentence to evaluate
+ * @return <code>0</code> when the vocabulary is empty or an estimate is undefined (e.g. 0/0),
+ * otherwise the product of the trigram estimates (<code>1</code> when the sample yields no trigram)
+ */
@Override
- public double calculateProbability(Collection<String[]> vocabulary, String[] sample) {
- double probability = 1d;
- for (Trigram trigram : getTrigrams(sample)) {
- if (trigram.getX0() != null && trigram.getX1() != null) {
- // default
- probability *= NGramUtils.calculateTrigramMLProbability(trigram.getX0(), trigram.getX1(), trigram.getX2(), vocabulary);
- } else if (trigram.getX0() == null && trigram.getX1() != null) {
- // bigram
- probability *= NGramUtils.calculateBigramMLProbability(trigram.getX2(), trigram.getX1(), vocabulary);
- } else if (trigram.getX0() == null && trigram.getX1() == null) {
- // unigram
- probability *= NGramUtils.calculateUnigramMLProbability(trigram.getX2(), vocabulary);
- } else {
- // unexpected
+ public double calculateProbability(Collection<T[]> vocabulary, T[] sample) {
+ double probability = 0d;
+ if (!vocabulary.isEmpty()) {
+ for (Trigram trigram : getTrigrams(sample)) {
+ if (trigram.getX0() != null && trigram.getX1() != null) {
+ // default
+ probability += Math.log(NGramUtils.calculateTrigramMLProbability(trigram.getX0(), trigram.getX1(), trigram.getX2(), vocabulary));
+ } else if (trigram.getX0() == null && trigram.getX1() != null) {
+ // bigram
+ probability += Math.log(NGramUtils.calculateBigramMLProbability(trigram.getX2(), trigram.getX1(), vocabulary));
+ } else if (trigram.getX0() == null) {
+ // unigram
+ probability += Math.log(NGramUtils.calculateUnigramMLProbability(trigram.getX2(), vocabulary));
+ } else {
+ throw new RuntimeException("unexpected");
+ }
+ }
+ if (!Double.isNaN(probability)) {
+ probability = Math.exp(probability);
+ } else {
+ // an estimate was undefined (e.g. 0/0 for a context never seen in the vocabulary): report 0 instead of leaking NaN
+ probability = 0d;
}
}
return probability;
}
- private Set<Trigram> getTrigrams(String[] sample) {
+ private Set<Trigram> getTrigrams(T[] sample) {
Set<Trigram> trigrams = new HashSet<Trigram>();
- for (int i = 0; i < sample.length - 2; i++) {
- String x0 = null;
- String x1 = null;
- String x2 = sample[i];
- if (i > 1) {
+ for (int i = 0; i < sample.length; i++) {
+ T x0 = null;
+ T x1 = null;
+ T x2 = sample[i];
+ if (i > 0) {
x1 = sample[i - 1];
}
- if (i > 2) {
+ if (i > 1) {
x0 = sample[i - 2];
}
- trigrams.add(new Trigram(x0, x1, x2));
+ // only complete trigrams are kept, so calculateProbability never sees a null x0 or x1
+ if (x0 != null && x1 != null && x2 != null) {
+ trigrams.add(new Trigram(x0, x1, x2));
+ }
}
return trigrams;
}
private class Trigram {
- private final String x0;
- private final String x1;
- private final String x2;
+ private final T x0;
+ private final T x1;
+ private final T x2;
- private Trigram(String x0, String x1, String x2) {
+ private Trigram(T x0, T x1, T x2) {
this.x0 = x0;
this.x1 = x1;
this.x2 = x2;
}
- public String getX0() {
+ public T getX0() {
return x0;
}
- public String getX1() {
+ public T getX1() {
return x1;
}
- public String getX2() {
+ public T getX2() {
return x2;
}
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
index 2f3d1ac..88eb721 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
@@ -27,9 +27,9 @@
*/
public class NGramUtils {
- private static Double count(String x0, String x1, String x2, Collection<String[]> sentences) {
+ private static <T> Double count(T x0, T x1, T x2, Collection<T[]> sentences) {
Double count = 0d;
- for (String[] sentence : sentences) {
+ for (T[] sentence : sentences) {
int idx0 = contains(sentence, x0);
if (idx0 >= 0) {
if (idx0 + 2 < sentence.length && x1.equals(sentence[idx0+1]) && x2.equals(sentence[idx0+2])) {
@@ -40,7 +40,7 @@
return count;
}
- private static int contains(String[] sentence, String word) {
+ private static <T> int contains(T[] sentence, T word) {
for (int i = 0; i < sentence.length; i++) {
if (word.equals(sentence[i])){
return i;
@@ -49,11 +49,11 @@
return -1;
}
- private static Double count(String sequentWord, String precedingWord, Collection<String[]> set) {
+ private static <T> Double count(T sequentWord, T precedingWord, Collection<T[]> set) {
Double result = 0d;
boolean foundPreceding = false;
- for (String[] sentence : set) {
- for (String w : sentence) {
+ for (T[] sentence : set) {
+ for (T w : sentence) {
if (precedingWord.equals(w)) {
foundPreceding = true;
continue;
@@ -69,10 +69,10 @@
return result;
}
- private static Double count(String word, Collection<String[]> set) {
+ private static <T> Double count(T word, Collection<T[]> set) {
Double result = 0d;
- for (String[] sentence : set) {
- for (String w : sentence) {
+ for (T[] sentence : set) {
+ for (T w : sentence) {
if (word.equals(w))
result++;
}
@@ -80,15 +80,15 @@
return result;
}
- public static Double calculateLaplaceSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+ public static <T> Double calculateLaplaceSmoothingProbability(T sequentWord, T precedingWord, Collection<T[]> set, Double k) {
return (count(sequentWord, precedingWord, set) + k) / (count(precedingWord, set) + k * set.size());
}
- public static Double calculateBigramMLProbability(String sequentWord, String precedingWord, Collection<String[]> set) {
+ public static <T> Double calculateBigramMLProbability(T sequentWord, T precedingWord, Collection<T[]> set) {
return count(sequentWord, precedingWord, set)/ count(precedingWord, set);
}
- public static Double calculateTrigramMLProbability(String x0, String x1, String x2, Collection<String[]> sentences) {
+ public static <T> Double calculateTrigramMLProbability(T x0, T x1, T x2, Collection<T[]> sentences) {
return count(x0, x1, x2, sentences)/ count(x1, x0, sentences);
}
@@ -96,18 +96,18 @@
return (count(sequentWord, precedingWord, set) + k * calculateUnigramMLProbability(sequentWord, set)) / (count(precedingWord, set) + k * set.size());
}
- public static Double calculateUnigramMLProbability(String word, Collection<String[]> set) {
+ public static <T> Double calculateUnigramMLProbability(T word, Collection<T[]> set) {
double vocSize = 0d;
- for (String[] s : set) {
+ for (T[] s : set) {
vocSize+= s.length;
}
return count(word, set) / vocSize;
}
- public static Double calculateLinearInterpolationProbability(String x0, String x1, String x2, Collection<String[]> sentences,
+ public static <T> Double calculateLinearInterpolationProbability(T x0, T x1, T x2, Collection<T[]> sentences,
Double lambda1, Double lambda2, Double lambda3) {
assert lambda1 + lambda2 + lambda3 == 1 : "lambdas sum should be equals to 1";
- assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should be greater than 0";
+ assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should all be greater than 0";
return lambda1 * calculateTrigramMLProbability(x0, x1, x2, sentences) +
lambda2 * calculateBigramMLProbability(x2, x1, sentences) +
@@ -115,18 +115,18 @@
}
- private static Collection<String> flatSet(Collection<String[]> set) {
- Collection<String> flatSet = new HashSet<String>();
- for (String[] sentence : set){
+ private static <T> Collection<T> flatSet(Collection<T[]> set) {
+ Collection<T> flatSet = new HashSet<T>();
+ for (T[] sentence : set){
flatSet.addAll(Arrays.asList(sentence));
}
return flatSet;
}
- public static Double calculateMissingBigramProbabilityMass(String x1, Double discount, Collection<String[]> set) {
+ public static <T> Double calculateMissingBigramProbabilityMass(T x1, Double discount, Collection<T[]> set) {
Double missingMass = 0d;
Double countWord = count(x1, set);
- for (String word : flatSet(set)) {
+ for (T word : flatSet(set)) {
missingMass += (count(word, x1, set) - discount)/ countWord;
}
return 1 - missingMass;
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
index 25920f6..9f66c7c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
@@ -29,12 +29,12 @@
* @param inputs an array of inputs as <code>double</code>
* @return a <code>double</code> representing the output
*/
- public double calculateOutput(double[] inputs);
+ double calculateOutput(double[] inputs);
/**
* update the internal model's parameters.
*
* @param parameters an array of <code>double</code> containing the updated parameters
*/
- public void updateParameters(double[] parameters);
+ void updateParameters(double[] parameters);
}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
index 2c52876..1d97b5b 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
@@ -19,7 +19,7 @@
package org.apache.opennlp.utils.regression;
/**
- * Simplest {@link Hypothesis} which just linear combines inputs with weights
+ * Simplest {@link Hypothesis} which just linearly combines inputs with weights
*/
public class LinearCombinationHypothesis implements Hypothesis {
private double[] weights;
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
index 1281ea4..5d4b84f 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
@@ -18,8 +18,9 @@
*/
package org.apache.opennlp.utils;
-import org.apache.opennlp.utils.TrainingExample;
-import org.apache.opennlp.utils.TrainingSet;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Random;
import org.junit.Ignore;
/**
@@ -28,6 +29,8 @@
@Ignore
public class TestUtils {
+ private static Random r = new Random();
+
public static void fillTrainingSet(TrainingSet trainingSet, int size, int dimension) {
for (int i = 0; i < size; i++) {
double[] inputs = new double[dimension];
@@ -38,4 +41,24 @@
trainingSet.add(new TrainingExample(inputs, out));
}
}
+
+ public static Collection<String[]> generateRandomVocabulary() {
+ int size = r.nextInt(1000);
+ Collection<String[]> vocabulary = new ArrayList<String[]>(size);
+ for (int i = 0; i < size; i++) {
+ String[] sentence = generateRandomSentence();
+ vocabulary.add(sentence);
+ }
+ return vocabulary;
+ }
+
+ public static String[] generateRandomSentence() {
+ int dimension = r.nextInt(10);
+ String[] sentence = new String[dimension];
+ for (int j = 0; j < dimension; j++) {
+ char c = (char) ('0' + r.nextInt(10)); // printable digit '0'..'9'; the bare cast yielded control characters U+0000..U+0009
+ sentence[j] = c + "-" + c + "-" + c;
+ }
+ return sentence;
+ }
}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
index 04399b7..365cb83 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
@@ -31,8 +31,9 @@
* Testcase for {@link org.apache.opennlp.utils.anomalydetection.AnomalyDetectionUtils}
*/
public class AnomalyDetectionUtilsTest {
+
@Test
- public void testGaussianDistributionProbability() throws Exception {
+ public void testGaussianDistributionProbabilityFromFitParameters() throws Exception {
TrainingSet trainingSet = new TrainingSet();
TestUtils.fillTrainingSet(trainingSet, 100, 5);
double[] mus = AnomalyDetectionUtils.fitMus(trainingSet);
@@ -46,7 +47,7 @@
}
@Test
- public void testGaussianDistributionProbability2() throws Exception {
+ public void testGaussianDistributionProbabilityFromTrainingSet() throws Exception {
TrainingSet trainingSet = new TrainingSet();
TestUtils.fillTrainingSet(trainingSet, 100, 5);
TrainingExample newInput = new TrainingExample(new double[]{1d, 2d, 1000d, 123d, 0.1d}, 0d);
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java
new file mode 100644
index 0000000..323ef6a
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGBuilderTest.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Arrays;
+import java.util.Collections;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.cfg.CFGBuilder}
+ */
+public class CFGBuilderTest {
+
+ @Test
+ public void testVoidBuild() throws Exception {
+ CFGBuilder builder = CFGBuilder.createCFG();
+ assertNotNull(builder);
+ try {
+ builder.build();
+ } catch (AssertionError e) {
+ return; // expected: build() rejected the incomplete definition
+ }
+ fail("cannot build a grammar without V, ∑, R and S"); // outside the try, otherwise the catch above would swallow it
+ }
+
+ @Test
+ public void testBuildWithEmptySets() throws Exception {
+ CFGBuilder builder = CFGBuilder.createCFG().
+ withNonTerminals(Collections.<String>emptyList()).
+ withTerminals(Collections.<String>emptyList()).
+ withRules(Collections.<Rule>emptyList()).
+ withStartSymbol("");
+ try {
+ builder.build();
+ } catch (AssertionError e) {
+ return; // expected: the start symbol is not among the (empty) non terminals
+ }
+ fail("cannot build a grammar whose start symbol doesn't belong to the non terminals symbols set"); // kept outside the try so the catch cannot swallow it
+ }
+
+ @Test
+ public void testBuildWithMinimalGrammarSettings() throws Exception {
+ CFGBuilder builder = CFGBuilder.createCFG().
+ withNonTerminals(Arrays.asList("")).
+ withTerminals(Collections.<String>emptyList()).
+ withRules(Collections.<Rule>emptyList()).
+ withStartSymbol("");
+ assertNotNull(builder.build());
+ }
+
+ @Test
+ public void testBuildWithMinimalGrammarSettingsAndRandomExpansion() throws Exception {
+ CFGBuilder builder = CFGBuilder.createCFG().
+ withNonTerminals(Arrays.asList("")).
+ withTerminals(Collections.<String>emptyList()).
+ withRules(Collections.<Rule>emptyList()).
+ withRandomExpansion(true).
+ withStartSymbol("");
+ assertNotNull(builder.build());
+ }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
index 7f7971f..1534027 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
@@ -18,13 +18,8 @@
*/
package org.apache.opennlp.utils.cfg;
-import java.util.Arrays;
import java.util.Collection;
-import java.util.HashSet;
import java.util.LinkedList;
-import java.util.Set;
-import java.util.TreeSet;
-import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java
new file mode 100644
index 0000000..10e0fac
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collections;
+import org.apache.opennlp.utils.TestUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.languagemodel.NaiveSentenceLanguageModel}
+ */
+public class NaiveSentenceLanguageModelTest {
+
+ @Test
+ public void testEmptyVocabularyProbability() throws Exception {
+ NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>();
+ assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+ new String[0]), 0d);
+ assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+ new String[]{"1", "2", "3"}), 0d);
+ }
+
+ @Test
+ public void testRandomVocabularyAndSentence() throws Exception {
+ NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>();
+ double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence());
+ assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1);
+ }
+
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java
new file mode 100644
index 0000000..b2d6d51
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collections;
+import org.apache.opennlp.utils.TestUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.languagemodel.TrigramSentenceLanguageModel}
+ */
+public class TrigramSentenceLanguageModelTest {
+
+ @Test
+ public void testEmptyVocabularyProbability() throws Exception {
+ TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>();
+ assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+ new String[0]), 0d);
+ assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(),
+ new String[]{"1", "2", "3"}), 0d);
+ }
+
+ @Test
+ public void testRandomVocabularyAndSentence() throws Exception {
+ TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>();
+ double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence());
+ assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1);
+ }
+
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
index 75dbe16..8da4947 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
@@ -33,41 +33,40 @@
@Test
public void testBigram() {
Collection<String[]> set = new LinkedList<String[]>();
- set.add(new String[]{"<s>","I","am","Sam","</s>"});
- set.add(new String[]{"<s>","Sam","I","am","</s>"});
- set.add(new String[]{"<s>","I","do","not","like","green","eggs","and","ham","</s>"});
+ set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+ set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+ set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
set.add(new String[]{});
Double d = NGramUtils.calculateBigramMLProbability("I", "<s>", set);
- assertTrue(d>0);
- assertEquals(Double.valueOf(0.6666666666666666d),d);
+ assertTrue(d > 0);
+ assertEquals(Double.valueOf(0.6666666666666666d), d);
d = NGramUtils.calculateBigramMLProbability("</s>", "Sam", set);
- assertEquals(Double.valueOf(0.5d),d);
+ assertEquals(Double.valueOf(0.5d), d);
d = NGramUtils.calculateBigramMLProbability("Sam", "<s>", set);
- assertEquals(Double.valueOf(0.3333333333333333d),d);
+ assertEquals(Double.valueOf(0.3333333333333333d), d);
}
@Test
public void testTrigram() {
Collection<String[]> set = new LinkedList<String[]>();
- set.add(new String[]{"<s>","I","am","Sam","</s>"});
- set.add(new String[]{"<s>","Sam","I","am","</s>"});
- set.add(new String[]{"<s>","I","do","not","like","green","eggs","and","ham","</s>"});
+ set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+ set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+ set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
set.add(new String[]{});
- Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam",set);
- assertTrue(d>0);
- assertEquals(Double.valueOf(0.5),d);
- d = NGramUtils.calculateTrigramMLProbability("Sam","I", "am", set);
- assertEquals(Double.valueOf(1d),d);
+ Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", set);
+ assertEquals(Double.valueOf(0.5), d);
+ d = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", set);
+ assertEquals(Double.valueOf(1d), d);
}
@Test
public void testLinearInterpolation() throws Exception {
Collection<String[]> set = new LinkedList<String[]>();
- set.add(new String[]{"the","green","book","STOP"});
- set.add(new String[]{"my","blue","book","STOP"});
- set.add(new String[]{"his","green","house","STOP"});
- set.add(new String[]{"book","STOP"});
- Double lambda = 1d/3d;
+ set.add(new String[]{"the", "green", "book", "STOP"});
+ set.add(new String[]{"my", "blue", "book", "STOP"});
+ set.add(new String[]{"his", "green", "house", "STOP"});
+ set.add(new String[]{"book", "STOP"});
+ Double lambda = 1d / 3d;
Double d = NGramUtils.calculateLinearInterpolationProbability("the", "green", "book", set, lambda, lambda, lambda);
assertNotNull(d);
assertTrue(d > 0);
@@ -77,9 +76,9 @@
@Test
public void testLinearInterpolation2() throws Exception {
Collection<String[]> set = new LinkedList<String[]>();
- set.add(new String[]{"D","N","V","STOP"});
- set.add(new String[]{"D","N","V","STOP"});
- Double lambda = 1d/3d;
+ set.add(new String[]{"D", "N", "V", "STOP"});
+ set.add(new String[]{"D", "N", "V", "STOP"});
+ Double lambda = 1d / 3d;
Double d = NGramUtils.calculateLinearInterpolationProbability("N", "V", "STOP", set, lambda, lambda, lambda);
assertNotNull(d);
assertTrue(d > 0);