OPENNLP-657 Initial pull of the nlp-utils provided by Tommaso Teofili. Thanks for contributing.
diff --git a/nlp-utils/README.md b/nlp-utils/README.md
new file mode 100644
index 0000000..9782787
--- /dev/null
+++ b/nlp-utils/README.md
@@ -0,0 +1,10 @@
+nlp-utils
+=========
+A set of utilities for the most common NLP tasks.
+It currently provides tools for:
+* n-grams
+* naive Bayes classification
+* gradient descent / regression
+* anomaly detection
+* language modeling
+* context free grammars (CFG)
diff --git a/nlp-utils/pom.xml b/nlp-utils/pom.xml
new file mode 100644
index 0000000..d32a6ae
--- /dev/null
+++ b/nlp-utils/pom.xml
@@ -0,0 +1,54 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>nlp-utils</artifactId>
+ <version>0.1-SNAPSHOT</version>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ <scope>test</scope> <!-- JUnit is on the classpath of the tests only -->
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <compilerVersion>1.6</compilerVersion>
+ <source>1.6</source> <!-- language level accepted in the sources -->
+ <target>1.6</target> <!-- version of the generated class files -->
+ <encoding>UTF-8</encoding> <!-- same value as project.build.sourceEncoding above -->
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java
new file mode 100644
index 0000000..6a08c7a
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.opennlp.utils;
+
+/**
+ * A single supervised sample: a vector of input feature values together with
+ * the output value associated with them.
+ * <p/>
+ * The feature array is kept and handed out by reference, no copy is made.
+ */
+public class TrainingExample {
+
+  private final double output;
+  private final double[] inputs;
+
+  /**
+   * Creates an example from its feature values and the related output.
+   *
+   * @param inputs the feature values; the array itself is stored, not a copy
+   * @param output the output value associated with the given features
+   */
+  public TrainingExample(double[] inputs, double output) {
+    this.output = output;
+    this.inputs = inputs;
+  }
+
+  /**
+   * @return the output value of this example
+   */
+  public double getOutput() {
+    return output;
+  }
+
+  /**
+   * @return the very same feature array that was passed to the constructor
+   */
+  public double[] getInputs() {
+    return inputs;
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java
new file mode 100644
index 0000000..174f8f3
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+
+/**
+ * An {@link Iterable} collection of {@link TrainingExample}s.
+ * <p/>
+ * Examples are iterated in the order they were added. Membership is decided by
+ * {@link Object#equals(Object)} of the examples, so an example which is already
+ * contained is not added a second time.
+ */
+public class TrainingSet implements Iterable<TrainingExample> {
+
+  // Insertion ordered on purpose: a plain HashSet would iterate in hash code order, which
+  // for objects with identity hash codes differs from run to run and makes order sensitive
+  // computations (e.g. floating point sums over the examples) irreproducible.
+  private final Collection<TrainingExample> trainingExamples = new LinkedHashSet<TrainingExample>();
+
+  /**
+   * @return an iterator over the examples, in the order they were added
+   */
+  @Override
+  public Iterator<TrainingExample> iterator() {
+    return trainingExamples.iterator();
+  }
+
+  /**
+   * Adds an example to this set.
+   *
+   * @param trainingExample the example to add
+   */
+  public void add(TrainingExample trainingExample) {
+    trainingExamples.add(trainingExample);
+  }
+
+  /**
+   * @return the number of examples held by this set
+   */
+  public int size() {
+    return trainingExamples.size();
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java
new file mode 100644
index 0000000..0fdf326
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.anomalydetection;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class for anomaly detection, modeling every feature with an independent
+ * Gaussian distribution.
+ */
+public class AnomalyDetectionUtils {
+
+  /**
+   * calculate Mu (mean) distribution parameters for a {@link TrainingSet}'s set of features.
+   * The number of features is taken from the first example of the set.
+   *
+   * @param inputs the {@link TrainingSet} to fit, must contain at least one example
+   * @return the <code>double[]</code> containing the Mu parameters for each feature
+   */
+  public static double[] fitMus(TrainingSet inputs) {
+    assert inputs != null && inputs.size() > 0 : "empty dataset";
+    int size = inputs.iterator().next().getInputs().length;
+    double[] result = new double[size];
+    for (TrainingExample trainingExample : inputs) {
+      double[] features = trainingExample.getInputs();
+      for (int i = 0; i < size; i++) {
+        result[i] += features[i];
+      }
+    }
+    for (int i = 0; i < size; i++) {
+      result[i] /= inputs.size();
+    }
+    return result;
+  }
+
+  /**
+   * calculates the variance (squared standard deviation) parameters for the given
+   * {@link TrainingSet}, dividing by the number of examples.
+   *
+   * @param mus    mean parameters, one for each feature
+   * @param inputs the {@link TrainingSet} to fit, must contain at least one example
+   * @return the <code>double[]</code> containing the variance of each feature
+   */
+  public static double[] fitSigmas(double[] mus, TrainingSet inputs) {
+    assert inputs != null && inputs.size() > 0 : "empty dataset";
+    int size = inputs.iterator().next().getInputs().length;
+    double[] result = new double[size];
+    for (TrainingExample trainingExample : inputs) {
+      double[] features = trainingExample.getInputs();
+      for (int i = 0; i < size; i++) {
+        double deviation = features[i] - mus[i];
+        result[i] += deviation * deviation;
+      }
+    }
+    for (int i = 0; i < size; i++) {
+      result[i] /= inputs.size();
+    }
+    return result;
+  }
+
+  /**
+   * calculate the probability (density) of a certain input
+   *
+   * @param x      the input
+   * @param mus    the means for the modeled features
+   * @param sigmas the variances for the modeled features, as returned by
+   *               {@link #fitSigmas(double[], TrainingSet)}
+   * @return the probability of the given input
+   * @throws ArithmeticException if the variance of a feature is not greater than zero
+   */
+  public static double getGaussianProbability(TrainingExample x, double[] mus, double[] sigmas) {
+    return calculateGaussianProbability(x, mus, sigmas);
+  }
+
+  /**
+   * calculate the probability (density) of a certain input in a certain training set
+   *
+   * @param x   the input
+   * @param set the training set
+   * @return the probability of the given input
+   * @throws ArithmeticException if the fitted variance of a feature is not greater than zero
+   * @throws Exception never thrown by this implementation, kept in the signature for compatibility
+   */
+  public static double getGaussianProbability(TrainingExample x, TrainingSet set) throws Exception {
+    double[] mus = fitMus(set);
+    double[] sigmas = fitSigmas(mus, set);
+    return calculateGaussianProbability(x, mus, sigmas);
+  }
+
+  /**
+   * Evaluates the product of the univariate Gaussian densities of all the features. The
+   * product is accumulated as a sum of logarithms, so that many small factors do not
+   * underflow before the result is complete.
+   */
+  private static double calculateGaussianProbability(TrainingExample x, double[] mus,
+                                                     double[] sigmas) {
+    assert mus.length == sigmas.length : "parameters not aligned";
+    double[] features = x.getInputs();
+    double logPx = 0d;
+    for (int i = 0; i < mus.length; i++) {
+      // sigmas holds variances (sigma^2), hence no further squaring in either term
+      double variance = sigmas[i];
+      if (!(variance > 0d)) {
+        throw new ArithmeticException("variance of feature " + i + " is not positive: " + variance);
+      }
+      double deviation = features[i] - mus[i];
+      // log of 1 / sqrt(2 * PI * sigma^2) * exp(-(x - mu)^2 / (2 * sigma^2))
+      logPx -= 0.5d * Math.log(2d * Math.PI * variance) + deviation * deviation / (2d * variance);
+    }
+    return Math.exp(logPx);
+  }
+
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
new file mode 100644
index 0000000..716f7e9
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Set;
+
+/**
+ * Fluent builder of {@link ContextFreeGrammar}s: collect the parts of the grammar
+ * with the <code>with*</code> methods, then call {@link #build()}.
+ */
+public class CFGBuilder {
+
+  private String startSymbol;
+  private Set<Rule> rules;
+  private Set<String> terminalSymbols;
+  private Set<String> nonTerminalSymbols;
+
+  /**
+   * @return a new builder with nothing set yet
+   */
+  public static CFGBuilder createCFG() {
+    return new CFGBuilder();
+  }
+
+  /**
+   * @param terminalSymbols the terminal symbols of the grammar
+   * @return this builder
+   */
+  public CFGBuilder withTerminals(Set<String> terminalSymbols) {
+    this.terminalSymbols = terminalSymbols;
+    return this;
+  }
+
+  /**
+   * @param nonTerminalSymbols the non terminal symbols of the grammar
+   * @return this builder
+   */
+  public CFGBuilder withNonTerminals(Set<String> nonTerminalSymbols) {
+    this.nonTerminalSymbols = nonTerminalSymbols;
+    return this;
+  }
+
+  /**
+   * @param rules the rules of the grammar
+   * @return this builder
+   */
+  public CFGBuilder withRules(Set<Rule> rules) {
+    this.rules = rules;
+    return this;
+  }
+
+  /**
+   * @param startSymbol the start symbol of the grammar
+   * @return this builder
+   */
+  public CFGBuilder withStartSymbol(String startSymbol) {
+    this.startSymbol = startSymbol;
+    return this;
+  }
+
+  /**
+   * @return a {@link ContextFreeGrammar} created from the values set so far; the sets are
+   *         passed on as they are, without being copied
+   */
+  public ContextFreeGrammar build() {
+    return new ContextFreeGrammar(nonTerminalSymbols, terminalSymbols, rules, startSymbol);
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
new file mode 100644
index 0000000..c3b90f9
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * A context free grammar, able to expand symbols into sequences of terminal symbols.
+ */
+public class ContextFreeGrammar {
+
+  private final Set<String> nonTerminalSymbols;
+  private final Set<String> terminalSymbols;
+  private final Set<Rule> rules;
+  private final String startSymbol;
+
+  // one generator for the whole grammar instead of a new one for each expanded symbol
+  private final Random random = new Random();
+
+  /**
+   * @param nonTerminalSymbols the symbols which get expanded by rules
+   * @param terminalSymbols    the symbols which are emitted as they are
+   * @param rules              the rules used for expanding non terminal symbols
+   * @param startSymbol        the start symbol, has to be one of the non terminal symbols
+   */
+  public ContextFreeGrammar(Set<String> nonTerminalSymbols, Set<String> terminalSymbols, Set<Rule> rules, String startSymbol) {
+    assert nonTerminalSymbols.contains(startSymbol) : "start symbol doesn't belong to non-terminal symbols set";
+
+    this.nonTerminalSymbols = nonTerminalSymbols;
+    this.terminalSymbols = terminalSymbols;
+    this.rules = rules;
+    this.startSymbol = startSymbol;
+  }
+
+  /**
+   * Expands the given symbols, from left to right, until only terminal symbols are left.
+   * When more than one rule exists for a non terminal symbol, one of them is picked at random.
+   *
+   * @param words the symbols to expand, the first one is expected to be the start symbol
+   * @return the resulting sequence of terminal symbols
+   * @throws RuntimeException if a symbol to expand has no rule
+   */
+  public String[] leftMostDerivation(String... words) {
+    assert words.length > 0 && startSymbol.equals(words[0]);
+
+    List<String> expansion = new ArrayList<String>(words.length);
+    for (String word : words) {
+      collectTerminals(word, expansion);
+    }
+    return expansion.toArray(new String[expansion.size()]);
+  }
+
+  /**
+   * Appends to <code>terminals</code> the terminal symbols <code>word</code> expands to.
+   * Everything is appended to one shared list, rather than creating and copying a new
+   * collection at each level of the recursion.
+   */
+  private void collectTerminals(String word, List<String> terminals) {
+    if (terminalSymbols.contains(word)) {
+      terminals.add(word);
+      return;
+    }
+    assert nonTerminalSymbols.contains(word) : "word " + word + " is not contained in non terminals";
+    for (String symbol : getRuleForSymbol(word).getExpansion()) {
+      collectTerminals(symbol, terminals);
+    }
+  }
+
+  /**
+   * Picks, at random, one of the rules whose entry is the given symbol.
+   */
+  private Rule getRuleForSymbol(String word) {
+    List<Rule> possibleRules = new ArrayList<Rule>();
+    for (Rule r : rules) {
+      if (word.equals(r.getEntry())) {
+        possibleRules.add(r);
+      }
+    }
+    if (possibleRules.isEmpty()) {
+      throw new RuntimeException("could not find a rule for expanding symbol " + word);
+    }
+    return possibleRules.get(random.nextInt(possibleRules.size()));
+  }
+
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java
new file mode 100644
index 0000000..a017b82
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Arrays;
+
+/**
+ * A rule for context free grammars: an entry symbol and the symbols it expands to.
+ * <p/>
+ * Two rules are equal when both the entry and the expansion are equal; the natural
+ * ordering (by entry first, then by expansion) is consistent with equals.
+ */
+public class Rule implements Comparable<Rule> {
+  private final String entry;
+  private final String[] expansion;
+
+  /**
+   * @param entry     the symbol this rule applies to
+   * @param expansion the symbols the entry is replaced with
+   */
+  public Rule(String entry, String... expansion) {
+    this.entry = entry;
+    this.expansion = expansion;
+  }
+
+  /**
+   * @return the symbol this rule applies to
+   */
+  public String getEntry() {
+    return entry;
+  }
+
+  /**
+   * @return the symbols the entry is replaced with (the internal array, not a copy)
+   */
+  public String[] getExpansion() {
+    return expansion;
+  }
+
+  /**
+   * Orders rules by entry and, for the same entry, by expansion (symbol by symbol, a
+   * shorter expansion coming before a longer one it is a prefix of). Comparing the entry
+   * only would make alternative rules of one symbol look like duplicates to sorted
+   * collections, e.g. a {@link java.util.TreeSet} would keep just one of them.
+   */
+  @Override
+  public int compareTo(Rule o) {
+    int byEntry = entry.compareTo(o.getEntry());
+    if (byEntry != 0) {
+      return byEntry;
+    }
+    String[] other = o.getExpansion();
+    int common = Math.min(expansion.length, other.length);
+    for (int i = 0; i < common; i++) {
+      int bySymbol = expansion[i].compareTo(other[i]);
+      if (bySymbol != 0) {
+        return bySymbol;
+      }
+    }
+    if (expansion.length == other.length) {
+      return 0;
+    }
+    return expansion.length < other.length ? -1 : 1;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof Rule)) {
+      return false;
+    }
+    Rule other = (Rule) o;
+    boolean sameEntry = entry == null ? other.entry == null : entry.equals(other.entry);
+    return sameEntry && Arrays.equals(expansion, other.expansion);
+  }
+
+  @Override
+  public int hashCode() {
+    return 31 * (entry == null ? 0 : entry.hashCode()) + Arrays.hashCode(expansion);
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
new file mode 100644
index 0000000..62d6bc7
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+/**
+ * A Naive Bayes classifier, assigning a class of type <code>O</code> to inputs of type
+ * <code>I</code>.
+ *
+ * @param <I> the type of the inputs to classify
+ * @param <O> the type of the assigned classes
+ */
+public interface NaiveBayesClassifier<I, O> {
+
+  /**
+   * Finds the class of the given input.
+   *
+   * @param inputDocument the input to classify
+   * @return the class assigned to the input
+   * @throws Exception if the classification fails
+   */
+  O calculateClass(I inputDocument) throws Exception;
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java
new file mode 100644
index 0000000..b7bf33c
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * C = argmax( P(d|c) * P(c) )
+ * where P(d|c) is called: likelihood
+ * and P(c) is called: prior - we can count relative frequencies in a corpus
+ * and d is a vector of features
+ * <p/>
+ * we assume:
+ * 1. bag of words assumption: positions don't matter
+ * 2. conditional independence: the feature probabilities are independent given a class
+ * <p/>
+ * thus P(d|c) == P(x1,..,xn|c) == P(x1|c)*...P(xn|c)
+ * <p/>
+ * Word probabilities use add 1 smoothing over the set of distinct words of the corpus.
+ * Scores are computed as sums of logarithms, because the plain product of many small
+ * probabilities underflows to zero for long documents.
+ * <p/>
+ * All the counts are collected once, at construction time, from the given corpus.
+ */
+public class SimpleNaiveBayesClassifier implements NaiveBayesClassifier<String, String> {
+
+  // the distinct words of the corpus
+  private final Set<String> vocabulary = new HashSet<String>();
+  // key is the class, value holds the no. of occurrences of each word in the docs of the class
+  private final Map<String, Map<String, Integer>> wordCounts = new HashMap<String, Map<String, Integer>>();
+  // key is the class, value is the total no. of words in the docs of the class
+  private final Map<String, Integer> totalWords = new HashMap<String, Integer>();
+  // key is the class, value is the no. of docs of the class
+  private final Map<String, Integer> docCounts = new HashMap<String, Integer>();
+  // key is the class, value is log(P(c))
+  private final Map<String, Double> logPriors = new HashMap<String, Double>();
+
+  /**
+   * @param trainedCorpus the training corpus, holding the doc as a key and its class as a value
+   */
+  public SimpleNaiveBayesClassifier(Map<String, String> trainedCorpus) {
+    for (Map.Entry<String, String> docWithClass : trainedCorpus.entrySet()) {
+      countDoc(docWithClass.getKey(), docWithClass.getValue());
+    }
+    for (Map.Entry<String, Integer> docCount : docCounts.entrySet()) {
+      // P(c) = no. of docs of class c / no. of docs
+      logPriors.put(docCount.getKey(), Math.log((double) docCount.getValue() / trainedCorpus.size()));
+    }
+  }
+
+  /**
+   * Adds the words of a document to the vocabulary and to the counts of its class.
+   */
+  private void countDoc(String doc, String cl) {
+    Integer docs = docCounts.get(cl);
+    docCounts.put(cl, docs == null ? 1 : docs + 1);
+
+    Map<String, Integer> counts = wordCounts.get(cl);
+    if (counts == null) {
+      counts = new HashMap<String, Integer>();
+      wordCounts.put(cl, counts);
+    }
+    String[] words = tokenizeDoc(doc);
+    for (String word : words) {
+      vocabulary.add(word);
+      Integer count = counts.get(word);
+      counts.put(word, count == null ? 1 : count + 1);
+    }
+    Integer total = totalWords.get(cl);
+    totalWords.put(cl, total == null ? words.length : total + words.length);
+  }
+
+  private String[] tokenizeDoc(String doc) {
+    // TODO : this is by far not a tokenization, it should be changed
+    return doc.split(" ");
+  }
+
+  /**
+   * Finds the class maximizing P(d|c) * P(c) for the given document.
+   *
+   * @param inputDocument the text to classify, it gets split into words on single spaces
+   * @return the most probable class, or <code>null</code> if the corpus was empty
+   */
+  @Override
+  public String calculateClass(String inputDocument) {
+    String[] words = tokenizeDoc(inputDocument);
+    double max = Double.NEGATIVE_INFINITY;
+    String foundClass = null;
+    for (String cl : wordCounts.keySet()) {
+      double clVal = logPriors.get(cl) + calculateLogLikelihood(words, cl);
+      if (clVal > max) {
+        max = clVal;
+        foundClass = cl;
+      }
+    }
+    return foundClass;
+  }
+
+  /**
+   * log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c))
+   */
+  private double calculateLogLikelihood(String[] words, String c) {
+    Map<String, Integer> counts = wordCounts.get(c);
+    // den : for the whole dictionary, count the no of times a word appears in documents of
+    // class c (+|V| because of add 1 smoothing); it does not depend on the current word
+    double den = totalWords.get(c) + vocabulary.size();
+
+    double result = 0d;
+    for (String word : words) {
+      // num : count the no of times the word appears in documents of class c (+1)
+      Integer count = counts.get(word);
+      double num = (count == null ? 0 : count) + 1; // +1 is added because of add 1 smoothing
+
+      // P(w|c) = num/den
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java
new file mode 100644
index 0000000..dab6f60
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+/**
+ * A Naive Bayes classifier with add 1 smoothing which can be trained incrementally,
+ * one example at a time, via {@link #addExample(String, java.util.List)}.
+ * <p/>
+ * Only raw counts are stored; priors and smoothing denominators depend on the total
+ * no. of docs and on the vocabulary size, which change whenever an example is added to
+ * any class, so they are derived from the counts at classification time.
+ */
+public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier<List<String>, String> {
+
+  private final Collection<String> vocabulary = new TreeSet<String>(); // the distinct words of the corpus
+  private final Map<String, Integer> classCounts = new LinkedHashMap<String, Integer>(); // class -> no. of docs
+  private double noDocs = 0d;
+  private final Map<String, Map<String, Integer>> nm = new HashMap<String, Map<String, Integer>>(); // class -> word -> count
+  private final Map<String, Integer> wordTotals = new HashMap<String, Integer>(); // class -> total no. of words
+
+  /**
+   * Adds a training example.
+   *
+   * @param klass the class of the example
+   * @param words the words of the example
+   */
+  public void addExample(String klass, List<String> words) {
+    vocabulary.addAll(words);
+
+    Integer docs = classCounts.get(klass);
+    classCounts.put(klass, docs != null ? docs + 1 : 1);
+    noDocs++;
+
+    Map<String, Integer> wordCountsForClass = nm.get(klass);
+    if (wordCountsForClass == null) {
+      wordCountsForClass = new HashMap<String, Integer>();
+      nm.put(klass, wordCountsForClass);
+    }
+    for (String w : words) {
+      Integer count = wordCountsForClass.get(w);
+      wordCountsForClass.put(w, count != null ? count + 1 : 1);
+    }
+    Integer total = wordTotals.get(klass);
+    wordTotals.put(klass, (total != null ? total : 0) + words.size());
+  }
+
+  /**
+   * Finds the class maximizing log(P(c)) + log(P(d|c)) for the given words.
+   *
+   * @param words the words of the document to classify
+   * @return the most probable class, or <code>null</code> if no example was added yet
+   * @throws Exception never thrown by this implementation, kept in the signature for compatibility
+   */
+  @Override
+  public String calculateClass(List<String> words) throws Exception {
+    double max = Double.NEGATIVE_INFINITY;
+    String foundClass = null;
+    for (String cl : classCounts.keySet()) {
+      double clVal = calculatePrior(cl) + calculateLikelihood(words, cl);
+      if (clVal > max) {
+        max = clVal;
+        foundClass = cl;
+      }
+    }
+    return foundClass;
+  }
+
+  /**
+   * log(P(d|c)) = log(P(w1|c)) + ... + log(P(wn|c))
+   */
+  private double calculateLikelihood(List<String> words, String c) {
+    Map<String, Integer> wordFreqs = nm.get(c);
+    // den : for the whole dictionary, count the no of times a word appears in documents of class c,
+    // +|V| is added because of add 1 smoothing, +1 for unknown words
+    double den = wordTotals.get(c) + vocabulary.size() + 1;
+
+    double result = 0d;
+    for (String word : words) {
+      // num : count the no of times the word appears in documents of class c (+1)
+      Integer freq = wordFreqs.get(word);
+      double num = (freq != null ? freq : 0) + 1d; // +1 is added because of add 1 smoothing
+
+      // P(w|c) = num/den
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+
+  /**
+   * log(P(c)), where P(c) = no. of docs of class c / no. of docs
+   */
+  private double calculatePrior(String currentClass) {
+    return Math.log(classCounts.get(currentClass) / noDocs);
+  }
+
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
new file mode 100644
index 0000000..26b6ef8
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collection;
+
+/**
+ * A language model calculates the probability <i>p</i> (between 0 and 1) of a
+ * certain sample of type <code>T</code>, given a vocabulary.
+ *
+ * @param <T> the type of the samples and of the items of the vocabulary
+ */
+public interface LanguageModel<T> {
+
+  /**
+   * Calculates the probability of a sample given a vocabulary.
+   *
+   * @param vocabulary a {@link Collection} of objects of type <code>T</code>
+   * @param sample     the sample to evaluate the probability for
+   * @return a <code>double</code> between <code>0</code> and <code>1</code>
+   */
+  double calculateProbability(Collection<T> vocabulary, T sample);
+
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
new file mode 100644
index 0000000..1d44e23
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+/**
+ * Simple sentence language model which just counts the no. of occurrences of
+ * a sentence over the no. of sentences in the vocabulary.
+ * <p/>
+ * Sentences are compared by content ({@link Arrays#equals(Object[], Object[])}): arrays
+ * do not override <code>equals</code>, so an identity based count would ignore
+ * sentences made of the same words which are held by different array instances.
+ */
+public class NaiveSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
+  /**
+   * @param vocabulary the known sentences
+   * @param sentence   the sentence to evaluate the probability for
+   * @return the relative frequency of the sentence in the vocabulary, <code>0</code> if
+   *         the vocabulary is empty
+   */
+  @Override
+  public double calculateProbability(Collection<T[]> vocabulary, T[] sentence) {
+    if (vocabulary.isEmpty()) {
+      return 0d;
+    }
+    int occurrences = 0;
+    for (T[] knownSentence : vocabulary) {
+      if (Arrays.equals(knownSentence, sentence)) {
+        occurrences++;
+      }
+    }
+    // floating point division: dividing two ints would truncate the result to 0 or 1
+    return (double) occurrences / vocabulary.size();
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
new file mode 100644
index 0000000..1c02ec8
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+/** Abstract impl of a noisy channel: picks the dictionary word maximizing likelihood * prior. */
+public abstract class NoisyChannel {
+
+  private String[] dictionary;
+
+  public void initializeDictionary(String[] dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /** @return the best scoring dictionary word, <code>null</code> when no word scores above zero */
+  public String findCorrection(String mispelledWord) {
+    if (dictionary == null) {
+      throw new IllegalStateException("dictionary not initialized, call initializeDictionary first");
+    }
+    double bestScore = 0d;
+    String correctWord = null;
+    for (String word : dictionary) {
+      double score = calculateLikelihood(mispelledWord, word) * calculatePrior(word);
+      if (score > bestScore) {
+        bestScore = score;
+        correctWord = word;
+      }
+    }
+    return correctWord;
+  }
+
+  public abstract Double calculatePrior(String word);
+
+  public abstract Double calculateLikelihood(String mispelledWord, String word);
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
new file mode 100644
index 0000000..3dc6152
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.opennlp.utils.ngram.NGramUtils;
+
+/**
+ * A simple trigram language model for sentences made of <code>String</code> arrays.
+ * The probability of a sentence is the product, over its words, of the maximum likelihood
+ * estimate of each word given the two words before it; the first and the second word of a
+ * sentence fall back to the unigram and bigram estimates respectively.
+ */
+public class TrigramSentenceLanguageModel implements LanguageModel<String[]> {
+  @Override
+  public double calculateProbability(Collection<String[]> vocabulary, String[] sample) {
+    double probability = 1d;
+    for (Trigram trigram : getTrigrams(sample)) {
+      if (trigram.getX1() == null) {
+        // first word of the sentence: no context available, use the unigram estimate
+        probability *= NGramUtils.calculateUnigramMLProbability(trigram.getX2(), vocabulary);
+      } else if (trigram.getX0() == null) {
+        // second word of the sentence: only one preceding word, use the bigram estimate
+        probability *= NGramUtils.calculateBigramMLProbability(trigram.getX2(), trigram.getX1(), vocabulary);
+      } else {
+        // default: full trigram estimate
+        probability *= NGramUtils.calculateTrigramMLProbability(trigram.getX0(), trigram.getX1(), trigram.getX2(), vocabulary);
+      }
+    }
+    return probability;
+  }
+
+  /**
+   * Builds one (x0, x1, x2) entry per word of the sample: x2 is the word itself, x1 and x0 are
+   * the words one and two positions before it, <code>null</code> when before the sentence start.
+   */
+  private Set<Trigram> getTrigrams(String[] sample) {
+    // Trigram does not define equals/hashCode, so repeated trigrams are all retained by the set
+    Set<Trigram> trigrams = new HashSet<Trigram>();
+    for (int i = 0; i < sample.length; i++) {
+      String x1 = i > 0 ? sample[i - 1] : null;
+      String x0 = i > 1 ? sample[i - 2] : null;
+      trigrams.add(new Trigram(x0, x1, sample[i]));
+    }
+    return trigrams;
+  }
+
+  /** A word (x2) together with its two preceding words (x0, x1), either of which may be null. */
+  private static final class Trigram {
+    private final String x0;
+    private final String x1;
+    private final String x2;
+
+    private Trigram(String x0, String x1, String x2) {
+      this.x0 = x0;
+      this.x1 = x1;
+      this.x2 = x2;
+    }
+
+    public String getX0() {
+      return x0;
+    }
+
+    public String getX1() {
+      return x1;
+    }
+
+    public String getX2() {
+      return x2;
+    }
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
new file mode 100644
index 0000000..2f3d1ac
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.ngram;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * utility class for calculating probabilities of tri/bi/uni-grams
+ */
+public class NGramUtils {
+
+  /**
+   * counts every occurrence of the trigram <code>x0 x1 x2</code>, also when it appears more
+   * than once in the same sentence or when <code>x0</code> occurs earlier without the others
+   */
+  private static double count(String x0, String x1, String x2, Collection<String[]> sentences) {
+    double count = 0d;
+    for (String[] sentence : sentences) {
+      for (int i = 0; i + 2 < sentence.length; i++) {
+        if (x0.equals(sentence[i]) && x1.equals(sentence[i + 1]) && x2.equals(sentence[i + 2])) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+
+  /**
+   * counts the occurrences of <code>precedingWord</code> immediately followed by
+   * <code>sequentWord</code>; pairs never span two sentences and a word may follow itself
+   */
+  private static double count(String sequentWord, String precedingWord, Collection<String[]> set) {
+    double result = 0d;
+    for (String[] sentence : set) {
+      for (int i = 0; i + 1 < sentence.length; i++) {
+        if (precedingWord.equals(sentence[i]) && sequentWord.equals(sentence[i + 1])) {
+          result++;
+        }
+      }
+    }
+    return result;
+  }
+
+  /** counts the occurrences of <code>word</code> in all the sentences */
+  private static double count(String word, Collection<String[]> set) {
+    double result = 0d;
+    for (String[] sentence : set) {
+      for (String w : sentence) {
+        if (word.equals(w)) {
+          result++;
+        }
+      }
+    }
+    return result;
+  }
+
+  /** add-k smoothing of the bigram estimate, with <code>k * set.size()</code> added to the denominator */
+  public static Double calculateLaplaceSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+    return (count(sequentWord, precedingWord, set) + k) / (count(precedingWord, set) + k * set.size());
+  }
+
+  /** maximum likelihood bigram estimate: <code>count(preceding sequent) / count(preceding)</code> */
+  public static Double calculateBigramMLProbability(String sequentWord, String precedingWord, Collection<String[]> set) {
+    return count(sequentWord, precedingWord, set) / count(precedingWord, set);
+  }
+
+  /** maximum likelihood trigram estimate: <code>count(x0 x1 x2) / count(x0 x1)</code> */
+  public static Double calculateTrigramMLProbability(String x0, String x1, String x2, Collection<String[]> sentences) {
+    return count(x0, x1, x2, sentences) / count(x1, x0, sentences);
+  }
+
+  public static Double calculateBigramPriorSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+    return (count(sequentWord, precedingWord, set) + k * calculateUnigramMLProbability(sequentWord, set)) / (count(precedingWord, set) + k * set.size());
+  }
+
+  /** maximum likelihood unigram estimate: <code>count(word) / total number of words</code> */
+  public static Double calculateUnigramMLProbability(String word, Collection<String[]> set) {
+    double vocSize = 0d;
+    for (String[] s : set) {
+      vocSize += s.length;
+    }
+    return count(word, set) / vocSize;
+  }
+
+  /**
+   * interpolates the trigram, bigram and unigram ML estimates of <code>x2</code> using the weights
+   * <code>lambda1</code>, <code>lambda2</code> and <code>lambda3</code> (asserted positive, summing to 1)
+   */
+  public static Double calculateLinearInterpolationProbability(String x0, String x1, String x2, Collection<String[]> sentences,
+                                                               Double lambda1, Double lambda2, Double lambda3) {
+    // compare with a tolerance: a sum of decimal fractions is often not exactly 1 in floating point
+    assert Math.abs(lambda1 + lambda2 + lambda3 - 1d) < 1e-9 : "lambdas sum should be equals to 1";
+    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should be greater than 0";
+
+    return lambda1 * calculateTrigramMLProbability(x0, x1, x2, sentences) +
+        lambda2 * calculateBigramMLProbability(x2, x1, sentences) +
+        lambda3 * calculateUnigramMLProbability(x2, sentences);
+  }
+
+  /** collects the distinct words of all the sentences */
+  private static Collection<String> flatSet(Collection<String[]> set) {
+    Collection<String> flatSet = new HashSet<String>();
+    for (String[] sentence : set) {
+      flatSet.addAll(Arrays.asList(sentence));
+    }
+    return flatSet;
+  }
+
+  public static Double calculateMissingBigramProbabilityMass(String x1, Double discount, Collection<String[]> set) {
+    double missingMass = 0d;
+    double countWord = count(x1, set);
+    for (String word : flatSet(set)) {
+      missingMass += (count(word, x1, set) - discount) / countWord;
+    }
+    return 1 - missingMass;
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java
new file mode 100644
index 0000000..9fccc61
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import java.util.Arrays;
+
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class for calculating gradient descent
+ */
+public class GradientDescentUtils {
+  private static final double THRESHOLD = 0.5;
+  private static final int MAX_ITERATIONS = 100000;
+
+  /**
+   * Calculates batch gradient descent on the given hypothesis, training set and learning rate alpha.
+   * The algorithm iteratively adjusts the hypothesis parameters until the cost stops changing, drops
+   * below a fixed threshold or the iteration limit is exceeded; the final cost is printed to stdout.
+   *
+   * @param hypothesis the hypothesis representing the model used
+   * @param trainingSet the training set used to fit the parameters; its first example sets their number
+   * @param alpha the learning rate alpha used to define how big the descent steps are
+   * @throws RuntimeException if the cost grows from one iteration to the next
+   */
+  public static void batchGradientDescent(Hypothesis hypothesis, TrainingSet trainingSet, double alpha) {
+    // set initial random weights
+    double[] parameters = initializeRandomWeights(trainingSet.iterator().next().getInputs().length);
+    hypothesis.updateParameters(parameters);
+
+    int iterations = 0;
+
+    double cost = Double.MAX_VALUE;
+    while (true) {
+      // calculate cost
+      double newCost = RegressionModelUtils.ordinaryLeastSquares(trainingSet, hypothesis);
+
+      if (newCost > cost) {
+        throw new RuntimeException("failed to converge at iteration " + iterations + " with cost going from " + cost + " to " + newCost);
+      } else if (cost == newCost || newCost < THRESHOLD || iterations > MAX_ITERATIONS) {
+        System.out.println(newCost + " with parameters " + Arrays.toString(parameters));
+        break;
+      }
+
+      // update registered cost
+      cost = newCost;
+
+      // calculate the updated parameters
+      parameters = RegressionModelUtils.batchLeastMeanSquareUpdate(parameters, alpha, trainingSet, hypothesis);
+
+      // update weights in the hypothesis
+      hypothesis.updateParameters(parameters);
+
+      iterations++;
+    }
+  }
+
+  private static double[] initializeRandomWeights(int size) {
+    double[] doubles = new double[size];
+    for (int i = 0; i < doubles.length; i++) {
+      doubles[i] = Math.random() * 0.1d;
+    }
+    return doubles;
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
new file mode 100644
index 0000000..25920f6
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+/**
+ * A {@link Hypothesis} is a parameterized model which maps an array of inputs to a single output
+ */
+public interface Hypothesis {
+
+ /**
+ * calculate the output for the given inputs according to the underlying model.
+ *
+ * @param inputs an array of inputs as <code>double</code>
+ * @return a <code>double</code> representing the output
+ */
+ public double calculateOutput(double[] inputs);
+
+ /**
+ * replace the internal model's parameters with the given ones.
+ *
+ * @param parameters an array of <code>double</code> containing the updated parameters
+ */
+ public void updateParameters(double[] parameters);
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
new file mode 100644
index 0000000..2c52876
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+/**
+ * {@link Hypothesis} whose output is the sum of each weight multiplied by the matching input
+ */
+public class LinearCombinationHypothesis implements Hypothesis {
+  private double[] weights;
+
+  @Override
+  public void updateParameters(double[] parameters) {
+    this.weights = parameters;
+  }
+
+  @Override
+  public double calculateOutput(double[] inputs) {
+    double weightedSum = 0d;
+    for (int k = 0; k < weights.length; k++) {
+      weightedSum += inputs[k] * weights[k];
+    }
+    return weightedSum;
+  }
+}
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java
new file mode 100644
index 0000000..d543f51
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class for calculating various regression models costs
+ */
+public class RegressionModelUtils {
+  /**
+   * calculate the ordinary least squares (OLS) cost, i.e. half the sum of the squared errors
+   * @param trainingSet the training set used
+   * @param hypothesis the hypothesis function representing the model
+   * @return the cost of the hypothesis for the given training set using OLS
+   */
+  public static double ordinaryLeastSquares(TrainingSet trainingSet, Hypothesis hypothesis) {
+    double output = 0;
+    for (TrainingExample trainingExample : trainingSet) {
+      double difference = hypothesis.calculateOutput(trainingExample.getInputs()) - trainingExample.getOutput();
+      output += Math.pow(difference, 2);
+    }
+    return output / 2d;
+  }
+
+  /**
+   * calculate the least mean square (LMS) update for a given weight vector
+   * (the hypothesis is evaluated once per training example, not once per weight)
+   * @param thetas the array of weights
+   * @param alpha the learning rate alpha
+   * @param trainingSet the training set to use for learning
+   * @param hypothesis the hypothesis representing the model
+   * @return the updated weights vector
+   */
+  public static double[] batchLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingSet trainingSet, Hypothesis hypothesis) {
+    double[] updatedWeights = new double[thetas.length];
+    for (TrainingExample trainingExample : trainingSet) {
+      double[] inputs = trainingExample.getInputs();
+      double error = trainingExample.getOutput() - hypothesis.calculateOutput(inputs);
+      for (int i = 0; i < updatedWeights.length; i++) {
+        updatedWeights[i] += error * inputs[i]; // first pass: accumulated error for the i-th weight
+      }
+    }
+    for (int i = 0; i < updatedWeights.length; i++) {
+      updatedWeights[i] = thetas[i] + alpha * updatedWeights[i];
+    }
+    return updatedWeights;
+  }
+
+  /**
+   * calculate least mean square update for a given training example for the j-th input
+   *
+   * @param thetas the array of weights
+   * @param alpha the learning rate alpha
+   * @param trainingExample the training example to use for learning
+   * @param hypothesis the hypothesis representing the model
+   * @param j the index of the j-th input
+   * @return the updated weight for the j-th element of the weights vector
+   */
+  public static double singleLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingExample trainingExample, Hypothesis hypothesis, int j) {
+    return thetas[j] + alpha * (trainingExample.getOutput() - hypothesis.calculateOutput(trainingExample.getInputs())) * trainingExample.getInputs()[j];
+  }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
new file mode 100644
index 0000000..1281ea4
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Ignore;
+
+/**
+ * Helper methods shared by the test cases
+ */
+@Ignore
+public class TestUtils {
+
+  /** adds <code>size</code> examples with <code>dimension</code> random inputs and a random output */
+  public static void fillTrainingSet(TrainingSet trainingSet, int size, int dimension) {
+    for (int remaining = size; remaining > 0; remaining--) {
+      double[] features = new double[dimension];
+      for (int k = 0; k < features.length; k++) {
+        features[k] = Math.random();
+      }
+      trainingSet.add(new TrainingExample(features, Math.random()));
+    }
+  }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
new file mode 100644
index 0000000..04399b7
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.anomalydetection;
+
+import org.junit.Test;
+
+import org.apache.opennlp.utils.TestUtils;
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@link org.apache.opennlp.utils.anomalydetection.AnomalyDetectionUtils}
+ */
+public class AnomalyDetectionUtilsTest {
+  @Test
+  public void testGaussianDistributionProbability() throws Exception {
+    TrainingSet samples = new TrainingSet();
+    TestUtils.fillTrainingSet(samples, 100, 5);
+    double[] fittedMus = AnomalyDetectionUtils.fitMus(samples);
+    assertNotNull(fittedMus);
+    double[] fittedSigmas = AnomalyDetectionUtils.fitSigmas(fittedMus, samples);
+    assertNotNull(fittedSigmas);
+    TrainingExample candidate = new TrainingExample(new double[]{1d, 2d, 1000d, 123d, 0.1d}, 0d);
+    double p = AnomalyDetectionUtils.getGaussianProbability(candidate, fittedMus, fittedSigmas);
+    assertTrue("negative probability " + p, p >= 0);
+    assertTrue("probability bigger than 1 " + p, p <= 1);
+  }
+
+  @Test
+  public void testGaussianDistributionProbability2() throws Exception {
+    TrainingSet samples = new TrainingSet();
+    TestUtils.fillTrainingSet(samples, 100, 5);
+    TrainingExample candidate = new TrainingExample(new double[]{1d, 2d, 1000d, 123d, 0.1d}, 0d);
+    double p = AnomalyDetectionUtils.getGaussianProbability(candidate, samples);
+    assertTrue("negative probability " + p, p >= 0);
+    assertTrue("probability bigger than 1 " + p, p <= 1);
+  }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
new file mode 100644
index 0000000..7901820
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.cfg.ContextFreeGrammar}, run on a small English toy grammar
+ */
+public class ContextFreeGrammarTest {
+
+ private ContextFreeGrammar contextFreeGrammar;
+ private Set<String> terminals;
+
+ @Before
+ public void setUp() throws Exception {
+
+ Set<String> nonTerminals = new HashSet<String>(); // PoS + Parse tags
+ nonTerminals.add("S");
+ nonTerminals.add("NP");
+ nonTerminals.add("VP");
+ nonTerminals.add("PP");
+ nonTerminals.add("DT");
+ nonTerminals.add("Vi");
+ nonTerminals.add("Vt");
+ nonTerminals.add("NN");
+ nonTerminals.add("IN");
+ nonTerminals.add("NNP");
+ nonTerminals.add("CJ");
+ nonTerminals.add("DJ");
+ nonTerminals.add("P");
+
+ String startSymbol = "S";
+ // the words; kept in a field because checkExpansion validates every expanded term against them
+ terminals = new HashSet<String>();
+ terminals.add("sleeps");
+ terminals.add("saw");
+ terminals.add("man");
+ terminals.add("woman");
+ terminals.add("telescope");
+ terminals.add("the");
+ terminals.add("with");
+ terminals.add("in");
+ terminals.add("tommaso");
+ terminals.add("simone");
+ terminals.add("joao");
+ terminals.add("tigro");
+ terminals.add("michele");
+ terminals.add("scarlett");
+ terminals.add("and");
+ terminals.add("but");
+ terminals.add("while");
+ terminals.add("of");
+ terminals.add("for");
+ // production rules; presumably the first argument is the symbol and the rest its expansion - TODO confirm
+ Set<Rule> rules = new HashSet<Rule>();
+ rules.add(new Rule("S", "NP", "VP"));
+ rules.add(new Rule("P", "S", "CJ", "S"));
+ rules.add(new Rule("P", "S", "DJ", "S"));
+ rules.add(new Rule("VP", "Vi"));
+ rules.add(new Rule("VP", "Vt", "NP"));
+ rules.add(new Rule("VP", "VP", "PP"));
+ rules.add(new Rule("NP", "DT", "NN"));
+ rules.add(new Rule("NP", "NP", "PP"));
+ rules.add(new Rule("NP", "NNP"));
+ rules.add(new Rule("PP", "IN", "NP"));
+ rules.add(new Rule("Vi", "sleeps"));
+ rules.add(new Rule("Vt", "saw"));
+ rules.add(new Rule("NN", "man"));
+ rules.add(new Rule("NN", "woman"));
+ rules.add(new Rule("NN", "telescope"));
+ rules.add(new Rule("DT", "the"));
+ rules.add(new Rule("IN", "with"));
+ rules.add(new Rule("IN", "in"));
+ rules.add(new Rule("IN", "for"));
+ rules.add(new Rule("IN", "of"));
+ rules.add(new Rule("NNP", "tommaso"));
+ rules.add(new Rule("NNP", "simone"));
+ rules.add(new Rule("NNP", "joao"));
+ rules.add(new Rule("NNP", "tigro"));
+ rules.add(new Rule("NNP", "michele"));
+ rules.add(new Rule("NNP", "scarlett"));
+ rules.add(new Rule("CJ", "and"));
+ rules.add(new Rule("DJ", "but"));
+ rules.add(new Rule("DJ", "while"));
+
+ contextFreeGrammar = new ContextFreeGrammar(nonTerminals, terminals, rules, startSymbol);
+ }
+
+ @Test
+ public void testSingleExpansion() throws Exception {
+ String[] expansion = contextFreeGrammar.leftMostDerivation("S");
+ checkExpansion(expansion);
+ }
+
+ // same check, starting from sequences of several symbols instead of the single start symbol
+ @Test
+ public void testMultipleSentencesExpansion() throws Exception {
+ String[] expansion = contextFreeGrammar.leftMostDerivation("S", "CJ", "S");
+ checkExpansion(expansion);
+
+ expansion = contextFreeGrammar.leftMostDerivation("S", "DJ", "S", "CJ", "P");
+ checkExpansion(expansion);
+ }
+
+ private void checkExpansion(String[] expansion) { // non-null, non-empty and made of terminals only
+ assertNotNull(expansion);
+ assertTrue(expansion.length > 0);
+ for (String t : expansion) {
+ assertTrue("term " + t + " is not a terminal symbol", terminals.contains(t));
+ }
+ System.err.println(Arrays.toString(expansion));
+ }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java
new file mode 100644
index 0000000..8016679
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.classification.SimpleNaiveBayesClassifier}, trained on a few Italian housing ads
+ */
+public class SimpleNaiveBayesClassifierTest {
+
+ @Test
+ public void ppsIntegrationTest() throws Exception {
+ Map<String, String> trainedCorpus = new HashMap<String, String>();
+ trainedCorpus.put("CAVOUR ad.te napoleone III affare: cat. C/2 ottimo" +
+ " stato ingresso angolo cottura bagno con doccia e camera. " +
+ "ottimo per investimento o piccolo studio per professionisti" +
+ " e 99.000 Ag.Imm.", "A");
+ trainedCorpus.put("TRASTEVERE via degli Orti di Trastevere in palazzo " +
+ "signorile (con s. portineria) appartamento mq 180 + cantina mq" +
+ " 6 con rifiniture di pregio marmi & armadi a muro + ampia " +
+ "balconata 50 mq assolutamente no agenzie E 930.000", "N");
+ trainedCorpus.put("CORSO VITTORIO Emanuele V. del banco di santo spirito" +
+ " 3° piano con ascensore appartamento di 142 mq commerciali " +
+ "composto da: ingresso disimpegno tre camere soggiorno cucina" +
+ " due bagni due cantine per un totale di 15 mq e. 900.000 Ag.Imm.", "A");
+ trainedCorpus.put("TRASTEVERE Ippolito Nievo quinto piano tripla " +
+ "esposizione ingresso salone doppio cucina abitabile tre " +
+ "camere servizio ripostiglio terrazzo e soffitta da ristrutturare " +
+ "e 650.000 Ag.Imm.", "A");
+ trainedCorpus.put("TRASTEVERE E.Rolli solo privati palazzo epoca doppia" +
+ " esposizione ingresso soppalcato soggiorno 2 camere cucinotto " +
+ "bagno 84 mq IV piano no ascensore 385.000 giardino condominio", "N");
+ trainedCorpus.put("CENTRO monti sforza elegante edificio con ampi spazi" +
+ " comuni ristrutturato ingresso soggiorno angolo cucina camera " +
+ "letto armadi a muro bagno vasca con finestra pavimenti cotto " +
+ "luminoso silenzioso doppio affaccio climatizzato e 405.000 ag. " +
+ "imm. cl en.g", "A");
+ trainedCorpus.put("SAN LORENZO app.to epoca privato vende salone due " +
+ "camere cucina abit. due bagni ripostigli vari II piano con" +
+ " ascensore triplo affaccio E 530.000 ", "N");
+ trainedCorpus.put("SAN LORENZO Via Porta Labicana appartamento mq 80 " +
+ "piano rialzato con ingresso 3 camere cucina bagno E 395.000 ", "N");
+ trainedCorpus.put("SAN LORENZO via degli Umbri I° p. 3 stanze cucina " +
+ "servizio terrazzino interno buono stato E. 390.000 tratt. " +
+ "assoloutamente no agenzie ", "N");
+
+ // the map goes from ad text to its label; presumably "A" = agency ad, "N" = non-agency - TODO confirm
+ SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier(trainedCorpus);
+ // unseen ad expected NOT to be classified as "A"
+ Boolean isAgency = classifier.calculateClass("CENTRO S.Maria Maggiore " +
+ "angolo Napoleone III in palazzo epoca con portiere 110 mq ristrutt." +
+ " IIp salone doppio cucina ab. 2 camere bagno ripost. balcone " +
+ "perimetrale E. 730.000 tratt. ").equals("A");
+ assertFalse(isAgency);
+ // unseen ad expected to be classified as "A"
+ isAgency = classifier.calculateClass("TRASTEVERE via del Mattonato in " +
+ "piccola palazzina d'epoca app.to finemente ristrutturato " +
+ "ingresso salone camera cucina tinello servizio balconcino " +
+ "aria condiz. e 540.000 Ag.Imm. ").equals("A");
+ assertTrue(isAgency);
+
+ }
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
new file mode 100644
index 0000000..75dbe16
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.ngram;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.ngram.NGramUtils}.
+ * <p>
+ * Each test builds a tiny tokenized corpus and compares the probability returned
+ * for it with a value that can be worked out by hand from the token counts.
+ */
+public class NGramUtilsTest {
+
+  /**
+   * Tolerance used when comparing probabilities, so that the assertions do not
+   * depend on the exact order of the floating point operations being tested.
+   */
+  private static final double DELTA = 1e-12;
+
+  /**
+   * Builds the corpus shared by the bigram and trigram tests: three sentences
+   * wrapped in start/end markers plus one empty sentence.
+   */
+  private static Collection<String[]> createMarkedCorpus() {
+    Collection<String[]> corpus = new LinkedList<String[]>();
+    corpus.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+    corpus.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+    corpus.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
+    corpus.add(new String[]{});
+    return corpus;
+  }
+
+  @Test
+  public void testBigram() {
+    Collection<String[]> set = createMarkedCorpus();
+    // "<s>" occurs 3 times in the corpus and is followed by "I" twice
+    Double d = NGramUtils.calculateBigramMLProbability("I", "<s>", set);
+    assertTrue(d > 0);
+    assertEquals(0.6666666666666666d, d, DELTA);
+    // "Sam" occurs twice in the corpus and is followed by "</s>" once
+    d = NGramUtils.calculateBigramMLProbability("</s>", "Sam", set);
+    assertEquals(0.5d, d, DELTA);
+    // "<s>" occurs 3 times in the corpus and is followed by "Sam" once
+    d = NGramUtils.calculateBigramMLProbability("Sam", "<s>", set);
+    assertEquals(0.3333333333333333d, d, DELTA);
+  }
+
+  @Test
+  public void testTrigram() {
+    Collection<String[]> set = createMarkedCorpus();
+    // "I am" occurs twice in the corpus and is followed by "Sam" once
+    Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", set);
+    assertTrue(d > 0);
+    assertEquals(0.5d, d, DELTA);
+    // "Sam I" occurs once in the corpus and is followed by "am" that one time
+    d = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", set);
+    assertEquals(1d, d, DELTA);
+  }
+
+  @Test
+  public void testLinearInterpolation() throws Exception {
+    Collection<String[]> set = new LinkedList<String[]>();
+    set.add(new String[]{"the", "green", "book", "STOP"});
+    set.add(new String[]{"my", "blue", "book", "STOP"});
+    set.add(new String[]{"his", "green", "house", "STOP"});
+    set.add(new String[]{"book", "STOP"});
+    Double lambda = 1d / 3d;
+    Double d = NGramUtils.calculateLinearInterpolationProbability("the", "green", "book", set, lambda, lambda, lambda);
+    assertNotNull(d);
+    assertTrue(d > 0);
+    // 4/7 = (1 + 1/2 + 3/14) / 3: "book" follows "the green" 1 of 1, follows "green" 1 of 2, and is 3 of 14 tokens
+    assertEquals("wrong result", 0.5714285714285714d, d, DELTA);
+  }
+
+  @Test
+  public void testLinearInterpolation2() throws Exception {
+    Collection<String[]> set = new LinkedList<String[]>();
+    set.add(new String[]{"D", "N", "V", "STOP"});
+    set.add(new String[]{"D", "N", "V", "STOP"});
+    Double lambda = 1d / 3d;
+    Double d = NGramUtils.calculateLinearInterpolationProbability("N", "V", "STOP", set, lambda, lambda, lambda);
+    assertNotNull(d);
+    assertTrue(d > 0);
+    // 3/4 = (1 + 1 + 2/8) / 3: "STOP" follows "N V" 2 of 2, follows "V" 2 of 2, and is 2 of 8 tokens
+    assertEquals("wrong result", 0.75d, d, DELTA);
+  }
+
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java
new file mode 100644
index 0000000..d4a8df0
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TestUtils;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Test;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.regression.GradientDescentUtils}
+ */
+public class GradientDescentUtilsTest {
+
+  /**
+   * Fills a training set through TestUtils and runs batch gradient descent on it.
+   * No result is asserted: the test passes when the call returns without throwing.
+   */
+  @Test
+  public void testConvergence() throws Exception {
+    TrainingSet samples = new TrainingSet();
+    TestUtils.fillTrainingSet(samples, 100, 5);
+    LinearCombinationHypothesis hypothesis = new LinearCombinationHypothesis();
+    GradientDescentUtils.batchGradientDescent(hypothesis, samples, 0.00002);
+  }
+
+}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java
new file mode 100644
index 0000000..d0cb9de
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.regression.RegressionModelUtils}
+ */
+public class RegressionModelUtilsTest {
+
+  /**
+   * Tolerance for comparing the updated parameters, so the check does not hinge
+   * on bit-exact floating point results.
+   */
+  private static final double DELTA = 1e-9;
+
+  /**
+   * Performs one batch least mean square update on a training set holding a single
+   * example and checks the returned parameters.
+   */
+  @Test
+  public void testLMS() throws Exception {
+    TrainingSet trainingSet = new TrainingSet();
+    trainingSet.add(new TrainingExample(new double[]{10, 10}, 1));
+    LinearCombinationHypothesis hypothesis = new LinearCombinationHypothesis();
+    hypothesis.updateParameters(new double[]{1, 1});
+    double[] updatedParameters = RegressionModelUtils.batchLeastMeanSquareUpdate(new double[]{1, 1}, 0.1, trainingSet, hypothesis);
+    assertNotNull(updatedParameters);
+    // NOTE(review): -18 is consistent with 1 - 0.1 * (20 - 1) * 10 per parameter; confirm against RegressionModelUtils
+    // assertArrayEquals checks the length and every element, and reports the offending index and value on failure
+    assertArrayEquals(new double[]{-18d, -18d}, updatedParameters, DELTA);
+  }
+}
diff --git a/nlp-utils/src/test/resources/presidents.txt b/nlp-utils/src/test/resources/presidents.txt
new file mode 100644
index 0000000..a765da9
--- /dev/null
+++ b/nlp-utils/src/test/resources/presidents.txt
@@ -0,0 +1,44 @@
+Washington 94
+Adams 48
+Jefferson 96
+Madison 96
+Monroe 96
+Adams 48
+Jackson 96
+Van Buren 48
+Harrison 1
+Tyler 47
+Polk 48
+Taylor 16
+Filmore 32
+Pierce 48
+Buchanan 48
+Lincoln 49
+Johnson 47
+Grant 96
+Hayes 48
+Garfield 7
+Arthur 41
+Cleveland 48
+Harrison 48
+Cleveland 48
+McKinley 54
+Roosevelt 90
+Taft 48
+Wilson 96
+Harding 29
+Coolidge 67
+Hoover 48
+Roosevelt 146
+Truman 92
+Eisenhower 96
+Kennedy 34
+Johnson 62
+Nixon 67
+Ford 29
+Carter 48
+Reagan 96
+Bush 48
+Clinton 96
+Bush 96
+Obama 48
\ No newline at end of file