OPENNLP-657 Initial pull of the nlp-utils provided by Tommaso Teofili. Thanks for contributing.

commit: af92a7d8be2af01705f3e9090da9cdd9bc5c568a [log] [tgz]
author: Jörn Kottmann <joern@apache.org> Mon Mar 10 12:28:25 2014 +0000
committer: Jörn Kottmann <joern@apache.org> Mon Mar 10 12:28:25 2014 +0000
tree: bc83fd32f6a40ec6cc3f5889ca20b278f0384097
parent: 874769963efd435a751c1085e4c60c29097d9167 [diff]
diff --git a/nlp-utils/README.md b/nlp-utils/README.md
new file mode 100644
index 0000000..9782787
--- /dev/null
+++ b/nlp-utils/README.md

@@ -0,0 +1,10 @@
+nlp-utils
+=========
+A set of utilities for the most common NLP tasks.
+Currently provides tools for:
+* ngram
+* naive bayes
+* gradient descent / regression
+* anomaly detection
+* language modeling
+* CFG

diff --git a/nlp-utils/pom.xml b/nlp-utils/pom.xml
new file mode 100644
index 0000000..d32a6ae
--- /dev/null
+++ b/nlp-utils/pom.xml

@@ -0,0 +1,54 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.opennlp</groupId>
+  <artifactId>nlp-utils</artifactId>
+  <version>0.1-SNAPSHOT</version>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.11</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.0.2</version>
+        <configuration>
+          <compilerVersion>1.6</compilerVersion>
+          <source>1.6</source>
+          <target>1.6</target>
+          <encoding>UTF-8</encoding>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+</project>

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java
new file mode 100644
index 0000000..6a08c7a
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingExample.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.opennlp.utils;
+
+/**
+ * A single training sample: a vector of input values together with the
+ * output value that corresponds to them.
+ * <p>
+ * The input array is kept and handed out by reference, no copy is made.
+ */
+public class TrainingExample {
+
+  private final double expectedOutput;
+  private final double[] featureValues;
+
+  /**
+   * @param inputs the input values of this example
+   * @param output the output associated with the given inputs
+   */
+  public TrainingExample(double[] inputs, double output) {
+    this.featureValues = inputs;
+    this.expectedOutput = output;
+  }
+
+  /**
+   * @return the output value given at construction time
+   */
+  public double getOutput() {
+    return this.expectedOutput;
+  }
+
+  /**
+   * @return the very same input array given at construction time
+   */
+  public double[] getInputs() {
+    return this.featureValues;
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java
new file mode 100644
index 0000000..174f8f3
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/TrainingSet.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+
+/**
+ * A group of {@link TrainingExample}s which can be iterated over.
+ * <p>
+ * The examples are held in a {@link HashSet}: elements which are equal
+ * (according to <code>equals</code>) are stored only once and the iteration
+ * order is not specified.
+ */
+public class TrainingSet implements Iterable<TrainingExample> {
+
+  private final Collection<TrainingExample> examples = new HashSet<TrainingExample>();
+
+  /**
+   * Adds an example to this set.
+   *
+   * @param trainingExample the example to add
+   */
+  public void add(TrainingExample trainingExample) {
+    this.examples.add(trainingExample);
+  }
+
+  /**
+   * @return the number of examples currently held
+   */
+  public int size() {
+    return this.examples.size();
+  }
+
+  /**
+   * @return an {@link Iterator} over the examples currently held
+   */
+  @Override
+  public Iterator<TrainingExample> iterator() {
+    return this.examples.iterator();
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java
new file mode 100644
index 0000000..0fdf326
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java

@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.anomalydetection;
+
+import java.math.BigDecimal;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class for anomaly detection, modeling each feature with an independent Gaussian
+ */
+public class AnomalyDetectionUtils {
+
+  /**
+   * calculate Mu distribution parameters (the mean of each feature) for a
+   * {@link org.apache.opennlp.utils.TrainingSet}'s set of features
+   *
+   * @param inputs the {@link org.apache.opennlp.utils.TrainingSet} to fit, expected to be non empty
+   * @return the <code>double[]</code> containing the Mu parameters for each feature
+   */
+  public static double[] fitMus(TrainingSet inputs) {
+    assert inputs != null && inputs.size() > 0 : "empty dataset";
+    // the number of features is read from the first example
+    int size = inputs.iterator().next().getInputs().length;
+    double[] result = new double[size];
+    for (int i = 0; i < size; i++) {
+      for (TrainingExample trainingExample : inputs) {
+        result[i] += trainingExample.getInputs()[i];
+      }
+      result[i] /= inputs.size();
+    }
+    return result;
+  }
+
+  /**
+   * calculates the variance (squared standard deviation) of each feature for the given
+   * {@link TrainingSet}, dividing by the number of examples
+   *
+   * @param mus    mean parameters, one per feature
+   * @param inputs the {@link TrainingSet} to fit, expected to be non empty
+   * @return the <code>double[]</code> containing the variance of each feature
+   */
+  public static double[] fitSigmas(double[] mus, TrainingSet inputs) {
+    assert inputs != null && inputs.size() > 0 : "empty dataset";
+    // the number of features is read from the first example
+    int size = inputs.iterator().next().getInputs().length;
+    double[] result = new double[size];
+    for (int i = 0; i < size; i++) {
+      for (TrainingExample trainingExample : inputs) {
+        result[i] += Math.pow(trainingExample.getInputs()[i] - mus[i], 2);
+      }
+      result[i] /= inputs.size();
+    }
+    return result;
+  }
+
+  /**
+   * calculate the probability (density) of a certain input
+   *
+   * @param x      the input
+   * @param mus    the means for the modeled features
+   * @param sigmas the variances (squared standard deviations) for the modeled features,
+   *               in the form returned by {@link #fitSigmas(double[], TrainingSet)}
+   * @return the probability of the given input
+   * @throws ArithmeticException if a variance is not strictly positive
+   */
+  public static double getGaussianProbability(TrainingExample x, double[] mus, double[] sigmas) {
+    return calculateGaussianProbability(x, mus, sigmas);
+  }
+
+  /**
+   * calculate the probability (density) of a certain input in a certain training set; means
+   * and variances are fitted on the given set first
+   *
+   * @param x      the input
+   * @param set    the training set
+   * @return the probability of the given input
+   * @throws ArithmeticException if a fitted variance is not strictly positive
+   * @throws Exception declared for backward compatibility, no checked exception is raised
+   */
+  public static double getGaussianProbability(TrainingExample x, TrainingSet set) throws Exception {
+    double[] mus = fitMus(set);
+    double[] sigmas = fitSigmas(mus, set);
+    return calculateGaussianProbability(x, mus, sigmas);
+  }
+
+  /**
+   * multiplies the univariate Gaussian densities of all the features of <code>x</code>
+   */
+  private static double calculateGaussianProbability(TrainingExample x, double[] mus,
+          double[] sigmas) {
+    assert mus.length == sigmas.length : "parameters not aligned";
+    double[] features = x.getInputs();
+    // the running product is kept in a BigDecimal so that the intermediate value is not
+    // limited by the range of a double
+    BigDecimal px = BigDecimal.ONE;
+    for (int i = 0; i < mus.length; i++) {
+      double variance = sigmas[i];
+      if (!(variance > 0d)) {
+        throw new ArithmeticException("variance of feature " + i + " must be positive but was " + variance);
+      }
+      double deviation = features[i] - mus[i];
+      // N(x; mu, var) = exp(-(x - mu)^2 / (2 * var)) / sqrt(2 * pi * var)
+      // each density is computed with plain doubles: BigDecimal.ONE.divide(d, roundingMode)
+      // keeps scale 0 and would round the normalisation term to an integer
+      double density = Math.exp(-(deviation * deviation) / (2d * variance)) / Math.sqrt(2d * Math.PI * variance);
+      px = px.multiply(BigDecimal.valueOf(density));
+    }
+    return px.doubleValue();
+  }
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java
new file mode 100644
index 0000000..716f7e9
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java

@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Set;
+
+/**
+ * Fluent builder which collects the parts of a {@link ContextFreeGrammar}
+ * and creates it on {@link #build()}.
+ */
+public class CFGBuilder {
+
+    private String start;
+    private Set<Rule> productionRules;
+    private Set<String> terminals;
+    private Set<String> nonTerminals;
+
+    /**
+     * @return a new builder with nothing set
+     */
+    public static CFGBuilder createCFG() {
+        return new CFGBuilder();
+    }
+
+    /**
+     * @param nonTerminalSymbols the non terminal symbols of the grammar
+     * @return this builder
+     */
+    public CFGBuilder withNonTerminals(Set<String> nonTerminalSymbols) {
+        nonTerminals = nonTerminalSymbols;
+        return this;
+    }
+
+    /**
+     * @param terminalSymbols the terminal symbols of the grammar
+     * @return this builder
+     */
+    public CFGBuilder withTerminals(Set<String> terminalSymbols) {
+        terminals = terminalSymbols;
+        return this;
+    }
+
+    /**
+     * @param startSymbol the start symbol of the grammar
+     * @return this builder
+     */
+    public CFGBuilder withStartSymbol(String startSymbol) {
+        start = startSymbol;
+        return this;
+    }
+
+    /**
+     * @param rules the rules of the grammar
+     * @return this builder
+     */
+    public CFGBuilder withRules(Set<Rule> rules) {
+        productionRules = rules;
+        return this;
+    }
+
+    /**
+     * @return a {@link ContextFreeGrammar} created from the values set so far
+     */
+    public ContextFreeGrammar build() {
+        return new ContextFreeGrammar(nonTerminals, terminals, productionRules, start);
+    }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java
new file mode 100644
index 0000000..c3b90f9
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * A context free grammar, made of terminal and non terminal symbols, a set of
+ * {@link Rule}s and a start symbol.
+ */
+public class ContextFreeGrammar {
+  private final Set<String> nonTerminalSymbols;
+  private final Set<String> terminalSymbols;
+  private final Set<Rule> rules;
+  private final String startSymbol;
+  // one generator per grammar instead of a new one for every rule lookup
+  private final Random random = new Random();
+
+  /**
+   * @param nonTerminalSymbols the non terminal symbols, expected to contain the start symbol
+   * @param terminalSymbols    the terminal symbols
+   * @param rules              the rules used to expand non terminal symbols
+   * @param startSymbol        the start symbol
+   */
+  public ContextFreeGrammar(Set<String> nonTerminalSymbols, Set<String> terminalSymbols, Set<Rule> rules, String startSymbol) {
+    assert nonTerminalSymbols.contains(startSymbol) : "start symbol doesn't belong to non-terminal symbols set";
+
+    this.nonTerminalSymbols = nonTerminalSymbols;
+    this.terminalSymbols = terminalSymbols;
+    this.rules = rules;
+    this.startSymbol = startSymbol;
+  }
+
+
+  /**
+   * Expands the given symbols, from left to right, until only terminal symbols are left.
+   * When more than one rule is available for a symbol one of them is picked randomly.
+   *
+   * @param words the symbols to expand, the first one is expected to be the start symbol
+   * @return the terminal symbols resulting from the expansion
+   * @throws RuntimeException if no rule exists for a symbol which has to be expanded
+   */
+  public String[] leftMostDerivation(String... words) {
+    assert words.length > 0 && startSymbol.equals(words[0]);
+
+    Collection<String> expansion = new ArrayList<String>(words.length);
+    for (String word : words) {
+      expansion.addAll(getTerminals(word));
+    }
+    return expansion.toArray(new String[expansion.size()]);
+
+  }
+
+  /**
+   * recursively expands a symbol into terminal symbols; a terminal expands to itself
+   */
+  private Collection<String> getTerminals(String word) {
+    Collection<String> c = new ArrayList<String>();
+    if (terminalSymbols.contains(word)) {
+      c.add(word);
+      return c;
+    }
+
+    assert nonTerminalSymbols.contains(word) : "word " + word + " is not contained in non terminals";
+    for (String e : getExpansionForSymbol(word)) {
+      c.addAll(getTerminals(e));
+    }
+    return c;
+  }
+
+  /**
+   * gets the right hand side of one of the rules available for the given symbol
+   */
+  private String[] getExpansionForSymbol(String currentSymbol) {
+    Rule r = getRuleForSymbol(currentSymbol);
+    return r.getExpansion();
+  }
+
+  /**
+   * picks, randomly, one of the rules whose entry is the given symbol
+   */
+  private Rule getRuleForSymbol(String word) {
+    ArrayList<Rule> possibleRules = new ArrayList<Rule>();
+    for (Rule r : rules) {
+      if (word.equals(r.getEntry())) {
+        possibleRules.add(r);
+      }
+    }
+    if (possibleRules.isEmpty()) {
+      throw new RuntimeException("could not find a rule for expanding symbol " + word);
+    }
+    return possibleRules.get(random.nextInt(possibleRules.size()));
+  }
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java
new file mode 100644
index 0000000..a017b82
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/Rule.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+/**
+ * A rule for context free grammars
+ */
+public class Rule implements Comparable<Rule> {
+  private final String entry;
+  private final String[] expansion;
+
+  public Rule(String entry, String... expansion) {
+    this.entry = entry;
+    this.expansion = expansion;
+  }
+
+  public String getEntry() {
+    return entry;
+  }
+
+  public String[] getExpansion() {
+    return expansion;
+  }
+
+  @Override
+  public int compareTo(Rule o) {
+    return entry.compareTo(o.getEntry());
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java
new file mode 100644
index 0000000..62d6bc7
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/NaiveBayesClassifier.java

@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+/**
+ * Contract of the Naive Bayes classifiers: an input of type <code>I</code> gets
+ * a class of type <code>O</code> assigned.
+ *
+ * @param <I> the type of the objects to classify
+ * @param <O> the type of the assigned classes
+ */
+public interface NaiveBayesClassifier<I, O> {
+
+  /**
+   * Finds the class of the given input.
+   *
+   * @param inputDocument the input to classify
+   * @return the class assigned to the input
+   * @throws Exception if the classification fails
+   */
+  O calculateClass(I inputDocument) throws Exception;
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java
new file mode 100644
index 0000000..b7bf33c
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java

@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+/**
+ * C = argmax( P(d|c) * P(c) )
+ * where P(d|c) is called: likelihood
+ * and P(c) is called: prior - we can count relative frequencies in a corpus
+ * and d is a vector of features
+ * <p/>
+ * we assume:
+ * 1. bag of words assumption: positions don't matter
+ * 2. conditional independence: the feature probabilities are independent given a class
+ * <p/>
+ * thus P(d|c) == P(x1,..,xn|c) == P(x1|c)*...P(xn|c)
+ * <p/>
+ * the product is evaluated as a sum of logarithms so that long documents do not
+ * underflow to zero; word probabilities use add 1 smoothing over the set of
+ * distinct words of the corpus
+ */
+public class SimpleNaiveBayesClassifier implements NaiveBayesClassifier<String, String> {
+
+  private final Map<String, String> docsWithClass; // this is the trained corpus holding the doc as a key and the class as a value
+  private final Collection<String> vocabulary = new java.util.LinkedHashSet<String>(); // the distinct words of the corpus
+  private final Map<String, Integer> docCountsByClass = new HashMap<String, Integer>(); // key is the class, value is the no of its docs
+  private final Map<String, Map<String, Integer>> wordCountsByClass = new HashMap<String, Map<String, Integer>>(); // class -> word -> occurrences
+  private final Map<String, Double> logPriors = new HashMap<String, Double>(); // key is the class, value is log P(c)
+  private final Map<String, Double> denominators = new HashMap<String, Double>(); // key is the class, value is the smoothing denominator
+
+  /**
+   * @param trainedCorpus the training data, each key is a document and each value is its class
+   */
+  public SimpleNaiveBayesClassifier(Map<String, String> trainedCorpus) {
+    this.docsWithClass = trainedCorpus;
+    indexCorpus();
+    preComputePriors();
+    preComputeDenominators();
+  }
+
+  /**
+   * tokenizes each document once, collecting the vocabulary and the per class doc and word counts
+   */
+  private void indexCorpus() {
+    for (Map.Entry<String, String> docAndClass : docsWithClass.entrySet()) {
+      String cl = docAndClass.getValue();
+      String[] tokens = tokenizeDoc(docAndClass.getKey());
+      vocabulary.addAll(Arrays.asList(tokens));
+
+      Integer docs = docCountsByClass.get(cl);
+      docCountsByClass.put(cl, docs == null ? 1 : docs + 1);
+
+      Map<String, Integer> wordCounts = wordCountsByClass.get(cl);
+      if (wordCounts == null) {
+        wordCounts = new HashMap<String, Integer>();
+        wordCountsByClass.put(cl, wordCounts);
+      }
+      for (String token : tokens) {
+        Integer seen = wordCounts.get(token);
+        wordCounts.put(token, seen == null ? 1 : seen + 1);
+      }
+    }
+  }
+
+  /**
+   * log P(c) = log( no of docs of class c / no of docs )
+   */
+  private void preComputePriors() {
+    double noOfDocs = docsWithClass.size();
+    for (Map.Entry<String, Integer> docCount : docCountsByClass.entrySet()) {
+      logPriors.put(docCount.getKey(), Math.log(docCount.getValue() / noOfDocs));
+    }
+  }
+
+  /**
+   * den : for the whole dictionary, count the no of times a word appears in documents of class c (+|V|);
+   * it does not depend on the document to classify, hence it is calculated only once per class
+   */
+  private void preComputeDenominators() {
+    for (String cl : wordCountsByClass.keySet()) {
+      double den = 0;
+      for (String w : vocabulary) {
+        den += count(w, cl) + 1; // +1 is added because of add 1 smoothing
+      }
+      denominators.put(cl, den);
+    }
+  }
+
+  private String[] tokenizeDoc(String doc) {
+    // TODO : this is by far not a tokenization, it should be changed
+    return doc.split(" ");
+  }
+
+  /**
+   * Finds the class which maximizes P(d|c) * P(c) for the given document.
+   *
+   * @param inputDocument the text to classify
+   * @return the most probable class, <code>null</code> if the classifier was trained on an empty corpus
+   */
+  @Override
+  public String calculateClass(String inputDocument) {
+    String[] words = tokenizeDoc(inputDocument);
+    double max = Double.NEGATIVE_INFINITY;
+    String foundClass = null;
+    for (String cl : wordCountsByClass.keySet()) {
+      // log( P(c) * P(d|c) ) = log P(c) + log P(d|c)
+      double clVal = logPriors.get(cl) + calculateLogLikelihood(words, cl);
+      if (clVal > max) {
+        max = clVal;
+        foundClass = cl;
+      }
+    }
+    return foundClass;
+  }
+
+  /**
+   * log P(d|c) = log P(w1|c) + ... + log P(wn|c)
+   */
+  private double calculateLogLikelihood(String[] words, String c) {
+    double den = denominators.get(c);
+    double result = 0d;
+    for (String word : words) {
+      // num : count the no of times the word appears in documents of class c (+1)
+      double num = count(word, c) + 1; // +1 is added because of add 1 smoothing
+
+      // P(w|c) = num/den
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+
+  /**
+   * the no of times a word appears in the documents of class c
+   */
+  private int count(String word, String c) {
+    Integer count = wordCountsByClass.get(c).get(word);
+    return count == null ? 0 : count;
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java
new file mode 100644
index 0000000..dab6f60
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java

@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+
+/**
+ * A Naive Bayes classifier over lists of words which can be trained incrementally,
+ * one example at a time, via {@link #addExample(String, java.util.List)}.
+ * <p/>
+ * priors and likelihoods are handled as logarithms, word probabilities use add 1 smoothing.
+ */
+public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier<List<String>, String> {
+
+
+  private final Collection<String> vocabulary = new TreeSet<String>(); // the distinct words in the corpus
+  private final Map<String, Integer> classCounts = new LinkedHashMap<String, Integer>(); // key is the class, value is the no of its docs
+  private double noDocs = 0d;
+  private final Map<String, Map<String, Integer>> nm = new HashMap<String, Map<String, Integer>>(); // class -> word -> occurrences
+  private final Map<String, Double> priors = new HashMap<String, Double>(); // key is the class, value is log P(c)
+  private final Map<String, Double> dens = new HashMap<String, Double>(); // key is the class, value is the smoothing denominator
+
+  /**
+   * Adds a training example and refreshes the values cached for classification.
+   *
+   * @param klass the class of the example
+   * @param words the words of the example
+   */
+  public void addExample(String klass, List<String> words) {
+    vocabulary.addAll(words);
+
+    Integer integer = classCounts.get(klass);
+    classCounts.put(klass, integer != null ? integer + 1 : 1);
+
+    noDocs++;
+
+    // registered before looping so that a class exists also when the example has no words
+    Map<String, Integer> wordCountsForClass = nm.get(klass);
+    if (wordCountsForClass == null) {
+      wordCountsForClass = new HashMap<String, Integer>();
+      nm.put(klass, wordCountsForClass);
+    }
+    for (String w : words) {
+      Integer count = wordCountsForClass.get(w);
+      wordCountsForClass.put(w, count == null ? 1 : count + 1);
+    }
+
+    // a new doc changes the prior of each class and possibly the size of the
+    // vocabulary, therefore priors and denominators are refreshed for all the classes
+    for (String c : classCounts.keySet()) {
+      priors.put(c, calculatePrior(c));
+      calculateDen(c);
+    }
+  }
+
+  private void calculateDen(String c) {
+    // den : for the whole dictionary, count the no of times a word appears in documents of class c (+|V|)
+    // each word counted for a class is also part of the vocabulary, so summing the counts of the class is enough
+    double den = 0d;
+    for (Integer count : nm.get(c).values()) {
+      den += count;
+    }
+    den += vocabulary.size() + 1; // +|V| is added because of add 1 smoothing, +1 for unknown words
+    dens.put(c, den);
+  }
+
+  /**
+   * Finds the class which maximizes log P(c) + log P(d|c) for the given words.
+   *
+   * @param words the words of the document to classify
+   * @return the most probable class, <code>null</code> if no example has been added yet
+   * @throws Exception declared by the implemented interface, no checked exception is raised here
+   */
+  @Override
+  public String calculateClass(List<String> words) throws Exception {
+    double max = Double.NEGATIVE_INFINITY;
+    String foundClass = null;
+    for (String cl : nm.keySet()) {
+      double prior = priors.get(cl);
+      double likeliHood = calculateLikelihood(words, cl);
+      double clVal = prior + likeliHood;
+      if (clVal > max) {
+        max = clVal;
+        foundClass = cl;
+      }
+    }
+    return foundClass;
+  }
+
+  /**
+   * log P(d|c) = log P(w1|c) + ... + log P(wn|c)
+   */
+  private double calculateLikelihood(List<String> words, String c) {
+    Map<String, Integer> wordFreqs = nm.get(c);
+    double den = dens.get(c);
+    double result = 0d;
+    for (String word : words) {
+      // num : count the no of times the word appears in documents of class c (+1)
+      Integer freq = wordFreqs.get(word);
+      double num = (freq != null ? freq : 0) + 1d; // +1 is added because of add 1 smoothing
+
+      // log P(w|c) = log(num/den)
+      result += Math.log(num / den);
+    }
+    return result;
+  }
+
+  /**
+   * log P(c) = log( no of docs of class c / no of docs )
+   */
+  private Double calculatePrior(String currentClass) {
+    return Math.log(docCount(currentClass) / noDocs);
+  }
+
+  private double docCount(String countedClass) {
+    Integer integer = classCounts.get(countedClass);
+    return integer != null ? (double) integer : 0d;
+  }
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java
new file mode 100644
index 0000000..26b6ef8
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collection;
+
+/**
+ * A language model calculates the probability <i>p</i> (between 0 and 1) of a
+ * certain sample of type <code>T</code>, given a vocabulary.
+ *
+ * @param <T> the type of the samples and of the items of the vocabulary
+ */
+public interface LanguageModel<T> {
+
+  /**
+   * Calculates the probability of a sample with respect to a vocabulary.
+   *
+   * @param vocabulary a {@link Collection} of objects of type <code>T</code>
+   * @param sample     the sample whose probability has to be evaluated
+   * @return a <code>double</code> between <code>0</code> and <code>1</code>
+   */
+  double calculateProbability(Collection<T> vocabulary, T sample);
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java
new file mode 100644
index 0000000..1d44e23
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collection;
+import java.util.Collections;
+
+/**
+ * Simple sentence language model which just counts the no. of occurrences of
+ * a sentence over the no. of sentences in the vocabulary.
+ * <p>
+ * Sentences are compared element by element, so two distinct arrays holding
+ * equal tokens in the same order are considered the same sentence.
+ */
+public class NaiveSentenceLanguageModel<T> implements LanguageModel<T[]> {
+
+  /**
+   * Calculates the relative frequency of the given sentence in the vocabulary.
+   *
+   * @param vocabulary the known sentences
+   * @param sentence   the sentence to evaluate the probability for
+   * @return the no. of vocabulary entries equal to <code>sentence</code> divided by the
+   *         vocabulary size, or <code>0</code> if the vocabulary is empty
+   */
+  @Override
+  public double calculateProbability(Collection<T[]> vocabulary, T[] sentence) {
+    if (vocabulary.isEmpty()) {
+      // nothing has been observed, avoid dividing by zero
+      return 0d;
+    }
+    int occurrences = 0;
+    for (T[] candidate : vocabulary) {
+      // arrays do not override equals(), hence compare their contents explicitly
+      if (java.util.Arrays.equals(candidate, sentence)) {
+        occurrences++;
+      }
+    }
+    // force floating point division, int / int would truncate the result to 0 or 1
+    return (double) occurrences / vocabulary.size();
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java
new file mode 100644
index 0000000..1c02ec8
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+/**
+ * Abstract implementation of a noisy channel: the correction of a misspelled word
+ * is the dictionary word maximizing <code>likelihood(misspelled | word) * prior(word)</code>.
+ */
+public abstract class NoisyChannel {
+
+  private String[] dictionary;
+
+  /**
+   * Sets the dictionary of candidate words used by {@link #findCorrection(String)}.
+   * The array is stored as is (no copy is made).
+   *
+   * @param dictionary the candidate correct words
+   */
+  public void initializeDictionary(String[] dictionary) {
+    this.dictionary = dictionary;
+  }
+
+  /**
+   * Finds the dictionary word with the highest (strictly positive) score for the given word.
+   *
+   * @param mispelledWord the word to correct
+   * @return the best scoring dictionary word, or <code>null</code> if the dictionary has not
+   *         been initialized, is empty, or no word scores more than <code>0</code>
+   */
+  public String findCorrection(String mispelledWord) {
+    if (dictionary == null) {
+      // no dictionary means no candidates, same outcome as an empty dictionary
+      return null;
+    }
+    double bestScore = 0d;
+    String bestWord = null;
+    for (String word : dictionary) {
+      double score = calculateLikelihood(mispelledWord, word) * calculatePrior(word);
+      // strict comparison: on ties the first dictionary entry wins
+      if (score > bestScore) {
+        bestScore = score;
+        bestWord = word;
+      }
+    }
+    return bestWord;
+  }
+
+  /**
+   * Calculates the prior probability of a dictionary word.
+   *
+   * @param word a dictionary word
+   * @return the prior of the given word
+   */
+  public abstract Double calculatePrior(String word);
+
+  /**
+   * Calculates the likelihood of observing the misspelled word given the intended word.
+   *
+   * @param mispelledWord the observed word
+   * @param word          the candidate intended word
+   * @return the likelihood of <code>mispelledWord</code> given <code>word</code>
+   */
+  public abstract Double calculateLikelihood(String mispelledWord, String word);
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java
new file mode 100644
index 0000000..3dc6152
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java

@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.languagemodel;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.opennlp.utils.ngram.NGramUtils;
+
+/**
+ * A simple trigram language model for sentences made of <code>String</code> arrays.
+ * <p>
+ * The probability of a sentence is the product, over all of its words, of the
+ * maximum likelihood probability of each word given the (up to) two words preceding it.
+ */
+public class TrigramSentenceLanguageModel implements LanguageModel<String[]> {
+
+  /**
+   * Calculates the probability of a sentence with respect to the given sentences.
+   * The first word is scored with its unigram probability, the second one with the
+   * bigram probability given the first word, every other word with the trigram
+   * probability given the two preceding words.
+   *
+   * @param vocabulary the sentences the n-gram counts are collected from
+   * @param sample     the sentence to evaluate the probability for
+   * @return the product of the n-gram probabilities of all the words of the sample,
+   *         <code>1</code> for an empty sample
+   */
+  @Override
+  public double calculateProbability(Collection<String[]> vocabulary, String[] sample) {
+    double probability = 1d;
+    // walk the sentence in order so that every word (including the last two) is scored
+    // exactly once and the multiplication order is deterministic
+    for (int i = 0; i < sample.length; i++) {
+      String word = sample[i];
+      if (i >= 2) {
+        // default: two words of context available
+        probability *= NGramUtils.calculateTrigramMLProbability(sample[i - 2], sample[i - 1], word, vocabulary);
+      } else if (i == 1) {
+        // bigram: only the first word of the sentence precedes this one
+        probability *= NGramUtils.calculateBigramMLProbability(word, sample[0], vocabulary);
+      } else {
+        // unigram: first word of the sentence, no context
+        probability *= NGramUtils.calculateUnigramMLProbability(word, vocabulary);
+      }
+    }
+    return probability;
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java
new file mode 100644
index 0000000..2f3d1ac
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java

@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.ngram;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * Utility class for calculating probabilities of tri/bi/uni-grams over a
+ * collection of tokenized sentences.
+ * <p>
+ * Each sentence is an array of tokens; n-grams are only counted within a sentence,
+ * never across two consecutive sentences. Maximum likelihood estimates whose
+ * denominator is zero (unseen context, empty collection) are reported as <code>0</code>.
+ */
+public class NGramUtils {
+
+  /**
+   * tolerance used when checking that the interpolation weights sum up to one,
+   * as sums like <code>0.7 + 0.2 + 0.1</code> are not exactly <code>1</code> in floating point
+   */
+  private static final double LAMBDA_SUM_TOLERANCE = 1e-9;
+
+  /**
+   * counts all the occurrences of the trigram <code>x0 x1 x2</code>
+   */
+  private static double count(String x0, String x1, String x2, Collection<String[]> sentences) {
+    double count = 0d;
+    for (String[] sentence : sentences) {
+      // check every position, the trigram may start at any occurrence of x0
+      for (int i = 0; i + 2 < sentence.length; i++) {
+        if (x0.equals(sentence[i]) && x1.equals(sentence[i + 1]) && x2.equals(sentence[i + 2])) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+
+  /**
+   * counts all the occurrences of the bigram <code>precedingWord sequentWord</code>
+   */
+  private static double count(String sequentWord, String precedingWord, Collection<String[]> set) {
+    double result = 0d;
+    for (String[] sentence : set) {
+      // index based scan: no state is carried over to the next sentence and
+      // bigrams made of the same word twice are counted too
+      for (int i = 0; i + 1 < sentence.length; i++) {
+        if (precedingWord.equals(sentence[i]) && sequentWord.equals(sentence[i + 1])) {
+          result++;
+        }
+      }
+    }
+    return result;
+  }
+
+  /**
+   * counts all the occurrences of a word
+   */
+  private static double count(String word, Collection<String[]> set) {
+    double result = 0d;
+    for (String[] sentence : set) {
+      for (String w : sentence) {
+        if (word.equals(w)) {
+          result++;
+        }
+      }
+    }
+    return result;
+  }
+
+  /**
+   * divides two counts, giving <code>0</code> (instead of <code>NaN</code>) when nothing was observed
+   */
+  private static double divide(double numerator, double denominator) {
+    return denominator == 0d ? 0d : numerator / denominator;
+  }
+
+  /**
+   * Calculates the add-k smoothed probability of a word given the preceding one.
+   * <p>
+   * NOTE(review): <code>k</code> is scaled by the number of sentences in the denominator;
+   * classic add-k smoothing scales it by the number of distinct words - confirm the intent.
+   *
+   * @param sequentWord   the word whose probability is calculated
+   * @param precedingWord the word preceding it
+   * @param set           the sentences to collect the counts from
+   * @param k             the smoothing constant
+   * @return <code>(count(precedingWord sequentWord) + k) / (count(precedingWord) + k * set.size())</code>
+   */
+  public static Double calculateLaplaceSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+    return (count(sequentWord, precedingWord, set) + k) / (count(precedingWord, set) + k * set.size());
+  }
+
+  /**
+   * Calculates the maximum likelihood probability of a word given the preceding one.
+   *
+   * @param sequentWord   the word whose probability is calculated
+   * @param precedingWord the word preceding it
+   * @param set           the sentences to collect the counts from
+   * @return <code>count(precedingWord sequentWord) / count(precedingWord)</code>,
+   *         <code>0</code> if <code>precedingWord</code> never occurs
+   */
+  public static Double calculateBigramMLProbability(String sequentWord, String precedingWord, Collection<String[]> set) {
+    return divide(count(sequentWord, precedingWord, set), count(precedingWord, set));
+  }
+
+  /**
+   * Calculates the maximum likelihood probability of <code>x2</code> given the
+   * two preceding words <code>x0 x1</code>.
+   *
+   * @param x0        the first word of the trigram
+   * @param x1        the second word of the trigram
+   * @param x2        the third word of the trigram, the one whose probability is calculated
+   * @param sentences the sentences to collect the counts from
+   * @return <code>count(x0 x1 x2) / count(x0 x1)</code>, <code>0</code> if <code>x0 x1</code> never occurs
+   */
+  public static Double calculateTrigramMLProbability(String x0, String x1, String x2, Collection<String[]> sentences) {
+    return divide(count(x0, x1, x2, sentences), count(x1, x0, sentences));
+  }
+
+  /**
+   * Calculates the probability of a word given the preceding one, smoothed with
+   * <code>k</code> times the unigram probability of the word.
+   *
+   * @param sequentWord   the word whose probability is calculated
+   * @param precedingWord the word preceding it
+   * @param set           the sentences to collect the counts from
+   * @param k             the smoothing constant
+   * @return <code>(count(precedingWord sequentWord) + k * p(sequentWord)) / (count(precedingWord) + k * set.size())</code>
+   */
+  public static Double calculateBigramPriorSmoothingProbability(String sequentWord, String precedingWord, Collection<String[]> set, Double k) {
+    return (count(sequentWord, precedingWord, set) + k * calculateUnigramMLProbability(sequentWord, set)) / (count(precedingWord, set) + k * set.size());
+  }
+
+  /**
+   * Calculates the maximum likelihood probability of a word.
+   *
+   * @param word the word whose probability is calculated
+   * @param set  the sentences to collect the counts from
+   * @return the no. of occurrences of the word divided by the total no. of tokens,
+   *         <code>0</code> if there are no tokens at all
+   */
+  public static Double calculateUnigramMLProbability(String word, Collection<String[]> set) {
+    double vocSize = 0d;
+    for (String[] s : set) {
+      vocSize += s.length;
+    }
+    return divide(count(word, set), vocSize);
+  }
+
+  /**
+   * Calculates the probability of <code>x2</code> given <code>x0 x1</code> as the weighted sum
+   * of the trigram, bigram and unigram maximum likelihood estimates.
+   *
+   * @param x0        the first word of the trigram
+   * @param x1        the second word of the trigram
+   * @param x2        the third word of the trigram, the one whose probability is calculated
+   * @param sentences the sentences to collect the counts from
+   * @param lambda1   the weight of the trigram estimate
+   * @param lambda2   the weight of the bigram estimate
+   * @param lambda3   the weight of the unigram estimate
+   * @return the interpolated probability
+   */
+  public static Double calculateLinearInterpolationProbability(String x0, String x1, String x2, Collection<String[]> sentences,
+                                                               Double lambda1, Double lambda2, Double lambda3) {
+    // compare with a tolerance, an exact == 1 check fails for many valid weight triples
+    assert Math.abs(lambda1 + lambda2 + lambda3 - 1d) < LAMBDA_SUM_TOLERANCE : "lambdas sum should be equals to 1";
+    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should be greater than 0";
+
+    return lambda1 * calculateTrigramMLProbability(x0, x1, x2, sentences) +
+            lambda2 * calculateBigramMLProbability(x2, x1, sentences) +
+            lambda3 * calculateUnigramMLProbability(x2, sentences);
+
+  }
+
+  /**
+   * collects the distinct words of all the sentences
+   */
+  private static Collection<String> flatSet(Collection<String[]> set) {
+    Collection<String> flatSet = new HashSet<String>();
+    for (String[] sentence : set) {
+      flatSet.addAll(Arrays.asList(sentence));
+    }
+    return flatSet;
+  }
+
+  /**
+   * Calculates the probability mass left over once <code>discount</code> has been subtracted
+   * from the count of every bigram starting with <code>x1</code> that was actually observed.
+   *
+   * @param x1       the first word of the bigrams
+   * @param discount the amount subtracted from each observed bigram count
+   * @param set      the sentences to collect the counts from
+   * @return <code>1 - sum((count(x1 w) - discount) / count(x1))</code> over the words
+   *         <code>w</code> seen after <code>x1</code>; <code>1</code> if <code>x1</code> never occurs
+   */
+  public static Double calculateMissingBigramProbabilityMass(String x1, Double discount, Collection<String[]> set) {
+    double countWord = count(x1, set);
+    if (countWord == 0d) {
+      // nothing observed for x1, the whole mass is missing
+      return 1d;
+    }
+    double discountedMass = 0d;
+    for (String word : flatSet(set)) {
+      double bigramCount = count(word, x1, set);
+      // only seen bigrams are discounted, unseen ones must not contribute negative mass
+      if (bigramCount > 0d) {
+        discountedMass += (bigramCount - discount) / countWord;
+      }
+    }
+    return 1 - discountedMass;
+  }
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java
new file mode 100644
index 0000000..9fccc61
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import java.util.Arrays;
+
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class running gradient descent to fit the parameters of a {@link Hypothesis}
+ */
+public class GradientDescentUtils {
+
+  private static final double THRESHOLD = 0.5;
+  private static final int MAX_ITERATIONS = 100000;
+
+  /**
+   * Runs batch gradient descent for the given hypothesis over the given training set.
+   * Starting from small random weights, the parameters of the hypothesis are updated
+   * iteratively until the cost stops changing, goes below an internal threshold or
+   * the maximum number of iterations is exceeded; the last registered cost and the
+   * parameters are then printed to the standard output.
+   *
+   * @param hypothesis  the hypothesis (model) whose parameters get fitted
+   * @param trainingSet the examples used to fit the parameters
+   * @param alpha       the learning rate, defining the size of each descent step
+   * @throws RuntimeException if the cost increases from one iteration to the next one
+   */
+  public static void batchGradientDescent(Hypothesis hypothesis, TrainingSet trainingSet, double alpha) {
+    // one weight per input of the (first) training example, randomly initialized
+    int dimension = trainingSet.iterator().next().getInputs().length;
+    double[] weights = initializeRandomWeights(dimension);
+    hypothesis.updateParameters(weights);
+
+    double previousCost = Double.MAX_VALUE;
+    for (int step = 0; ; step++) {
+      // cost of the hypothesis with the current weights
+      double currentCost = RegressionModelUtils.ordinaryLeastSquares(trainingSet, hypothesis);
+
+      if (currentCost > previousCost) {
+        throw new RuntimeException("failed to converge at iteration " + step + " with cost going from " + previousCost + " to " + currentCost);
+      }
+
+      boolean stalled = previousCost == currentCost;
+      boolean lowEnough = currentCost < THRESHOLD;
+      boolean exhausted = step > MAX_ITERATIONS;
+      if (stalled || lowEnough || exhausted) {
+        System.out.println(previousCost + " with parameters " + Arrays.toString(weights));
+        return;
+      }
+
+      // register the cost, then move the weights one step and push them to the hypothesis
+      previousCost = currentCost;
+      weights = RegressionModelUtils.batchLeastMeanSquareUpdate(weights, alpha, trainingSet, hypothesis);
+      hypothesis.updateParameters(weights);
+    }
+  }
+
+  /**
+   * creates an array of the given size filled with random values
+   * obtained multiplying <code>Math.random()</code> by <code>0.1</code>
+   */
+  private static double[] initializeRandomWeights(int size) {
+    double[] weights = new double[size];
+    int index = 0;
+    while (index < size) {
+      weights[index] = Math.random() * 0.1d;
+      index++;
+    }
+    return weights;
+  }
+
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java
new file mode 100644
index 0000000..25920f6
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java

@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+/**
+ * An {@link Hypothesis} is a parameterized model mapping a series of inputs to one output
+ */
+public interface Hypothesis {
+
+  /**
+   * Computes the output the underlying model gives for the given inputs.
+   *
+   * @param inputs the inputs, as an array of <code>double</code>
+   * @return the output of the model, as a <code>double</code>
+   */
+  double calculateOutput(double[] inputs);
+
+  /**
+   * Replaces the parameters of the underlying model.
+   *
+   * @param parameters the new parameters, as an array of <code>double</code>
+   */
+  void updateParameters(double[] parameters);
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java
new file mode 100644
index 0000000..2c52876
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java

@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+/**
+ * Simplest {@link Hypothesis}: the output is the sum of the inputs, each one
+ * multiplied by the weight at the same position
+ */
+public class LinearCombinationHypothesis implements Hypothesis {
+  private double[] weights;
+
+  /**
+   * Sets the weights; the given array is kept as is (no copy is made).
+   */
+  @Override
+  public void updateParameters(double[] parameters) {
+    this.weights = parameters;
+  }
+
+  /**
+   * Sums <code>weight * input</code> over all the weights, in order.
+   */
+  @Override
+  public double calculateOutput(double[] inputs) {
+    double sum = 0d;
+    int position = 0;
+    for (double weight : weights) {
+      sum += weight * inputs[position];
+      position++;
+    }
+    return sum;
+  }
+}

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java
new file mode 100644
index 0000000..d543f51
--- /dev/null
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java

@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+/**
+ * Utility class with the cost function and the weight update rules used for regression models
+ */
+public class RegressionModelUtils {
+
+  /**
+   * Computes the ordinary least squares (OLS) cost of an hypothesis over a training set,
+   * that is half of the sum of the squared differences between the output of the
+   * hypothesis and the expected output of each example.
+   *
+   * @param trainingSet the examples the cost is measured on
+   * @param hypothesis  the hypothesis function representing the model
+   * @return the OLS cost of the hypothesis for the given training set
+   */
+  public static double ordinaryLeastSquares(TrainingSet trainingSet, Hypothesis hypothesis) {
+    double squaredErrors = 0;
+    for (TrainingExample example : trainingSet) {
+      double predicted = hypothesis.calculateOutput(example.getInputs());
+      double difference = predicted - example.getOutput();
+      squaredErrors += Math.pow(difference, 2);
+    }
+    return squaredErrors / 2d;
+  }
+
+  /**
+   * Computes the batch least mean square (LMS) update of a whole weight vector:
+   * each weight is moved by <code>alpha</code> times the sum, over all the examples,
+   * of the prediction error multiplied by the corresponding input.
+   *
+   * @param thetas      the current weights
+   * @param alpha       the learning rate
+   * @param trainingSet the examples used for learning
+   * @param hypothesis  the hypothesis representing the model
+   * @return a new array holding the updated weights
+   */
+  public static double[] batchLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingSet trainingSet, Hypothesis hypothesis) {
+    int dimension = thetas.length;
+    double[] result = new double[dimension];
+    for (int feature = 0; feature < dimension; feature++) {
+      double weightedErrors = 0;
+      for (TrainingExample example : trainingSet) {
+        double expected = example.getOutput();
+        double predicted = hypothesis.calculateOutput(example.getInputs());
+        double input = example.getInputs()[feature];
+        weightedErrors += (expected - predicted) * input;
+      }
+      result[feature] = thetas[feature] + alpha * weightedErrors;
+    }
+    return result;
+  }
+
+  /**
+   * Computes the least mean square (LMS) update of the j-th weight for a single training example.
+   *
+   * @param thetas          the current weights
+   * @param alpha           the learning rate
+   * @param trainingExample the example used for learning
+   * @param hypothesis      the hypothesis representing the model
+   * @param j               the index of the weight (and of the input) to consider
+   * @return the updated value of the j-th weight
+   */
+  public static double singleLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingExample trainingExample, Hypothesis hypothesis, int j) {
+    double currentWeight = thetas[j];
+    double expected = trainingExample.getOutput();
+    double predicted = hypothesis.calculateOutput(trainingExample.getInputs());
+    double input = trainingExample.getInputs()[j];
+    return currentWeight + alpha * (expected - predicted) * input;
+  }
+
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java
new file mode 100644
index 0000000..1281ea4
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Ignore;
+
+/**
+ * Helper methods shared by the tests
+ */
+@Ignore
+public class TestUtils {
+
+  /**
+   * Adds <code>size</code> random examples to the given training set, each one having
+   * <code>dimension</code> inputs and one output, all obtained via <code>Math.random()</code>.
+   *
+   * @param trainingSet the training set to fill
+   * @param size        the no. of examples to add
+   * @param dimension   the no. of inputs of each example
+   */
+  public static void fillTrainingSet(TrainingSet trainingSet, int size, int dimension) {
+    for (int added = 0; added < size; added++) {
+      // inputs are generated before the output
+      double[] features = randomInputs(dimension);
+      double target = Math.random();
+      trainingSet.add(new TrainingExample(features, target));
+    }
+  }
+
+  /**
+   * creates an array of the given length filled with random values
+   */
+  private static double[] randomInputs(int dimension) {
+    double[] values = new double[dimension];
+    int position = 0;
+    while (position < dimension) {
+      values[position] = Math.random();
+      position++;
+    }
+    return values;
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java
new file mode 100644
index 0000000..04399b7
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.anomalydetection;
+
+import org.junit.Test;
+
+import org.apache.opennlp.utils.TestUtils;
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.anomalydetection.AnomalyDetectionUtils}
+ */
+public class AnomalyDetectionUtilsTest {
+
+  /**
+   * fits mus and sigmas explicitly, then checks the probability of a new example
+   */
+  @Test
+  public void testGaussianDistributionProbability() throws Exception {
+    TrainingSet examples = randomTrainingSet();
+    double[] mus = AnomalyDetectionUtils.fitMus(examples);
+    assertNotNull(mus);
+    double[] sigmas = AnomalyDetectionUtils.fitSigmas(mus, examples);
+    assertNotNull(sigmas);
+    double probability = AnomalyDetectionUtils.getGaussianProbability(sampleInput(), mus, sigmas);
+    assertIsProbability(probability);
+  }
+
+  /**
+   * checks the probability of a new example calculated directly from the training set
+   */
+  @Test
+  public void testGaussianDistributionProbability2() throws Exception {
+    TrainingSet examples = randomTrainingSet();
+    double probability = AnomalyDetectionUtils.getGaussianProbability(sampleInput(), examples);
+    assertIsProbability(probability);
+  }
+
+  /**
+   * creates a training set of 100 random examples with 5 inputs each
+   */
+  private static TrainingSet randomTrainingSet() {
+    TrainingSet examples = new TrainingSet();
+    TestUtils.fillTrainingSet(examples, 100, 5);
+    return examples;
+  }
+
+  /**
+   * creates the fixed example whose probability gets evaluated
+   */
+  private static TrainingExample sampleInput() {
+    return new TrainingExample(new double[]{1d, 2d, 1000d, 123d, 0.1d}, 0d);
+  }
+
+  /**
+   * checks that the given value lies between 0 and 1 (both included)
+   */
+  private static void assertIsProbability(double probability) {
+    assertTrue("negative probability " + probability, 0 <= probability);
+    assertTrue("probability bigger than 1 " + probability, 1 >= probability);
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java
new file mode 100644
index 0000000..7901820
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ContextFreeGrammarTest.java

@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.cfg.ContextFreeGrammar}
+ */
+public class ContextFreeGrammarTest {
+
+  private ContextFreeGrammar contextFreeGrammar;
+  private Set<String> terminals;
+
+  /**
+   * builds a small grammar: non terminal symbols, terminal words and the rules connecting them
+   */
+  @Before
+  public void setUp() throws Exception {
+
+    // PoS + Parse tags
+    Set<String> nonTerminals = setOf(
+            "S", "NP", "VP", "PP", "DT", "Vi", "Vt", "NN", "IN", "NNP", "CJ", "DJ", "P");
+
+    String startSymbol = "S";
+
+    terminals = setOf(
+            "sleeps", "saw", "man", "woman", "telescope", "the", "with", "in",
+            "tommaso", "simone", "joao", "tigro", "michele", "scarlett",
+            "and", "but", "while", "of", "for");
+
+    Set<Rule> rules = new HashSet<Rule>();
+
+    // rules expanding a non terminal into other non terminals
+    rules.add(new Rule("S", "NP", "VP"));
+    rules.add(new Rule("P", "S", "CJ", "S"));
+    rules.add(new Rule("P", "S", "DJ", "S"));
+    rules.add(new Rule("VP", "Vi"));
+    rules.add(new Rule("VP", "Vt", "NP"));
+    rules.add(new Rule("VP", "VP", "PP"));
+    rules.add(new Rule("NP", "DT", "NN"));
+    rules.add(new Rule("NP", "NP", "PP"));
+    rules.add(new Rule("NP", "NNP"));
+    rules.add(new Rule("PP", "IN", "NP"));
+
+    // rules expanding a non terminal into a terminal
+    rules.add(new Rule("Vi", "sleeps"));
+    rules.add(new Rule("Vt", "saw"));
+    rules.add(new Rule("NN", "man"));
+    rules.add(new Rule("NN", "woman"));
+    rules.add(new Rule("NN", "telescope"));
+    rules.add(new Rule("DT", "the"));
+    rules.add(new Rule("IN", "with"));
+    rules.add(new Rule("IN", "in"));
+    rules.add(new Rule("IN", "for"));
+    rules.add(new Rule("IN", "of"));
+    rules.add(new Rule("NNP", "tommaso"));
+    rules.add(new Rule("NNP", "simone"));
+    rules.add(new Rule("NNP", "joao"));
+    rules.add(new Rule("NNP", "tigro"));
+    rules.add(new Rule("NNP", "michele"));
+    rules.add(new Rule("NNP", "scarlett"));
+    rules.add(new Rule("CJ", "and"));
+    rules.add(new Rule("DJ", "but"));
+    rules.add(new Rule("DJ", "while"));
+
+    contextFreeGrammar = new ContextFreeGrammar(nonTerminals, terminals, rules, startSymbol);
+  }
+
+  @Test
+  public void testSingleExpansion() throws Exception {
+    checkExpansion(contextFreeGrammar.leftMostDerivation("S"));
+  }
+
+
+  @Test
+  public void testMultipleSentencesExpansion() throws Exception {
+    String[] conjunction = contextFreeGrammar.leftMostDerivation("S", "CJ", "S");
+    checkExpansion(conjunction);
+
+    String[] nested = contextFreeGrammar.leftMostDerivation("S", "DJ", "S", "CJ", "P");
+    checkExpansion(nested);
+  }
+
+  /**
+   * creates a set holding the given symbols, added one by one in the given order
+   */
+  private static Set<String> setOf(String... symbols) {
+    Set<String> set = new HashSet<String>();
+    for (String symbol : symbols) {
+      set.add(symbol);
+    }
+    return set;
+  }
+
+  /**
+   * checks that the expansion is not empty and made of terminal symbols only, then prints it
+   */
+  private void checkExpansion(String[] expansion) {
+    assertNotNull(expansion);
+    assertTrue(expansion.length > 0);
+    for (String term : expansion) {
+      assertTrue("term " + term + " is not a terminal symbol", terminals.contains(term));
+    }
+    System.err.println(Arrays.toString(expansion));
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java
new file mode 100644
index 0000000..8016679
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java

@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.classification;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.classification.SimpleNaiveBayesClassifier}
+ */
+public class SimpleNaiveBayesClassifierTest {
+
+  @Test
+  public void ppsIntegrationTest() throws Exception {
+    Map<String, String> trainedCorpus = new HashMap<String, String>();
+    trainedCorpus.put("CAVOUR ad.te napoleone III affare: cat. C/2 ottimo" +
+            " stato ingresso angolo cottura bagno con doccia e camera. " +
+            "ottimo per investimento o piccolo studio per professionisti" +
+            " e 99.000 Ag.Imm.", "A");
+    trainedCorpus.put("TRASTEVERE via degli Orti di Trastevere in palazzo " +
+            "signorile (con s. portineria) appartamento mq 180 + cantina mq" +
+            " 6 con rifiniture di pregio marmi & armadi a muro + ampia " +
+            "balconata 50 mq assolutamente no agenzie E 930.000", "N");
+    trainedCorpus.put("CORSO VITTORIO Emanuele V. del banco di santo spirito" +
+            " 3° piano con ascensore appartamento di 142 mq commerciali " +
+            "composto da: ingresso disimpegno tre camere soggiorno cucina" +
+            " due bagni due cantine per un totale di 15 mq e. 900.000 Ag.Imm.", "A");
+    trainedCorpus.put("TRASTEVERE Ippolito Nievo quinto piano tripla " +
+            "esposizione ingresso salone doppio cucina abitabile tre " +
+            "camere servizio ripostiglio terrazzo e soffitta da ristrutturare " +
+            "e 650.000 Ag.Imm.", "A");
+    trainedCorpus.put("TRASTEVERE E.Rolli solo privati palazzo epoca doppia" +
+            " esposizione ingresso soppalcato soggiorno 2 camere cucinotto " +
+            "bagno 84 mq IV piano no ascensore 385.000 giardino condominio", "N");
+    trainedCorpus.put("CENTRO monti sforza elegante edificio con ampi spazi" +
+            " comuni ristrutturato ingresso soggiorno angolo cucina camera " +
+            "letto armadi a muro bagno vasca con finestra pavimenti cotto " +
+            "luminoso silenzioso doppio affaccio climatizzato e 405.000 ag. " +
+            "imm. cl en.g", "A");
+    trainedCorpus.put("SAN LORENZO app.to epoca privato vende salone due " +
+            "camere cucina abit. due bagni ripostigli vari II piano con" +
+            " ascensore triplo affaccio E 530.000 ", "N");
+    trainedCorpus.put("SAN LORENZO Via Porta Labicana appartamento mq 80 " +
+            "piano rialzato con ingresso 3 camere cucina bagno E 395.000 ", "N");
+    trainedCorpus.put("SAN LORENZO via degli Umbri I° p. 3 stanze cucina " +
+            "servizio terrazzino interno buono stato E. 390.000 tratt. " +
+            "assoloutamente no agenzie ", "N");
+
+
+    SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier(trainedCorpus);
+
+    Boolean isAgency = classifier.calculateClass("CENTRO S.Maria Maggiore " +
+            "angolo Napoleone III in palazzo epoca con portiere 110 mq ristrutt." +
+            " IIp salone doppio cucina ab. 2 camere bagno ripost. balcone " +
+            "perimetrale E. 730.000 tratt. ").equals("A");
+    assertFalse(isAgency);
+
+    isAgency = classifier.calculateClass("TRASTEVERE via del Mattonato in " +
+            "piccola palazzina d'epoca app.to finemente ristrutturato " +
+            "ingresso salone camera cucina tinello servizio balconcino " +
+            "aria condiz. e 540.000 Ag.Imm. ").equals("A");
+    assertTrue(isAgency);
+
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java
new file mode 100644
index 0000000..75dbe16
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.ngram;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.ngram.NGramUtils}
+ */
+public class NGramUtilsTest {
+  @Test
+  public void testBigram() {
+    Collection<String[]> sentences = new LinkedList<String[]>();
+    sentences.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+    sentences.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+    sentences.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
+    sentences.add(new String[]{}); // the corpus also holds one empty sentence
+    Double iAfterStart = NGramUtils.calculateBigramMLProbability("I", "<s>", sentences);
+    assertTrue(iAfterStart > 0);
+    assertEquals(Double.valueOf(0.6666666666666666d), iAfterStart); // "<s> I" opens 2 of the 3 sentences
+    Double endAfterSam = NGramUtils.calculateBigramMLProbability("</s>", "Sam", sentences);
+    assertEquals(Double.valueOf(0.5d), endAfterSam); // "Sam" appears twice, once right before "</s>"
+    Double samAfterStart = NGramUtils.calculateBigramMLProbability("Sam", "<s>", sentences);
+    assertEquals(Double.valueOf(0.3333333333333333d), samAfterStart); // "<s> Sam" opens 1 of the 3
+  }
+
+  @Test
+  public void testTrigram() {
+    Collection<String[]> sentences = new LinkedList<String[]>();
+    sentences.add(new String[]{"<s>", "I", "am", "Sam", "</s>"});
+    sentences.add(new String[]{"<s>", "Sam", "I", "am", "</s>"});
+    sentences.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"});
+    sentences.add(new String[]{}); // the corpus also holds one empty sentence
+    Double samAfterIAm = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", sentences);
+    assertTrue(samAfterIAm > 0);
+    assertEquals(Double.valueOf(0.5), samAfterIAm); // "I am" appears twice, once followed by "Sam"
+    Double amAfterSamI = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", sentences);
+    assertEquals(Double.valueOf(1d), amAfterSamI); // "Sam I" appears once, followed by "am"
+  }
+
+  @Test
+  public void testLinearInterpolation() throws Exception {
+    Collection<String[]> sentences = new LinkedList<String[]>();
+    sentences.add(new String[]{"the", "green", "book", "STOP"});
+    sentences.add(new String[]{"my", "blue", "book", "STOP"});
+    sentences.add(new String[]{"his", "green", "house", "STOP"});
+    sentences.add(new String[]{"book", "STOP"});
+    Double weight = 1d / 3d; // the same weight is passed for all three lambda arguments
+    Double interpolated = NGramUtils.calculateLinearInterpolationProbability("the", "green", "book", sentences, weight, weight, weight);
+    assertNotNull(interpolated);
+    assertTrue(interpolated > 0);
+    assertEquals("wrong result", Double.valueOf(0.5714285714285714d), interpolated);
+  }
+
+  @Test
+  public void testLinearInterpolation2() throws Exception {
+    Collection<String[]> sentences = new LinkedList<String[]>();
+    sentences.add(new String[]{"D", "N", "V", "STOP"});
+    sentences.add(new String[]{"D", "N", "V", "STOP"});
+    Double weight = 1d / 3d; // the same weight is passed for all three lambda arguments
+    Double interpolated = NGramUtils.calculateLinearInterpolationProbability("N", "V", "STOP", sentences, weight, weight, weight);
+    assertNotNull(interpolated);
+    assertTrue(interpolated > 0);
+    assertEquals("wrong result", Double.valueOf(0.75d), interpolated);
+  }
+
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java
new file mode 100644
index 0000000..d4a8df0
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java

@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TestUtils;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Test;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.regression.GradientDescentUtils}
+ */
+public class GradientDescentUtilsTest {
+
+  @Test
+  public void testConvergence() throws Exception {
+    // no assertion on the outcome: the test passes as long as the call below returns without throwing
+    TrainingSet samples = new TrainingSet();
+    TestUtils.fillTrainingSet(samples, 100, 5);
+    GradientDescentUtils.batchGradientDescent(new LinearCombinationHypothesis(), samples, 0.00002);
+  }
+}

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java
new file mode 100644
index 0000000..d0cb9de
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.regression;
+
+import org.apache.opennlp.utils.TrainingExample;
+import org.apache.opennlp.utils.TrainingSet;
+import org.junit.Test;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Testcase for {@link org.apache.opennlp.utils.regression.RegressionModelUtils}
+ */
+public class RegressionModelUtilsTest {
+  @Test
+  public void testLMS() throws Exception {
+    TrainingSet singleExample = new TrainingSet();
+    singleExample.add(new TrainingExample(new double[]{10, 10}, 1));
+    LinearCombinationHypothesis linear = new LinearCombinationHypothesis();
+    linear.updateParameters(new double[]{1, 1});
+    double[] theta = RegressionModelUtils.batchLeastMeanSquareUpdate(new double[]{1, 1}, 0.1, singleExample, linear);
+    assertNotNull(theta);
+    assertTrue(theta.length == 2);
+    for (double parameter : theta) { // both updated parameters must be exactly -18
+      assertTrue(parameter == -18d);
+    }
+  }
+}

diff --git a/nlp-utils/src/test/resources/presidents.txt b/nlp-utils/src/test/resources/presidents.txt
new file mode 100644
index 0000000..a765da9
--- /dev/null
+++ b/nlp-utils/src/test/resources/presidents.txt

@@ -0,0 +1,44 @@
+Washington 94
+Adams 48
+Jefferson 96
+Madison 96
+Monroe 96
+Adams 48
+Jackson 96
+Van Buren 48
+Harrison 1
+Tyler 47
+Polk 48
+Taylor 16
+Fillmore 32
+Pierce 48
+Buchanan 48
+Lincoln 49
+Johnson 47
+Grant 96
+Hayes 48
+Garfield 7
+Arthur 41
+Cleveland 48
+Harrison 48
+Cleveland 48
+McKinley 54
+Roosevelt 90
+Taft 48
+Wilson 96
+Harding 29
+Coolidge 67
+Hoover 48
+Roosevelt 146
+Truman 92
+Eisenhower 96
+Kennedy 34
+Johnson 62
+Nixon 67
+Ford 29
+Carter 48
+Reagan 96
+Bush 48
+Clinton 96
+Bush 96
+Obama 48
\ No newline at end of file
commit	af92a7d8be2af01705f3e9090da9cdd9bc5c568a	[log] [tgz]
author	Jörn Kottmann <joern@apache.org>	Mon Mar 10 12:28:25 2014 +0000
committer	Jörn Kottmann <joern@apache.org>	Mon Mar 10 12:28:25 2014 +0000
tree	bc83fd32f6a40ec6cc3f5889ca20b278f0384097
parent	874769963efd435a751c1085e4c60c29097d9167 [diff]