OPENNLP-821 Moved mallet addon from my github repository to here
diff --git a/mallet-addon/LICENSE b/mallet-addon/LICENSE
new file mode 100644
index 0000000..e06d208
--- /dev/null
+++ b/mallet-addon/LICENSE
@@ -0,0 +1,202 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/mallet-addon/params/crf-params.txt b/mallet-addon/params/crf-params.txt
new file mode 100644
index 0000000..0a2ace3
--- /dev/null
+++ b/mallet-addon/params/crf-params.txt
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Sample machine learning properties file
+Algorithm=opennlp.addons.mallet.CRFTrainer
+Cutoff=0
+Iterations=100
+
diff --git a/mallet-addon/params/maxent-params.txt b/mallet-addon/params/maxent-params.txt
new file mode 100644
index 0000000..d8cf288
--- /dev/null
+++ b/mallet-addon/params/maxent-params.txt
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Sample machine learning properties file
+Algorithm=opennlp.addons.mallet.MaxentTrainer
+Cutoff=0
+Iterations=100
+
diff --git a/mallet-addon/pom.xml b/mallet-addon/pom.xml
new file mode 100644
index 0000000..38f1fc9
--- /dev/null
+++ b/mallet-addon/pom.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>kottmann.opennlp</groupId>
+ <artifactId>mallet-addon</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+
+ <packaging>jar</packaging>
+ <name>Apache OpenNLP Mallet Addon</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0</version>
+ </dependency>
+
+ <dependency>
+ <groupId>cc.mallet</groupId>
+ <artifactId>mallet</artifactId>
+ <version>2.0.7</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <version>2.1</version>
+ <executions>
+ <execution>
+ <id>copy-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-dependencies</goal>
+ </goals>
+ <configuration>
+ <excludeScope>provided</excludeScope>
+ <stripVersion>true</stripVersion>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>true</skipTests>
+ <argLine>-Xmx512m</argLine>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
new file mode 100644
index 0000000..2980131
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import opennlp.tools.ml.AbstractSequenceTrainer;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.Sequence;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.ml.model.SequenceStream;
+import cc.mallet.fst.CRF;
+import cc.mallet.fst.CRFTrainerByLabelLikelihood;
+import cc.mallet.fst.Transducer;
+import cc.mallet.types.Alphabet;
+import cc.mallet.types.FeatureVector;
+import cc.mallet.types.FeatureVectorSequence;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import cc.mallet.types.Label;
+import cc.mallet.types.LabelAlphabet;
+import cc.mallet.types.LabelSequence;
+
+// The transducer handling should be abstract, since there are two kinds: CRF and HMM.
+// For an HMM we don't need to generate any features (how to do that nicely?!)
+// Dummy feature generator ?!
+/**
+ * Sequence trainer which fits a Mallet {@link CRF} on OpenNLP event sequences.
+ */
+public class CRFTrainer extends AbstractSequenceTrainer {
+
+  public CRFTrainer(Map<String, String> trainParams,
+      Map<String, String> reportMap) {
+    super(trainParams, reportMap);
+  }
+
+  /**
+   * Parses the hard coded order specification "0,1" and prints every parsed
+   * order to stderr.
+   *
+   * @return the orders handed to {@code CRF.addOrderNStates}
+   */
+  private int[] getOrders() {
+    String[] ordersString = "0,1".split(",");
+    int[] orders = new int[ordersString.length];
+    for (int i = 0; i < ordersString.length; i++) {
+      orders[i] = Integer.parseInt(ordersString[i]);
+      System.err.println("Orders: " + orders[i]);
+    }
+    return orders;
+  }
+
+  // TODO: Interface has to be changed here,
+  /**
+   * Converts the sequences into Mallet instances and trains a CRF on them with
+   * {@link CRFTrainerByLabelLikelihood}.
+   *
+   * @param sequences the training sequences; the context of every event becomes
+   *     a feature vector and its outcome the label at that position
+   * @return the trained CRF wrapped into a {@link TransducerModel}
+   * @throws IOException part of the signature, no statement here throws it directly
+   */
+  @Override
+  public SequenceClassificationModel<String> doTrain(SequenceStream sequences)
+      throws IOException {
+
+    Alphabet dataAlphabet = new Alphabet();
+    LabelAlphabet targetAlphabet = new LabelAlphabet();
+
+    InstanceList trainingData = new InstanceList(dataAlphabet, targetAlphabet);
+
+    int nameIndex = 0;
+    for (Sequence sequence : sequences) {
+      // fetched once and reused for the array sizes and the loop below
+      Event[] events = sequence.getEvents();
+
+      FeatureVector[] featureVectors = new FeatureVector[events.length];
+      Label[] malletOutcomes = new Label[events.length];
+
+      for (int eventIndex = 0; eventIndex < events.length; eventIndex++) {
+
+        Event event = events[eventIndex];
+
+        String[] features = event.getContext();
+        int[] malletFeatures = new int[features.length];
+
+        for (int featureIndex = 0; featureIndex < features.length; featureIndex++) {
+          malletFeatures[featureIndex] = dataAlphabet.lookupIndex(
+              features[featureIndex], true);
+        }
+
+        // Note: Might contain a feature more than once ... will that
+        // work ?!
+        featureVectors[eventIndex] = new FeatureVector(dataAlphabet,
+            malletFeatures);
+
+        malletOutcomes[eventIndex] = targetAlphabet.lookupLabel(
+            event.getOutcome(), true);
+      }
+
+      LabelSequence malletOutcomeSequence = new LabelSequence(malletOutcomes);
+
+      FeatureVectorSequence malletSequence = new FeatureVectorSequence(
+          featureVectors);
+
+      trainingData.add(new Instance(malletSequence, malletOutcomeSequence,
+          "name" + nameIndex++, "source"));
+    }
+
+    CRF crf = new CRF(trainingData.getDataAlphabet(),
+        trainingData.getTargetAlphabet());
+
+    String startStateName = crf.addOrderNStates(trainingData, getOrders(),
+        (boolean[]) null,
+        // default label
+        "other", Pattern.compile("other,*-cont"), // forbidden pattern
+        null, // allowed pattern
+        true);
+
+    // Every initial weight is set to IMPOSSIBLE_WEIGHT and afterwards the start
+    // state gets 0.0. Assigning 0.0 to the start state before this loop would
+    // be overwritten by the loop, so it is only assigned once, below.
+    for (int i = 0; i < crf.numStates(); i++) {
+      crf.getState(i).setInitialWeight(Transducer.IMPOSSIBLE_WEIGHT);
+    }
+
+    crf.getState(startStateName).setInitialWeight(0.0);
+    crf.setWeightsDimensionAsIn(trainingData, true);
+
+    // objective: label likelihood
+    CRFTrainerByLabelLikelihood crfTrainer = new CRFTrainerByLabelLikelihood(
+        crf);
+    crfTrainer.setGaussianPriorVariance(1.0);
+
+    // NOTE(review): the iteration limit is Integer.MAX_VALUE, an "Iterations"
+    // training parameter is not read here - confirm that this is intended.
+    crfTrainer.train(trainingData, Integer.MAX_VALUE);
+
+    // explicit type argument instead of the raw TransducerModel type
+    return new TransducerModel<String>(crf);
+  }
+}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
new file mode 100644
index 0000000..5f6661d
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.model.SerializableArtifact;
+import cc.mallet.classify.Classification;
+import cc.mallet.classify.Classifier;
+import cc.mallet.types.Alphabet;
+import cc.mallet.types.FeatureVector;
+import cc.mallet.types.Instance;
+import cc.mallet.types.Label;
+import cc.mallet.types.LabelAlphabet;
+import cc.mallet.types.LabelVector;
+
+/** Adapts a trained Mallet {@link Classifier} to the OpenNLP {@link MaxentModel} interface. */
+class ClassifierModel implements MaxentModel, SerializableArtifact {
+
+  private final Classifier classifer;
+
+  public ClassifierModel(Classifier classifer) {
+    this.classifer = classifer;
+  }
+
+  Classifier getClassifer() {
+    return classifer;
+  }
+
+  /**
+   * Classifies a context with the wrapped classifier.
+   * @param features the context; features missing in the model alphabet are skipped
+   * @return one value per outcome, at the index used by {@link #getOutcome(int)}
+   */
+  @Override
+  public double[] eval(String[] features) {
+    Alphabet dataAlphabet = classifer.getAlphabet();
+    List<Integer> malletFeatureList = new ArrayList<>(features.length);
+    for (String feature : features) {
+      // Look up without adding: the one argument lookupIndex inserts unseen
+      // features into the model alphabet unless its growth was stopped.
+      int featureId = dataAlphabet.lookupIndex(feature, false);
+      if (featureId != -1) {
+        malletFeatureList.add(featureId);
+      }
+    }
+    int[] malletFeatures = new int[malletFeatureList.size()];
+    for (int i = 0; i < malletFeatures.length; i++) {
+      malletFeatures[i] = malletFeatureList.get(i);
+    }
+    FeatureVector fv = new FeatureVector(dataAlphabet, malletFeatures);
+    Classification result = classifer.classify(new Instance(fv, null, null, null));
+    LabelVector labeling = result.getLabelVector();
+    LabelAlphabet targetAlphabet = classifer.getLabelAlphabet();
+    double[] outcomes = new double[targetAlphabet.size()];
+    for (int i = 0; i < outcomes.length; i++) {
+      Label label = targetAlphabet.lookupLabel(i);
+      outcomes[i] = labeling.getValueAtRank(labeling.getRank(label));
+    }
+    return outcomes;
+  }
+
+  /** Delegates to {@link #eval(String[])}; {@code probs} is neither read nor filled. */
+  @Override
+  public double[] eval(String[] context, double[] probs) {
+    return eval(context);
+  }
+
+  /** Delegates to {@link #eval(String[])}; the feature {@code values} are ignored. */
+  @Override
+  public double[] eval(String[] context, float[] values) {
+    return eval(context);
+  }
+
+  @Override
+  public String getBestOutcome(double[] ocs) {
+    int best = 0;
+    for (int i = 1; i < ocs.length; i++) {
+      if (ocs[i] > ocs[best]) {
+        best = i;
+      }
+    }
+    return getOutcome(best);
+  }
+
+  @Override
+  public String getAllOutcomes(double[] outcomes) {
+    return null; // not implemented
+  }
+
+  @Override
+  public String getOutcome(int i) {
+    return classifer.getLabelAlphabet().lookupLabel(i).getEntry().toString();
+  }
+
+  @Override
+  public int getIndex(String outcome) {
+    return classifer.getLabelAlphabet().lookupIndex(outcome, false); // query only, no insert
+  }
+
+  @Override
+  public int getNumOutcomes() {
+    return classifer.getLabelAlphabet().size();
+  }
+
+  @Override
+  public Class<?> getArtifactSerializerClass() {
+    return ClassifierModelSerializer.class;
+  }
+}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
new file mode 100644
index 0000000..9cfb6f2
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+
+import cc.mallet.classify.Classifier;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+
+// The standard method for saving classifiers in Mallet is through Java serialization.
+
+public class ClassifierModelSerializer implements ArtifactSerializer<ClassifierModel> {
+  /** Reads a classifier from {@code in} (not closed here); only load trusted models. */
+  @Override
+  public ClassifierModel create(InputStream in) throws IOException, InvalidFormatException {
+    ObjectInputStream ois = new ObjectInputStream(in);
+    try {
+      Object payload = ois.readObject();
+      if (!(payload instanceof Classifier)) { // else an unchecked ClassCastException
+        throw new InvalidFormatException("Expected a serialized Mallet Classifier!");
+      }
+      return new ClassifierModel((Classifier) payload);
+    } catch (ClassNotFoundException e) {
+      throw new IOException(e);
+    }
+  }
+
+  /** Writes the wrapped classifier to {@code out} and flushes; {@code out} is not closed. */
+  @Override
+  public void serialize(ClassifierModel artifact, OutputStream out) throws IOException {
+    ObjectOutputStream oos = new ObjectOutputStream(out);
+    oos.writeObject(artifact.getClassifer());
+    oos.flush();
+  }
+}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
new file mode 100644
index 0000000..34f5f7c
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.model.DataIndexer;
+import opennlp.tools.ml.model.MaxentModel;
+import cc.mallet.classify.Classifier;
+import cc.mallet.classify.MaxEntTrainer;
+import cc.mallet.types.Alphabet;
+import cc.mallet.types.FeatureVector;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import cc.mallet.types.LabelAlphabet;
+
+/** Event trainer which trains a Mallet classifier with {@link MaxEntTrainer}. */
+public class MaxentTrainer extends AbstractEventTrainer {
+
+  public MaxentTrainer(Map<String, String> trainParams,
+      Map<String, String> reportMap) {
+    super(trainParams, reportMap);
+  }
+
+  @Override
+  public boolean isSortAndMerge() {
+    return true;
+  }
+
+  /**
+   * Turns every indexed context into a Mallet {@link Instance} and trains on them.
+   *
+   * @param indexer supplies predicate labels, contexts, outcomes and event counts
+   * @return the trained classifier wrapped into a {@link ClassifierModel}
+   * @throws IOException part of the signature, no statement here throws it directly
+   */
+  @Override
+  public MaxentModel doTrain(DataIndexer indexer) throws IOException {
+    String[] predicates = indexer.getPredLabels();
+    int[] outcomeIds = indexer.getOutcomeList();
+    int[][] contexts = indexer.getContexts();
+
+    Alphabet dataAlphabet = new Alphabet(predicates.length);
+    LabelAlphabet targetAlphabet = new LabelAlphabet();
+    Collection<Instance> instances = new ArrayList<>();
+
+    for (int c = 0; c < contexts.length; c++) {
+      int[] context = contexts[c];
+      int[] featureIds = new int[context.length];
+      double[] weights = new double[context.length];
+
+      for (int f = 0; f < context.length; f++) {
+        featureIds[f] = dataAlphabet.lookupIndex(predicates[context[f]], true);
+        // the weight is the getNumTimesEventsSeen() entry of the current context
+        weights[f] = indexer.getNumTimesEventsSeen()[c];
+      }
+
+      FeatureVector fv = new FeatureVector(dataAlphabet, featureIds, weights);
+      instances.add(new Instance(fv,
+          targetAlphabet.lookupLabel(indexer.getOutcomeLabels()[outcomeIds[c]], true),
+          "name", "data-indexer"));
+    }
+
+    InstanceList trainingData = new InstanceList(dataAlphabet, targetAlphabet);
+    // The result of the alphabet check is not used; next() throws a
+    // NoSuchElementException when no instance was created above.
+    Alphabet.alphabetsMatch(trainingData, instances.iterator().next());
+    trainingData.addAll(instances);
+
+    MaxEntTrainer trainer = new MaxEntTrainer();
+    Classifier classifier = trainer.train(trainingData);
+    return new ClassifierModel(classifier);
+  }
+}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
new file mode 100644
index 0000000..52f2ce5
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.BeamSearchContextGenerator;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.model.SerializableArtifact;
+import cc.mallet.fst.CRF;
+import cc.mallet.fst.MaxLatticeDefault;
+import cc.mallet.fst.Transducer;
+import cc.mallet.types.Alphabet;
+import cc.mallet.types.FeatureVector;
+import cc.mallet.types.FeatureVectorSequence;
+import cc.mallet.types.Sequence;
+
+/**
+ * Adapts a Mallet {@link Transducer} to OpenNLP's {@link SequenceClassificationModel}.
+ *
+ * @param <T> the type of the tokens in the sequences to label
+ */
+public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact {
+
+  private final Transducer model;
+
+  /** @param model the Mallet transducer that performs the decoding */
+  public TransducerModel(Transducer model) {
+    this.model = model;
+  }
+
+  /** Returns the wrapped transducer; package-private, read by {@code TransducerModelSerializer}. */
+  Transducer getModel() {
+    return model;
+  }
+
+  /** Returns the first result of {@code bestSequences} when asked for a single sequence. */
+  public opennlp.tools.util.Sequence bestSequence(T[] sequence, Object[] additionalContext,
+      BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) {
+    return bestSequences(1, sequence, additionalContext, cg, validator)[0];
+  }
+
+  /**
+   * Decodes the best outcome sequence(s) for {@code sequence} with the wrapped transducer.
+   * @param numSequences 1 calls {@code Transducer.transduce}; any other value asks a
+   *     {@code MaxLatticeDefault} for that many best output sequences
+   * @param sequence the tokens to label
+   * @param additionalContext passed through to {@code cg} unchanged
+   * @param cg produces the string features for each token position
+   * @param validator part of the interface signature; not consulted by this implementation
+   * @return the decoded sequences, each holding the outcomes as strings
+   */
+  public opennlp.tools.util.Sequence[] bestSequences(int numSequences, T[] sequence,
+      Object[] additionalContext, BeamSearchContextGenerator<T> cg,
+      SequenceValidator<T> validator) {
+    // TODO: CRF.getInputAlphabet
+    Alphabet dataAlphabet = model.getInputPipe().getAlphabet();
+    FeatureVector[] featureVectors = new FeatureVector[sequence.length];
+
+    // TODO: The feature generator needs to get the detected sequence in the end
+    // to update the adaptive data!
+    // TODO: Put together a feature generator which doesn't fail if outcomes is null!
+    for (int i = 0; i < sequence.length; i++) {
+      String[] features = cg.getContext(i, sequence, null, additionalContext);
+      // Collect the indices of the features the model's alphabet contains; others are dropped.
+      int[] known = new int[features.length];
+      int knownCount = 0;
+      for (String feature : features) {
+        if (dataAlphabet.contains(feature)) {
+          known[knownCount++] = dataAlphabet.lookupIndex(feature);
+        }
+      }
+      // Note: Might contain a feature more than once ... will that work ?!
+      featureVectors[i] = new FeatureVector(dataAlphabet, Arrays.copyOf(known, knownCount));
+    }
+
+    FeatureVectorSequence malletSequence = new FeatureVectorSequence(featureVectors);
+    Sequence[] answers;
+    if (numSequences == 1) {
+      answers = new Sequence[] {model.transduce(malletSequence)};
+    } else {
+      // NOTE(review): the trailing 3 is presumably Mallet's maxCaches argument - confirm
+      MaxLatticeDefault lattice = new MaxLatticeDefault(model, malletSequence, null, 3);
+      answers = lattice.bestOutputSequences(numSequences).toArray(new Sequence[0]);
+    }
+
+    opennlp.tools.util.Sequence[] outcomeSequences = new opennlp.tools.util.Sequence[answers.length];
+    for (int i = 0; i < answers.length; i++) {
+      Sequence seq = answers[i];
+      List<String> outcomes = new ArrayList<>(seq.size());
+      for (int j = 0; j < seq.size(); j++) {
+        outcomes.add(seq.get(j).toString());
+      }
+      outcomeSequences[i] = new opennlp.tools.util.Sequence(outcomes);
+    }
+    return outcomeSequences;
+  }
+
+  /** Returns {@code TransducerModelSerializer.class} as this artifact's serializer class. */
+  @Override
+  public Class<?> getArtifactSerializerClass() {
+    return TransducerModelSerializer.class;
+  }
+}
diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
new file mode 100644
index 0000000..b793ca2
--- /dev/null
+++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.addons.mallet;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import cc.mallet.fst.Transducer;
+
+public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> {
+
+ @Override
+ public TransducerModel create(InputStream in) throws IOException,
+ InvalidFormatException {
+ ObjectInputStream ois = new ObjectInputStream(in);
+ try {
+ Transducer classifier = (Transducer) ois.readObject();
+ return new TransducerModel(classifier);
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ }
+ }
+
+ @Override
+ public void serialize(TransducerModel artifact, OutputStream out)
+ throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(out);
+ oos.writeObject(artifact.getModel());
+ oos.flush();
+ }
+}