Added unit tests and corrected some mistakes; more unit tests are still needed.
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index 136d5f2..d890ba0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
@@ -27,6 +27,7 @@
protected boolean isCoarseSense;
public static boolean isStemCompare;
+ protected boolean returnMultiple;
public static enum SenseSource {
WORDNET, WSDHELPER, OTHER;
@@ -61,8 +62,17 @@
this.senseSource = senseSource;
}
+ public boolean isReturnMultiple() {
+ return returnMultiple;
+ }
+
+ public void setReturnMultiple(boolean returnMultiple) {
+ this.returnMultiple = returnMultiple;
+ }
+
public WSDParameters() {
this.isCoarseSense = false;
+ this.returnMultiple = false;
}
/**
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index 06451e5..a825e11 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
@@ -150,7 +150,7 @@
if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
- + tokenTags[i];
+ + WSDHelper.getNonRelevWordsDef(tokenTags[i]);
String[] sense = { s };
senses.add(sense);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
index fbf8ba1..14413d6 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
@@ -55,8 +55,9 @@
}
public String[] extractSurroundingWords(int index, String[] toks,
- String[] lemmas) {
+ String[] lemmas, int windowSize) {
+ // TODO consider the windowSize
ArrayList<String> contextWords = new ArrayList<String>();
for (int i = 0; i < toks.length; i++) {
@@ -123,7 +124,7 @@
HashSet<String> surroundingWords = new HashSet<>();
surroundingWords.addAll(Arrays.asList(extractSurroundingWords(index, toks,
- lemmas)));
+ lemmas, windowSize)));
String[] localCollocations = extractLocalCollocations(index, toks, ngram);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
index 55bc0ed..b1e8a18 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
@@ -41,16 +41,17 @@
protected static IMSContextGenerator cg = new DefaultIMSContextGenerator();
- public IMSME(IMSParameters params){
+ public IMSME(IMSParameters params) {
this.params = params;
}
-
+
public IMSME(IMSModel model, IMSParameters params) {
this.imsModel = model;
this.params = params;
-
-// Assert.assertEquals(model.getWindowSize(),params.getWindowSize());
-// Assert.assertEquals(model.getNgram(),params.getNgram());
+ }
+
+ public IMSModel getModel() {
+ return imsModel;
}
public void setModel(IMSModel model) {
@@ -65,7 +66,7 @@
TrainingParameters mlParams, IMSParameters imsParams,
IMSFactory imsfactory) throws IOException {
- ArrayList<String> surroundingWordModel = buildSurroundingWords(samples);
+ ArrayList<String> surroundingWordModel = buildSurroundingWords(samples, imsParams.getWindowSize());
HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
@@ -88,13 +89,13 @@
events.add(ev);
- es = ObjectStreamUtils.createObjectStream(events);
-
} while ((sample = samples.read()) != null);
}
- EventTrainer trainer = TrainerFactory.getEventTrainer(
- mlParams.getSettings(), manifestInfoEntries);
+ es = ObjectStreamUtils.createObjectStream(events);
+
+ EventTrainer trainer = TrainerFactory
+ .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
imsModel = trainer.train(es);
return new IMSModel(lang, wordTag, imsParams.windowSize, imsParams.ngram,
@@ -102,13 +103,13 @@
}
public static ArrayList<String> buildSurroundingWords(
- ObjectStream<WSDSample> samples) throws IOException {
+ ObjectStream<WSDSample> samples, int windowSize) throws IOException {
DefaultIMSContextGenerator imsCG = new DefaultIMSContextGenerator();
ArrayList<String> surroundingWordsModel = new ArrayList<String>();
WSDSample sample;
while ((sample = samples.read()) != null) {
- String[] words = imsCG.extractSurroundingWords(
- sample.getTargetPosition(), sample.getSentence(), sample.getLemmas());
+ String[] words = imsCG.extractSurroundingWords(sample.getTargetPosition(),
+ sample.getSentence(), sample.getLemmas(), windowSize);
if (words.length > 0) {
for (String word : words) {
@@ -125,10 +126,11 @@
if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
String wordTag = sample.getTargetWordTag();
- String trainingFile = ((IMSParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
+ if (imsModel == null
+ || !imsModel.getWordTag().equals(sample.getTargetWordTag())) {
- if (imsModel==null || !imsModel.getWordTag().equals(sample.getTargetWordTag())) {
+ String trainingFile = ((IMSParameters) this.getParams())
+ .getTrainingDataDirectory() + sample.getTargetWordTag();
File file = new File(trainingFile + ".ims.model");
if (file.exists() && !file.isDirectory()) {
@@ -167,11 +169,11 @@
}
} else {
-
MFS mfs = new MFS();
return mfs.disambiguate(wordTag);
}
} else {
+
String outcome = "";
String[] context = cg.getContext(sample,
@@ -226,8 +228,8 @@
*/
public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
String[] lemmas, int index) {
- return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
- index));
+ return disambiguate(
+ new WSDSample(tokenizedContext, tokenTags, lemmas, index));
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
index af85582..1e540cf 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
@@ -20,6 +20,7 @@
package opennlp.tools.disambiguator.ims;
import java.io.File;
+import java.security.InvalidParameterException;
import opennlp.tools.disambiguator.WSDParameters;
@@ -34,6 +35,11 @@
protected int ngram;
protected String trainingDataDirectory;
+
+ protected static final int DFLT_WIN_SIZE = 3;
+ protected static final int DFLT_NGRAM = 2;
+ protected static final String DFLT_LANG_CODE = "En";
+ protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
/**
* This constructor takes only two parameters. The default language used is
@@ -49,8 +55,9 @@
* the source of the training data
*/
public IMSParameters(int windowSize, int ngram, SenseSource senseSource,
- String trainingDataDirectory) {
- this.languageCode = "En";
+ String trainingDataDirectory){
+
+ this.languageCode = DFLT_LANG_CODE;
this.windowSize = windowSize;
this.ngram = ngram;
this.senseSource = senseSource;
@@ -63,19 +70,7 @@
}
public IMSParameters(String trainingDataDirectory) {
- this(3, 2, SenseSource.WORDNET, trainingDataDirectory);
-
- File folder = new File(trainingDataDirectory);
- if (!folder.exists())
- folder.mkdirs();
- }
-
- public IMSParameters() {
- this(3, 2, SenseSource.WORDNET, null);
- }
-
- public IMSParameters(int windowSize, int ngram) {
- this(windowSize, ngram, SenseSource.WORDNET, null);
+ this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, trainingDataDirectory);
}
public String getLanguageCode() {
@@ -109,7 +104,6 @@
* Creates the context generator of IMS
*/
public IMSContextGenerator createContextGenerator() {
-
return new DefaultIMSContextGenerator();
}
@@ -123,7 +117,7 @@
@Override
public boolean isValid() {
- // TODO Auto-generated method stub
+ // TODO recheck this pattern switch to maps
return true;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
index e286658..fe82987 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
@@ -194,9 +194,12 @@
WordPOS wordPOS = new WordPOS(word, pos);
ArrayList<Synset> synsets = wordPOS.getSynsets();
-
- int size = synsets.size();
-
+ int size;
+ if (this.parameters.isReturnMultiple()) {
+ size = synsets.size();
+ } else {
+ size = 1;
+ }
String[] senses = new String[size];
for (int i = 0; i < size; i++) {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
index 9584487..b935c45 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
@@ -39,6 +39,7 @@
public String[] extractSurroundingContextClusters(int index, String[] toks,
String[] tags, String[] lemmas, int windowSize) {
+ // TODO consider windowSize
ArrayList<String> contextClusters = new ArrayList<String>();
for (int i = 0; i < toks.length; i++) {
@@ -49,19 +50,19 @@
String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
.trim();
-
- WordPOS word = new WordPOS(lemma, tags[i]);
- // TODO check fix for "_" and null pointers
- if (lemma.length() > 1 && !lemma.contains("_")) {
- try{
- ArrayList<Synset> synsets = word.getSynsets();
- if (synsets!=null && synsets.size() > 0 ){
- contextClusters.add(synsets.get(0).getOffset() + "");
- }
- }catch(NullPointerException ex)
- {
- //TODO tagger mistake add proper exception
+ WordPOS word = new WordPOS(lemma, tags[i]);
+
+ if (lemma.length() > 1) {
+ try {
+ ArrayList<Synset> synsets = word.getSynsets();
+ if (synsets != null && synsets.size() > 0) {
+ for (Synset syn : synsets){
+ contextClusters.add(syn.getOffset() + "");
+ }
+ }
+ } catch (NullPointerException ex) {
+ // TODO tagger mistake add proper exception
}
}
@@ -80,30 +81,32 @@
*/
@Override
public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int windowSize) {
+ String[] lemmas, int windowSize, ArrayList<String> model) {
HashSet<String> surroundingContextClusters = new HashSet<>();
- surroundingContextClusters.addAll(Arrays
- .asList(extractSurroundingContextClusters(index, toks, tags, lemmas,
- windowSize)));
+ surroundingContextClusters
+ .addAll(Arrays.asList(extractSurroundingContextClusters(index, toks,
+ tags, lemmas, windowSize)));
- String[] serializedFeatures = new String[surroundingContextClusters.size()];
+ String[] serializedFeatures = new String[model.size()];
int i = 0;
-
- for (String feature : surroundingContextClusters) {
- serializedFeatures[i] = "F" + i + "=" + feature;
+ for (String word : model) {
+ if (surroundingContextClusters.contains(word.toString())) {
+ serializedFeatures[i] = "F" + i + "=1";
+ } else {
+ serializedFeatures[i] = "F" + i + "=0";
+ }
i++;
}
return serializedFeatures;
-
}
- public String[] getContext(WSDSample sample, int windowSize) {
+ public String[] getContext(WSDSample sample, int windowSize, ArrayList<String> model) {
return getContext(sample.getTargetPosition(), sample.getSentence(),
- sample.getTags(), sample.getLemmas(), windowSize);
+ sample.getTags(), sample.getLemmas(), windowSize, model);
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
index 9c0055f..4e79c38 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
@@ -19,6 +19,8 @@
package opennlp.tools.disambiguator.oscc;
+import java.util.ArrayList;
+
import opennlp.tools.disambiguator.WSDSample;
/**
@@ -27,7 +29,7 @@
public interface OSCCContextGenerator {
String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
- int windowSize);
+ int windowSize, ArrayList<String> model);
- String[] getContext(WSDSample sample, int windowSize);
+ String[] getContext(WSDSample sample, int windowSize, ArrayList<String> model);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
index 7202680..f2c67ba 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
@@ -22,7 +22,6 @@
import java.util.ArrayList;
import java.util.HashMap;
-import junit.framework.Assert;
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDisambiguator;
@@ -53,11 +52,11 @@
* Please see {@link DefaultOSCCContextGenerator}
*
* The approach finds the context clusters surrounding the target and uses a
- * classifier to judge on the best case.
+ * classifier to judge on the best case.
*
* Here an ME classifier is used.
*
-*/
+ */
public class OSCCME extends WSDisambiguator {
protected OSCCModel osccModel;
@@ -69,10 +68,12 @@
}
public OSCCME(OSCCModel model, OSCCParameters params) {
- this.osccModel = osccModel;
+ this.osccModel = model;
this.params = params;
+ }
- Assert.assertEquals(model.getWindowSize(), params.getWindowSize());
+ public OSCCModel getModel() {
+ return osccModel;
}
public void setModel(OSCCModel model) {
@@ -85,7 +86,10 @@
public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,
TrainingParameters mlParams, OSCCParameters osccParams,
- OSCCFactory imsfactory) throws IOException {
+ OSCCFactory osccFactory) throws IOException {
+
+ ArrayList<String> surroundingClusterModel = buildSurroundingClusters(
+ samples, osccParams.getWindowSize());
HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
@@ -99,39 +103,57 @@
if (sample != null) {
wordTag = sample.getTargetWordTag();
do {
-
String sense = sample.getSenseIDs().get(0);
-
- String[] context = cg.getContext(sample, osccParams.windowSize);
+ String[] context = cg.getContext(sample, osccParams.windowSize,
+ surroundingClusterModel);
Event ev = new Event(sense + "", context);
-
events.add(ev);
-
- es = ObjectStreamUtils.createObjectStream(events);
-
} while ((sample = samples.read()) != null);
}
- EventTrainer trainer = TrainerFactory.getEventTrainer(
- mlParams.getSettings(), manifestInfoEntries);
+ es = ObjectStreamUtils.createObjectStream(events);
+ EventTrainer trainer = TrainerFactory
+ .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
+
osccModel = trainer.train(es);
- return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel, manifestInfoEntries, imsfactory);
+ return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel,
+ surroundingClusterModel, manifestInfoEntries, osccFactory);
}
+ public static ArrayList<String> buildSurroundingClusters(
+ ObjectStream<WSDSample> samples, int windowSize) throws IOException {
+ // TODO modify to clusters
+ DefaultOSCCContextGenerator osccCG = new DefaultOSCCContextGenerator();
+ ArrayList<String> surroundingWordsModel = new ArrayList<String>();
+ WSDSample sample;
+ while ((sample = samples.read()) != null) {
+ String[] words = osccCG.extractSurroundingContextClusters(
+ sample.getTargetPosition(), sample.getSentence(), sample.getTags(),
+ sample.getLemmas(), windowSize);
+
+ if (words.length > 0) {
+ for (String word : words) {
+ surroundingWordsModel.add(word);
+ }
+ }
+ }
+ samples.reset();
+ return surroundingWordsModel;
+ }
@Override
public String[] disambiguate(WSDSample sample) {
if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
String wordTag = sample.getTargetWordTag();
- String trainingFile = ((OSCCParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
-
if (osccModel == null
|| !osccModel.getWordTag().equals(sample.getTargetWordTag())) {
- File file = new File(trainingFile + ".ims.model");
+ String trainingFile = ((OSCCParameters) this.getParams())
+ .getTrainingDataDirectory() + sample.getTargetWordTag();
+
+ File file = new File(trainingFile + ".oscc.model");
if (file.exists() && !file.isDirectory()) {
try {
setModel(new OSCCModel(file));
@@ -147,7 +169,8 @@
String outcome = "";
String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize);
+ ((OSCCParameters) this.params).windowSize,
+ osccModel.getContextClusters());
double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
@@ -174,7 +197,8 @@
String outcome = "";
String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize);
+ ((OSCCParameters) this.params).windowSize,
+ osccModel.getContextClusters());
double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
@@ -223,8 +247,8 @@
*/
public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
String[] lemmas, int index) {
- return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
- index));
+ return disambiguate(
+ new WSDSample(tokenizedContext, tokenTags, lemmas, index));
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
index f3b28ab..19069c3 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
@@ -21,6 +21,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Map;
import java.util.Properties;
import java.net.URL;
@@ -43,13 +44,13 @@
private static final String WINSIZE = "winsize";
private static final String CONTEXTCLUSTERS = "contextclusters";
- //private ArrayList<String> contextClusters = new ArrayList<String>();
+ private ArrayList<String> contextClusters = new ArrayList<String>();
private String wordTag;
private int windowSize;
- /*public ArrayList<String> getContextClusters() {
+ public ArrayList<String> getContextClusters() {
return contextClusters;
- }*/
+ }
public int getWindowSize() {
return windowSize;
@@ -59,9 +60,9 @@
this.windowSize = windowSize;
}
- /* public void setContextClusters(ArrayList<String> contextClusters) {
+ public void setContextClusters(ArrayList<String> contextClusters) {
this.contextClusters = contextClusters;
- }*/
+ }
public String getWordTag() {
return wordTag;
@@ -72,7 +73,7 @@
}
public OSCCModel(String languageCode, String wordTag, int windowSize,
- MaxentModel osccModel,
+ MaxentModel osccModel, ArrayList<String> contextClusters,
Map<String, String> manifestInfoEntries, OSCCFactory factory) {
super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
@@ -80,17 +81,17 @@
this.setManifestProperty(WORDTAG, wordTag);
this.setManifestProperty(WINSIZE, windowSize + "");
-// this.setManifestProperty(CONTEXTCLUSTERS,
-// StringUtils.join(contextClusters, ","));
+ this.setManifestProperty(CONTEXTCLUSTERS,
+ StringUtils.join(contextClusters, ","));
- //this.contextClusters = contextClusters;
+ this.contextClusters = contextClusters;
checkArtifactMap();
}
public OSCCModel(String languageCode, String wordTag, int windowSize,
- int ngram, MaxentModel osccModel,
+ int ngram, MaxentModel osccModel, ArrayList<String> contextClusters,
OSCCFactory factory) {
- this(languageCode, wordTag, windowSize, osccModel,
+ this(languageCode, wordTag, windowSize, osccModel, contextClusters,
null, factory);
}
@@ -135,10 +136,10 @@
public void updateAttributes() {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
- //String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
+ String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
- /* this.contextClusters = new ArrayList(
- Arrays.asList(contextClusters.split(",")));*/
+ this.contextClusters = new ArrayList(
+ Arrays.asList(contextClusters.split(",")));
this.wordTag = (String) manifest.get(WORDTAG);
this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
index 42a7742..15f1004 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
@@ -70,13 +70,11 @@
}
public OSCCParameters() {
- // TODO change the "" into null ??
- this(DFLT_WIN_SIZE, DFLT_SOURCE, "");
+ this(DFLT_WIN_SIZE, DFLT_SOURCE, null);
}
public OSCCParameters(int windowSize) {
- // TODO change the "" into null ??
- this(windowSize, DFLT_SOURCE, "");
+ this(windowSize, DFLT_SOURCE, null);
}
public String getLanguageCode() {
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
new file mode 100644
index 0000000..ce0f86e
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.ims.IMSFactory;
+import opennlp.tools.disambiguator.ims.IMSME;
+import opennlp.tools.disambiguator.ims.IMSModel;
+import opennlp.tools.disambiguator.ims.IMSParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link IMSME}.
+ *
+ * The scope of this test is to make sure that the IMS disambiguator code can be
+ * executed. This test cannot detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
+ *
+ * In this test the {@link IMSME} is trained with Semcor and then the computed
+ * model is used to predict sentences from the training sentences.
+ */
+public class IMSMETester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
+
+ static String modelsDir = "src\\test\\resources\\models\\";
+ static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
+
+ static IMSParameters IMSParams;
+ static IMSME ims;
+ static IMSFactory IMSFactory;
+ static IMSModel model;
+
+ static String test = "please.v";
+ static File outFile;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
+ List<String> tempLemmas1 = new ArrayList<String>();
+ for (int i = 0; i < sentence1.length; i++) {
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+ }
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+ List<String> tempLemmas2 = new ArrayList<String>();
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+ }
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+ }
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ IMSParams = new IMSParameters("");
+ IMSParams.setTrainingDataDirectory(trainingDataDirectory);
+ IMSFactory = new IMSFactory();
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+ IMSModel writeModel = null;
+ /*
+ * Tests training the disambiguator We test both writing and reading a model
+ * file trained by semcor
+ */
+
+ try {
+ writeModel = IMSME.train("en", sampleStream, trainingParams, IMSParams,
+ IMSFactory);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(IMSParams.getTrainingDataDirectory() + test);
+ outFile = new File(
+ IMSParams.getTrainingDataDirectory() + test + ".ims.model");
+ model = new IMSModel(outFile);
+ assertNotNull("Checking the read model", model);
+ ims = new IMSME(model, IMSParams);
+ assertNotNull("Checking the disambiguator", ims);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+
+ /*
+ * Tests disambiguating only one word : The ambiguous word "please"
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = ims.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = ims.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = ims.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check preposition", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
+
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
deleted file mode 100644
index c832156..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-import opennlp.tools.util.Span;
-
-/**
- * This is a typical example of how to call the disambiguation function in the
- * IMS class.
- * <ul>
- * <li>In the 2 first examples, the training data exist, therefore the IMS
- * approach is used.</li>
- * <li>In the 3rd example, the training data for the word to disambiguate are
- * absent, therefore the Most Frequent Sents (MFS) is returend</li>
- * </ul>
- */
-public class IMSTester {
-
- public static void main(String[] args) {
-
- // TODO write unit test
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- IMSParameters params = new IMSParameters("");
-
- WSDHelper.print(params.getTrainingDataDirectory());
-
- IMSME ims = new IMSME(params);
-
-
- // This is how to make the context for one-word-disambiguation using IMS
-
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
- }
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- // output
- String[] senses1 = ims.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- // This is how to make the context for disambiguation of span of words
-
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
- }
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
-
- // output
- List<String[]> senses2 = ims.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
-
- // This is how to make the context for all-words-disambiguation
-
- String test3 = "The summer almost over and I not to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
- }
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- // output
- List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- String[] senses = senses3.get(i);
- System.out.print(lemmas3[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
- }
-
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 13c959b..edb1346 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -19,81 +19,137 @@
package opennlp.tools.disambiguator;
+import static org.junit.Assert.assertEquals;
+
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
import opennlp.tools.disambiguator.lesk.LeskParameters.LESK_TYPE;
+import opennlp.tools.util.Span;
+import org.junit.BeforeClass;
import org.junit.Test;
+/**
+ * This is the test class for {@link Lesk}.
+ *
+ * The scope of this test is to make sure that the Lesk disambiguator code can be
+ * executed. This test can not detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
+ */
public class LeskTester {
- @Test
- public static void main(String[] args) {
+ // TODO write more tests
- Lesk lesk = new Lesk();
- LeskParameters params = new LeskParameters();
- params.setLeskType(LESK_TYPE.LESK_EXT);
- boolean a[] = { true, true, true, true, true, true, true, true, true, true };
- params.setFeatures(a);
- lesk.setParams(params);
- String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src\\test\\resources\\models\\";
+
+ static Lesk lesk;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUp() {
+
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- String test1 = "I went to the bank to deposit money.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- int targetWordIndex1 = 5;
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
List<String> tempLemmas1 = new ArrayList<String>();
for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
}
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- String[] results1 = lesk.disambiguate(sentence1, tags1, lemmas1,
- targetWordIndex1);
- WSDHelper.print(results1);
- WSDHelper.printResults(lesk, results1);
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- WSDHelper.print("----------------------------------------");
-
- String test2 = "it was a strong argument that his hypothesis was true";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- int targetWordIndex2 = 4;
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
}
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- String[] results2 = lesk.disambiguate(sentence2, tags2, lemmas2,
- targetWordIndex2);
- WSDHelper.print(results2);
- WSDHelper.printResults(lesk, results2);
- WSDHelper.print("----------------------------------------");
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- int targetWordIndex3 = 3;
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
List<String> tempLemmas3 = new ArrayList<String>();
for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
}
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
- String[] results3 = lesk.disambiguate(sentence3, tags3, lemmas3,
- targetWordIndex3);
- WSDHelper.print(results3);
- WSDHelper.printResults(lesk, results3);
- WSDHelper.print("----------------------------------------");
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ lesk = new Lesk();
+
+ LeskParameters params = new LeskParameters();
+ params.setLeskType(LESK_TYPE.LESK_EXT);
+ boolean a[] = { true, true, true, true, true, true, true, true, true,
+ true };
+ params.setFeatures(a);
+ lesk.setParams(params);
+ }
+
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = lesk.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = lesk.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 3, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = lesk.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
}
}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index f74faad..a675268 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -19,96 +19,128 @@
package opennlp.tools.disambiguator;
+import static org.junit.Assert.assertEquals;
+
import java.util.ArrayList;
import java.util.List;
+import org.junit.BeforeClass;
+import org.junit.Test;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.util.Span;
/**
- * This is a typical example of how to call the disambiguation function in the
- * MFS class.
+ * This is the test class for {@link MFS}.
+ *
+ * The scope of this test is to make sure that the MFS disambiguator code can be
+ * executed. This test can not detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
*/
public class MFSTester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
- public static void main(String[] args) {
- String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src\\test\\resources\\models\\";
+
+ static MFS mfs;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- MFS mfs = new MFS();
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- /**
- * This is how to make the context for one-word-disambiguation using IMS
- */
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
List<String> tempLemmas1 = new ArrayList<String>();
for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
}
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- // output
- String[] senses1 = mfs.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for disambiguation of span of words
- */
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
List<String> tempLemmas2 = new ArrayList<String>();
for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
}
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- // output
- List<String[]> senses2 = mfs.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for all-words-disambiguation
- */
- String test3 = "The summer is almost over and I have not been to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
List<String> tempLemmas3 = new ArrayList<String>();
for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
}
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
- // output
- List<String[]> senses3 = mfs.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- String[] senses = senses3.get(i);
- System.out.print(lemmas3[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
+ mfs = new MFS();
}
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = mfs.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = mfs.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = mfs.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
new file mode 100644
index 0000000..d6f55a6
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.oscc.OSCCFactory;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCModel;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link OSCCME}.
+ *
+ * The scope of this test is to make sure that the OSCC disambiguator code can
+ * be executed. This test can not detect mistakes which lead to incorrect
+ * feature generation or other mistakes which decrease the disambiguation
+ * performance of the disambiguator.
+ *
+ * In this test the {@link OSCCME} is trained with Semcor and then the computed
+ * model is used to predict sentences from the training sentences.
+ */
+public class OSCCMETester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
+
+ static String modelsDir = "src\\test\\resources\\models\\";
+ static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
+
+ static OSCCParameters OSCCParams;
+ static OSCCME oscc;
+ static OSCCFactory osccFactory;
+ static OSCCModel model;
+
+ static String test = "please.v";
+ static File outFile;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables, train the model, and read it back from disk
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
+ List<String> tempLemmas1 = new ArrayList<String>();
+ for (int i = 0; i < sentence1.length; i++) {
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+ }
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+ List<String> tempLemmas2 = new ArrayList<String>();
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+ }
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+ }
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ OSCCParams = new OSCCParameters("");
+ OSCCParams.setTrainingDataDirectory(trainingDataDirectory);
+ osccFactory = new OSCCFactory();
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+ OSCCModel writeModel = null;
+ /*
+ * Tests training the disambiguator. We test both writing and reading a model
+ * file trained by Semcor.
+ */
+
+ try {
+ writeModel = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
+ osccFactory);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + test);
+ outFile = new File(
+ OSCCParams.getTrainingDataDirectory() + test + ".oscc.model");
+ model = new OSCCModel(outFile);
+ assertNotNull("Checking the read model", model);
+ oscc = new OSCCME(model, OSCCParams);
+ assertNotNull("Checking the disambiguator", oscc);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = oscc.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = oscc.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = oscc.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
deleted file mode 100644
index ec6377d..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.oscc.OSCCFactory;
-import opennlp.tools.disambiguator.oscc.OSCCME;
-import opennlp.tools.disambiguator.oscc.OSCCModel;
-import opennlp.tools.disambiguator.oscc.OSCCParameters;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-
-public class OSCCTester {
-
- public static void main(String[] args) {
-
- SemcorReaderExtended sr = new SemcorReaderExtended();
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- String test = "write.v";
- TrainingParameters trainingParams = new TrainingParameters();
- OSCCParameters OSCCParams = new OSCCParameters("");
- OSCCFactory OSCCFactory = new OSCCFactory();
-
- ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
-
- OSCCModel model = null;
- OSCCModel readModel = null;
- try {
- model = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
- OSCCFactory);
- model.writeModel(test);
- File outFile = new File(test + ".OSCC.model");
- readModel = new OSCCModel(outFile);
-
- } catch (IOException e1) {
- // TODO Auto-generated catch block
- e1.printStackTrace();
- }
- OSCCME OSCC = new OSCCME(readModel, OSCCParams);
-
- /**
- * This is how to make the context for one-word-disambiguation using OSCC
- */
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
- }
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- // output
- String[] senses1 = OSCC.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for disambiguation of span of words
- */
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
- }
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
-
- // output
- List<String[]> senses2 = OSCC.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
- }
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
index 3adcd7d..d657f56 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
@@ -1,36 +1,40 @@
package opennlp.tools.disambiguator;
+import java.util.ArrayList;
+import java.util.List;
+import opennlp.tools.disambiguator.ims.IMSME;
+import opennlp.tools.disambiguator.ims.IMSParameters;
public class Tester {
public static void main(String[] args) {
-//
-// String modelsDir = "src\\test\\resources\\models\\";
-// WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
-// WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
-// WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-//
-// IMSME ims = new IMSME();
-//
-// String test3 = "The summer is almost over and I haven't been to the beach even once";
-// String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-// String[] tags3 = WSDHelper.getTagger().tag(sentence3);
-// List<String> tempLemmas3 = new ArrayList<String>();
-// for (int i = 0; i < sentence3.length; i++) {
-// String lemma = WSDHelper.getLemmatizer()
-// .lemmatize(sentence3[i], tags3[i]);
-// tempLemmas3.add(lemma);
-// }
-// String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-//
-// // output
-// List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
-// for (int i = 0; i < sentence3.length; i++) {
-// System.out.print(sentence3[i] + " : ");
-// WSDHelper.printResults(ims, senses3.get(i));
-// WSDHelper.print("----------");
-// }
+
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ IMSME ims = new IMSME(new IMSParameters("\\"));
+
+ String test3 = "The summer is almost over and I haven't been to the beach even once";
+ String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+ String[] tags3 = WSDHelper.getTagger().tag(sentence3);
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i],
+ tags3[i]);
+ tempLemmas3.add(lemma);
+ }
+ String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ // output
+ List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
+ for (int i = 0; i < sentence3.length; i++) {
+ System.out.print(sentence3[i] + " : ");
+ WSDHelper.printResults(ims, senses3.get(i));
+ WSDHelper.print("----------");
+ }
}
}
\ No newline at end of file