OPENNLP-843 - grouped the two supervised techniques into a common one with different context generators, the default context generator is from the IMS approach, updated the unit tests, need to remove the useless classes.
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
index af2d31d..dd50415 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
@@ -23,8 +23,6 @@
import java.util.Arrays;
import java.util.HashMap;
-import opennlp.tools.disambiguator.ims.WTDIMS;
-
/**
* Class for the extraction of features for the different Supervised
* Disambiguation approaches.<br>
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
index 7113f49..c48d950 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
@@ -42,7 +42,7 @@
return windowTags;
}
- public String[] extractSurroundingWords(int index, String[] toks,
+ public String[] extractSurroundingContext(int index, String[] toks,
String[] lemmas, int windowSize) {
// TODO consider the windowSize
@@ -117,7 +117,7 @@
HashSet<String> surroundingWords = new HashSet<>();
surroundingWords.addAll(Arrays
- .asList(extractSurroundingWords(index, tokens, lemmas, windowSize)));
+ .asList(extractSurroundingContext(index, tokens, lemmas, windowSize)));
String[] localCollocations = extractLocalCollocations(index, tokens, ngram);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java
index 6138bde..4ddfb13 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java
@@ -53,8 +53,8 @@
/**
* Initializes the WSDParameters object and sets the input parameters
- *
- * @param Input
+ *
+ * @param params
* Parameters
* @throws InvalidParameterException
*/
@@ -65,8 +65,8 @@
/**
* If the parameters are null set the default ones, else only set them if they
* valid. Invalid parameters will return a exception
- *
- * @param Input
+ *
+ * @param params
* parameters
* @throws InvalidParameterException
*/
@@ -75,7 +75,7 @@
if (params == null) {
this.params = new LeskParameters();
} else {
- if (params.isValid()) {
+ if (params.areValid()) {
this.params = (LeskParameters) params;
} else {
throw new InvalidParameterException("wrong params");
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java
index 47ac1d3..0e61672 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java
@@ -148,7 +148,7 @@
*
* @see opennlp.tools.disambiguator.WSDParameters#isValid()
*/
- public boolean isValid() {
+ public boolean areValid() {
switch (this.leskType) {
case LESK_BASIC:
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
index 42b812f..719fad8 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
@@ -30,7 +30,7 @@
*/
public class OSCCWSDContextGenerator implements WSDContextGenerator {
- public String[] extractSurroundingContextClusters(int index, String[] toks,
+ public String[] extractSurroundingContext(int index, String[] toks,
String[] tags, String[] lemmas, int windowSize) {
// TODO consider windowSize
@@ -78,7 +78,7 @@
HashSet<String> surroundingContextClusters = new HashSet<>();
surroundingContextClusters.addAll(Arrays.asList(
- extractSurroundingContextClusters(index, toks, tags, lemmas,
+ extractSurroundingContext(index, toks, tags, lemmas,
windowSize)));
String[] serializedFeatures = new String[model.size()];
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java
new file mode 100644
index 0000000..e65bccb
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+
+/**
+ * Default parameters used by the supervised disambiguation approach: the language
+ * code, the window size, the n-gram size and the training data directory.
+ */
+public class WSDDefaultParameters extends WSDParameters {
+
+ protected String languageCode;
+ protected int windowSize;
+ protected int ngram;
+
+ protected String trainingDataDirectory;
+
+ protected static final int DFLT_WIN_SIZE = 3;
+ protected static final int DFLT_NGRAM = 2;
+ protected static final String DFLT_LANG_CODE = "En";
+ protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
+
+ /**
+ * Creates the parameters with the default language code ({@code "En"}) and
+ * creates the training data directory on disk when it does not exist yet.
+ *
+ * @param windowSize the size of the window used to extract the surrounding-words features
+ * @param ngram the number of words used to extract the local-collocation features
+ * @param senseSource the source of the senses, e.g. {@code SenseSource.WORDNET}
+ * @param trainingDataDirectory the directory of the training data; WSDisambiguatorME
+ * appends the word tag to it directly, so it should end with a file separator
+ */
+ public WSDDefaultParameters(int windowSize, int ngram,
+ SenseSource senseSource, String trainingDataDirectory) {
+
+ this.languageCode = DFLT_LANG_CODE;
+ this.windowSize = windowSize;
+ this.ngram = ngram;
+ this.senseSource = senseSource;
+ this.trainingDataDirectory = trainingDataDirectory;
+
+ File folder = new File(trainingDataDirectory);
+ if (!folder.exists())
+ folder.mkdirs(); // NOTE(review): the result is ignored, a failed creation goes unnoticed
+ }
+
+ public WSDDefaultParameters(String trainingDataDirectory) { // uses the DFLT_* values
+ this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, trainingDataDirectory);
+ }
+
+ public String getLanguageCode() {
+ return languageCode;
+ }
+
+ public void setLanguageCode(String languageCode) {
+ this.languageCode = languageCode;
+ }
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
+
+ public int getNgram() {
+ return ngram;
+ }
+
+ public void setNgram(int ngram) {
+ this.ngram = ngram;
+ }
+
+ public String getTrainingDataDirectory() {
+ return trainingDataDirectory;
+ }
+
+ public void setTrainingDataDirectory(String trainingDataDirectory) {
+ this.trainingDataDirectory = trainingDataDirectory; // unlike the constructor, creates no directory
+ }
+
+ @Override public boolean areValid() {
+ // TODO recheck this pattern: nothing is validated yet, every instance is reported as valid
+ return true;
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index 47f4168..4b5b329 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
@@ -23,7 +23,6 @@
* Disambiguation Parameters
*
*/
-// TODO make default params for supervised approaches
public abstract class WSDParameters {
public static enum SenseSource {
@@ -51,6 +50,6 @@
/*
* @return checks if the parameters are valid or not
*/
- public abstract boolean isValid();
+ public abstract boolean areValid();
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index bd98872..f7d516a 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
@@ -22,8 +22,6 @@
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSParameters;
import opennlp.tools.util.Span;
/**
@@ -38,14 +36,9 @@
*
* Otherwise for multiple words, you can set a word span instead of simply one
* index. For the moment the source of sense definitions is from WordNet. *
- * Please see {@link Lesk} for an un-supervised approach. Please see {@link IMS}
- * {@link OSCC} for a supervised approach.
*
* Examples on how to use each approach are provided in the test section.
- *
- * @see Lesk
- * @see IMS
- * @see OSCC
+ *
*/
public abstract class WSDisambiguator {
@@ -59,8 +52,7 @@
}
/**
- * @param the
- * disambiguation implementation specific parameters.
+ * @param params disambiguation implementation specific parameters.
* @throws InvalidParameterException
*/
public void setParams(WSDParameters params) throws InvalidParameterException {
@@ -85,8 +77,8 @@
*
* @param tokenizedContext
* @param tokenTags
+ * @param lemmas
* @param ambiguousTokenIndexSpan
- * @param ambiguousTokenLemma
* @return result as an array of WordNet IDs
*/
public List<String> disambiguate(String[] tokenizedContext,
@@ -147,7 +139,7 @@
} else {
if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
- String sense = IMSParameters.SenseSource.WSDHELPER.name() + " "
+ String sense = WSDParameters.SenseSource.WSDHELPER.name() + " "
+ WSDHelper.getNonRelevWordsDef(tokenTags[i]);
senses.add(sense);
} else {
@@ -161,7 +153,7 @@
}
/**
- * @param WSDSample
+ * @param sample
* @return result as an array of WordNet IDs
*/
public abstract String disambiguate(WSDSample sample);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
new file mode 100644
index 0000000..096b788
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.TrainingParameters;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class WSDisambiguatorME extends WSDisambiguator { // supervised disambiguator backed by a MaxentModel
+
+ protected WSDModel model;
+
+ protected static WSDContextGenerator cg = new IMSWSDContextGenerator(); // static: shared by all instances
+
+ public WSDisambiguatorME(WSDParameters params) {
+ this.params = params;
+ }
+
+ public WSDisambiguatorME(WSDModel model, WSDParameters params) {
+ this.model = model;
+ this.params = params;
+ }
+
+ public WSDModel getModel() {
+ return model;
+ }
+
+ public void setModel(WSDModel model) {
+ this.model = model;
+ }
+
+ public void setParameters(WSDParameters parameters) {
+ this.params = parameters; // NOTE(review): assigns the field directly; presumably bypasses setParams checks — confirm
+ }
+
+ public static WSDModel train(String lang, ObjectStream<WSDSample> samples,
+ TrainingParameters mlParams, WSDParameters params) throws IOException {
+
+ ArrayList<String> surroundingContext = buildSurroundingContext(samples,
+ ((WSDDefaultParameters) params).getWindowSize()); // NOTE(review): unchecked cast, fails for other WSDParameters subclasses
+
+ HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+ MaxentModel meModel = null;
+
+ ArrayList<Event> events = new ArrayList<Event>();
+ ObjectStream<Event> es = null;
+
+ WSDSample sample = samples.read(); // the stream was reset by buildSurroundingContext, so this is the first sample again
+ String wordTag = "";
+ if (sample != null) {
+ wordTag = sample.getTargetWordTag(); // taken from the first sample only; assumes all samples share it — TODO confirm
+ do {
+ String sense = sample.getSenseIDs()[0]; // only the first sense ID is used as the event outcome
+ String[] context = cg
+ .getContext(sample, ((WSDDefaultParameters) params).ngram,
+ ((WSDDefaultParameters) params).windowSize, surroundingContext);
+ Event ev = new Event(sense + "", context);
+ events.add(ev);
+ } while ((sample = samples.read()) != null);
+ }
+
+ es = ObjectStreamUtils.createObjectStream(events);
+ EventTrainer trainer = TrainerFactory
+ .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
+
+ meModel = trainer.train(es);
+
+ return new WSDModel(lang, wordTag,
+ ((WSDDefaultParameters) params).windowSize,
+ ((WSDDefaultParameters) params).ngram, meModel, surroundingContext,
+ manifestInfoEntries);
+ }
+
+ public static ArrayList<String> buildSurroundingContext(
+ ObjectStream<WSDSample> samples, int windowSize) throws IOException {
+ IMSWSDContextGenerator contextGenerator = new IMSWSDContextGenerator();
+ ArrayList<String> surroundingWordsModel = new ArrayList<String>(); // every occurrence is appended, duplicates included
+ WSDSample sample;
+ while ((sample = samples.read()) != null) {
+ String[] words = contextGenerator
+ .extractSurroundingContext(sample.getTargetPosition(),
+ sample.getSentence(), sample.getLemmas(), windowSize);
+
+ if (words.length > 0) {
+ for (String word : words) {
+ surroundingWordsModel.add(word);
+ }
+ }
+ }
+ samples.reset(); // rewind so that the caller can read the samples again
+ return surroundingWordsModel;
+ }
+
+ @Override public String disambiguate(WSDSample sample) {
+ if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
+ String wordTag = sample.getTargetWordTag();
+
+ if (model == null || !model.getWordTag()
+ .equals(sample.getTargetWordTag())) { // no model yet, or it was built for another word tag
+
+ String trainingFile =
+ ((WSDDefaultParameters) this.getParams()).getTrainingDataDirectory()
+ + sample.getTargetWordTag(); // NOTE(review): plain concatenation, the directory must end with a separator
+
+ File file = new File(trainingFile + ".wsd.model");
+ if (file.exists() && !file.isDirectory()) {
+ try {
+ setModel(new WSDModel(file));
+
+ } catch (InvalidFormatException e) {
+ e.printStackTrace(); // NOTE(review): swallowed; the code below then uses a null or stale model
+ } catch (IOException e) {
+ e.printStackTrace(); // NOTE(review): swallowed; the code below then uses a null or stale model
+ }
+
+ String outcome = "";
+
+ String[] context = cg
+ .getContext(sample, ((WSDDefaultParameters) this.params).ngram,
+ ((WSDDefaultParameters) this.params).windowSize,
+ this.model.getContextEntries());
+
+ double[] outcomeProbs = model.getWSDMaxentModel().eval(context);
+ outcome = model.getWSDMaxentModel().getBestOutcome(outcomeProbs);
+
+ if (outcome != null && !outcome.equals("")) {
+
+ return this.getParams().getSenseSource().name() + " " + wordTag
+ .split("\\.")[0] + "%" + outcome;
+
+ } else {
+ MFS mfs = new MFS(); // no usable outcome: fall back to MFS
+ return mfs.disambiguate(wordTag);
+ }
+
+ } else {
+
+ MFS mfs = new MFS(); // no model file for this word tag: fall back to MFS
+ return mfs.disambiguate(wordTag);
+ }
+ } else { // the current model already matches the word tag; same steps as above without the loading
+ String outcome = "";
+
+ String[] context = cg
+ .getContext(sample, ((WSDDefaultParameters) this.params).ngram,
+ ((WSDDefaultParameters) this.params).windowSize,
+ this.model.getContextEntries());
+
+ double[] outcomeProbs = model.getWSDMaxentModel().eval(context);
+ outcome = model.getWSDMaxentModel().getBestOutcome(outcomeProbs);
+
+ if (outcome != null && !outcome.equals("")) {
+
+ return this.getParams().getSenseSource().name() + " " + wordTag
+ .split("\\.")[0] + "%" + outcome;
+ } else {
+
+ MFS mfs = new MFS();
+ return mfs.disambiguate(wordTag);
+ }
+ }
+ } else {
+
+ if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
+ return WSDParameters.SenseSource.WSDHELPER.name() + " " + sample
+ .getTargetTag(); // NOTE(review): returns the tag, WSDisambiguator appends getNonRelevWordsDef(tag) — confirm
+ } else {
+ return null;
+ }
+
+ }
+
+ }
+
+ /**
+ * Disambiguates a single word by wrapping the arguments in a WSDSample
+ *
+ * @param tokenizedContext : the text containing the word to disambiguate
+ * @param tokenTags : the tags corresponding to the context
+ * @param lemmas : the lemmas of ALL the words in the context
+ * @param index : the index of the word to disambiguate
+ * @return the sense selected for the word, as returned by disambiguate(WSDSample)
+ */
+ public String disambiguate(String[] tokenizedContext, String[] tokenTags,
+ String[] lemmas, int index) {
+ return disambiguate(
+ new WSDSample(tokenizedContext, tokenTags, lemmas, index));
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java
new file mode 100644
index 0000000..64a2d41
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import net.sf.extjwnl.data.POS;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+
+public class WTDIMS { // holds a target word's context, its sense labels and its IMS features
+
+ // Attributes related to the context
+ protected String[] sentence;
+ protected String[] posTags;
+ protected String[] lemmas;
+ protected int wordIndex;
+ protected int sense;
+ protected String[] senseIDs;
+
+ // Attributes related to IMS features
+ protected String[] posOfSurroundingWords;
+ protected String[] surroundingWords;
+ protected String[] localCollocations;
+ protected String[] features;
+
+ public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+ int wordIndex) {
+ this.sentence = sentence;
+ this.posTags = posTags;
+ this.wordIndex = wordIndex;
+ this.lemmas = lemmas;
+ }
+
+ public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+ int wordIndex, String[] senseIDs) {
+ this.sentence = sentence;
+ this.posTags = posTags;
+ this.wordIndex = wordIndex;
+ this.lemmas = lemmas;
+ this.senseIDs = senseIDs;
+
+ }
+
+ public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+ String word, String[] senseIDs) {
+ super(); // redundant: the class declares no superclass
+
+ this.sentence = sentence;
+ this.posTags = posTags;
+ this.lemmas = lemmas;
+
+ for (int i = 0; i < sentence.length; i++) { // the first exact match wins
+ if (word.equals(sentence[i])) {
+ this.wordIndex = i;
+ break;
+ }
+ } // NOTE(review): wordIndex silently stays 0 when the word is not in the sentence
+
+ this.senseIDs = senseIDs;
+
+ }
+
+ public WTDIMS(WSDSample sample) { // copies the references held by the sample
+ this.sentence = sample.getSentence();
+ this.posTags = sample.getTags();
+ this.lemmas = sample.getLemmas();
+ this.wordIndex = sample.getTargetPosition();
+ this.senseIDs = sample.getSenseIDs();
+
+ }
+
+ public String[] getSentence() {
+ return sentence;
+ }
+
+ public void setSentence(String[] sentence) {
+ this.sentence = sentence;
+ }
+
+ public String[] getPosTags() {
+ return posTags;
+ }
+
+ public void setPosTags(String[] posTags) {
+ this.posTags = posTags;
+ }
+
+ public int getWordIndex() {
+ return wordIndex;
+ }
+
+ public void setWordIndex(int wordIndex) {
+ this.wordIndex = wordIndex;
+ }
+
+ public String[] getLemmas() {
+ return lemmas;
+ }
+
+ public void setLemmas(String[] lemmas) {
+ this.lemmas = lemmas;
+ }
+
+ public int getSense() {
+ return sense;
+ }
+
+ public void setSense(int sense) {
+ this.sense = sense;
+ }
+
+ public String[] getSenseIDs() {
+ return senseIDs;
+ }
+
+ public void setSenseIDs(String[] senseIDs) {
+ this.senseIDs = senseIDs;
+ }
+
+ public String getWord() {
+ return this.getSentence()[this.getWordIndex()]; // the surface token at the target index
+ }
+
+ public String getWordTag() {
+
+ String wordBaseForm = this.getLemmas()[this.getWordIndex()]; // the lemma of the target word
+
+ String ref = ""; // stays empty when the POS is unknown or none of verb, noun, adjective, adverb
+
+ if ((WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) != null)) {
+ if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+ .equals(POS.VERB)) {
+ ref = wordBaseForm + ".v";
+ } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+ .equals(POS.NOUN)) {
+ ref = wordBaseForm + ".n";
+ } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+ .equals(POS.ADJECTIVE)) {
+ ref = wordBaseForm + ".a";
+ } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+ .equals(POS.ADVERB)) {
+ ref = wordBaseForm + ".r";
+ }
+ }
+
+ return ref;
+ }
+
+ public String[] getPosOfSurroundingWords() {
+ return posOfSurroundingWords;
+ }
+
+ public void setPosOfSurroundingWords(String[] posOfSurroundingWords) {
+ this.posOfSurroundingWords = posOfSurroundingWords;
+ }
+
+ public String[] getSurroundingWords() {
+ return surroundingWords;
+ }
+
+ public void setSurroundingWords(String[] surroundingWords) {
+ this.surroundingWords = surroundingWords;
+ }
+
+ public String[] getLocalCollocations() {
+ return localCollocations;
+ }
+
+ public void setLocalCollocations(String[] localCollocations) {
+ this.localCollocations = localCollocations;
+ }
+
+ public String[] getFeatures() {
+ return this.features;
+ }
+
+ public void setFeatures(String[] features) {
+ this.features = features;
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
deleted file mode 100644
index 2b3fbf7..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.datareader;
-
-public class ClusterMembership {
-
- public int clusterID;
- public double centroidSimilarity;
- public String phrase;
- public String[] phraseWords;
-
- public ClusterMembership(int clusterID, double centroidSimilarity) {
- this.clusterID = clusterID;
- this.centroidSimilarity = centroidSimilarity;
- }
-
- public ClusterMembership() {
- this(0, 0.0);
- }
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
deleted file mode 100644
index e8b384e..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.datareader;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-public class ClustersReader {
-
- public static String path = "src\\test\\resources\\phraseclusters\\";
- private static HashMap<String, ArrayList<ClusterMembership>> map = new HashMap<String, ArrayList<ClusterMembership>>();
-
- public void readFile(String url) {
-
- File file = new File(url);
-
- try (BufferedReader clusterList = new BufferedReader(new FileReader(file))) {
-
- String line;
-
- // Read the file
- while ((line = clusterList.readLine()) != null) {
-
- String[] parts = line.split("\\t");
- String phraseKey = parts[0];
- String[] phraseWords = phraseKey.split("\\s");
-
- System.out.println(phraseKey);
-
- ArrayList<ClusterMembership> memberships = new ArrayList<ClusterMembership>();
-
- for (int i = 1; i < parts.length; i += 2) {
- ClusterMembership membership = new ClusterMembership(
- Integer.parseInt(parts[i]), Double.parseDouble(parts[i + 1]));
- membership.phrase = phraseKey;
- membership.phraseWords = phraseWords;
-
- memberships.add(membership);
- }
- map.put(phraseKey, memberships);
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public boolean getNgramClusters(String word) {
-
- File folder = new File(path);
- if (folder.isDirectory()) {
- for (File file : folder.listFiles()) {
- readFile(file.getAbsolutePath());
- }
-
- } else {
- return false;
- }
-
- return true;
-
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
index c3ddd79..e0decf2 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
@@ -64,9 +64,9 @@
private static final String ELEMENT_PUNCTUATION = "punc";
- private static String semcorDirectory = "src\\test\\resources\\semcor3.0\\";
+ private static String semcorDirectory = "src/test/resources/semcor3.0/";
private static String[] folders = { "brown1", "brown2", "brownv" };
- private static String tagfiles = "\\tagfiles\\";
+ private static String tagfiles = "/tagfiles/";
public static String getSemcorDirectory() {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
index 40884aa..9dfbb94 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
@@ -47,7 +47,7 @@
*/
public class SensevalReader {
- protected String sensevalDirectory = "src\\test\\resources\\senseval3\\";
+ protected String sensevalDirectory = "src/test/resources/senseval3/";
protected String data = sensevalDirectory + "EnglishLS.train";
protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
@@ -72,7 +72,7 @@
/**
* This extracts the equivalent senses. This serves in the case of the
* coarse-grained disambiguation
- *
+ *
* @param sensemapFile
* the file containing the equivalent senses, each set of equivalent
* senses per line
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
index 2d04d8d..250b962 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
@@ -21,6 +21,7 @@
import opennlp.tools.disambiguator.WSDHelper;
+// TODO extend Word from Wordnet
public class Word {
public static enum Type {
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
index 737b8fa..822e9c1 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -32,7 +32,7 @@
@Test
public static void main(String[] args) {
WSDHelper.print("Evaluation Started");
- String modelsDir = "src\\test\\resources\\models\\";
+ String modelsDir = "src/test/resources/models/";
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 2aa3334..0ef0091 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -41,7 +41,7 @@
public class LeskTester {
// TODO write more tests
- static String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src/test/resources/models/";
static Lesk lesk;
@@ -131,7 +131,7 @@
List<String> senses = lesk.disambiguate(sentence2, tags2, lemmas2, span);
assertEquals("Check number of returned words", 5, senses.size());
- assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01:: 4.8",
+ assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01:: 3.8",
senses.get(0));
assertEquals("Check 'radioactive' sense ID",
"WORDNET radioactive%3:00:00:: 6.0", senses.get(1));
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
index 0195cae..f7dfc68 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
@@ -33,7 +33,7 @@
@Test
public static void main(String[] args) {
WSDHelper.print("Evaluation Started");
- String modelsDir = "src\\test\\resources\\models\\";
+ String modelsDir = "src/test/resources/models/";
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index 025261e..c6ca4b0 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -41,7 +41,7 @@
// TODO write more tests
// TODO modify when we fix the parameter model
- static String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src/test/resources/models/";
static MFS mfs;
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
deleted file mode 100644
index 16172f8..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-
-public class Tester {
-
- public static void main(String[] args) {
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- IMSME ims = new IMSME(new IMSParameters("\\"));
-
- String test3 = "The summer is almost over and I haven't been to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i],
- tags3[i]);
- tempLemmas3.add(lemma);
- }
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- // output
- List<String> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- System.out.print(sentence3[i] + " : ");
- WSDHelper.printResults(ims, senses3.get(i));
- WSDHelper.print("----------");
- }
-
- }
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
new file mode 100644
index 0000000..3b43d99
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+ // TODO improve the tests, improve parameters
+public class WSDEvaluatorTest {
+
+ static SensevalReader seReader;
+
+ static String modelsDir = "src/test/resources/models/";
+ static String trainingDataDirectory = "src/test/resources/supervised/models/";
+
+ static WSDDefaultParameters params = new WSDDefaultParameters("");
+ static WSDisambiguatorME wsdME;
+ static WSDModel model;
+
+ static ArrayList<String> testWords;
+
+ /*
+ * Loads the tokenizer, lemmatizer and tagger, then trains one model per word.
+ */
+ public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ seReader = new SensevalReader();
+ testWords = seReader.getSensevalWords();
+ params = new WSDDefaultParameters("");
+ params.setTrainingDataDirectory(trainingDataDirectory);
+
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+
+ WSDHelper.print("Training Started");
+ for (String word : testWords) {
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+
+ ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+ if (instances != null && instances.size() > 1) {
+ WSDHelper.print("------------------" + word + "------------------");
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(word);
+
+ WSDModel writeModel = null;
+ /*
+ * Train on the Semcor stream, write the model file and read it back.
+ * wsdME and model are reassigned on every pass; the last word's values remain.
+ */
+ File outFile;
+ try {
+ writeModel = WSDisambiguatorME
+ .train("en", sampleStream, trainingParams, params);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(params.getTrainingDataDirectory() + word);
+ outFile = new File(
+ params.getTrainingDataDirectory() + word + ".wsd.model");
+ model = new WSDModel(outFile);
+ assertNotNull("Checking the read model", model);
+ wsdME = new WSDisambiguatorME(model, params);
+ assertNotNull("Checking the disambiguator", wsdME);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+ }
+ }
+ }
+
+ public static void disambiguationEval() {
+ // Scores every non-verb Senseval word with a fresh WSDEvaluator and prints it.
+ WSDHelper.print("Evaluation Started");
+ // NOTE(review): all words share the one static wsdME - confirm that is intended.
+ for (String word : testWords) {
+ WSDEvaluator evaluator = new WSDEvaluator(wsdME);
+
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+ // needs at least two instances; samples whose first sense ID is "null" are skipped
+ ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+ if (instances != null && instances.size() > 1) {
+ WSDHelper.print("------------------" + word + "------------------");
+ for (WSDSample instance : instances) {
+ if (instance.getSenseIDs() != null && !instance.getSenseIDs()[0]
+ .equals("null")) {
+ evaluator.evaluateSample(instance);
+ }
+ }
+ WSDHelper.print(evaluator.toString());
+ } else {
+ WSDHelper.print("null instances");
+ }
+ }
+
+ }
+ }
+ // Manual entry point: this class declares no @Test methods, so it only runs from here.
+ public static void main(String[] args) {
+ setUpAndTraining();
+ disambiguationEval();
+ }
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
new file mode 100644
index 0000000..8470928
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.ObjectStream;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link WSDisambiguatorME}.
+ * <p>
+ * The scope of this test is to make sure that the WSDisambiguatorME code can be
+ * executed. This test can not detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of
+ * the disambiguator.
+ * <p>
+ * In this test the {@link WSDisambiguatorME} is trained with Semcor
+ * and then the computed model is used to predict sentences
+ * from the training sentences.
+ */
+
+public class WSDTester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
+
+ static String modelsDir = "src/test/resources/models/";
+ static String trainingDataDirectory = "src/test/resources/supervised/models/";
+
+ static WSDDefaultParameters params;
+ static WSDisambiguatorME wsdME;
+ static WSDModel model;
+
+ static String test = "please.v";
+ static File outFile;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Loads the NLP models, prepares the three test sentences and trains the model.
+ */
+ @BeforeClass public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+ // tokenize, tag and lemmatize each of the three test sentences
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
+ List<String> tempLemmas1 = new ArrayList<String>();
+ for (int i = 0; i < sentence1.length; i++) {
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+ }
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+ List<String> tempLemmas2 = new ArrayList<String>();
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+ }
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+ }
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+ // train from the Semcor data stream of the word held in the static field test
+ params = new WSDDefaultParameters("");
+ params.setTrainingDataDirectory(trainingDataDirectory);
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+ WSDModel writeModel = null;
+ /*
+ * Tests training the disambiguator. We test both writing and reading a model
+ * file trained by Semcor.
+ */
+
+ try {
+ writeModel = WSDisambiguatorME
+ .train("en", sampleStream, trainingParams, params);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(params.getTrainingDataDirectory() + test);
+ outFile = new File(
+ params.getTrainingDataDirectory() + test + ".wsd.model");
+ model = new WSDModel(outFile);
+ assertNotNull("Checking the read model", model);
+ wsdME = new WSDisambiguatorME(model, params);
+ assertNotNull("Checking the disambiguator", wsdME);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+
+ /*
+ * Tests disambiguating only one word: "please", passed as target index 8.
+ */
+ @Test public void testOneWordDisambiguation() {
+ String sense = wsdME.disambiguate(sentence1, tags1, lemmas1, 8);
+ assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words, as well as words that do not need disambiguation,
+ * such as determiners.
+ */
+ @Test public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String> senses = wsdME.disambiguate(sentence2, tags2, lemmas2, span);
+ // five results are expected; only the first four are checked individually
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
+ senses.get(0));
+ assertEquals("Check 'radioactive' sense ID",
+ "WORDNET radioactive%3:00:00::", senses.get(1));
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2));
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3));
+ }
+
+ /*
+ * Tests disambiguating all the words of sentence3 (15 results expected).
+ */
+ @Test public void testAllWordsDisambiguation() {
+ List<String> senses = wsdME.disambiguate(sentence3, tags3, lemmas3);
+ // index 6 is expected to be tagged as a personal pronoun (presumably "I")
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6));
+ }
+
+}