OPENNLP-843 - Grouped the two supervised techniques into a common one with different context generators; the default context generator is the one from the IMS approach. Updated the unit tests. TODO: the now-useless classes still need to be removed.

commit: 0f08de2f24ab14c52160dfbabcbc7c76852013b2 [log] [tgz]
author: Anthony Beylerian <beylerian@apache.org> Tue Jun 07 09:23:03 2016 +0000
committer: Anthony Beylerian <beylerian@apache.org> Tue Jun 07 09:23:03 2016 +0000
tree: f737d2b15120a380a21c2a1ec79a80e855bc82cd
parent: 7009f233be93efc73127a7e72cde3bb669d494d2 [diff]
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
index af2d31d..dd50415 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java

@@ -23,8 +23,6 @@
 import java.util.Arrays;
 import java.util.HashMap;
 
-import opennlp.tools.disambiguator.ims.WTDIMS;
-
 /**
  * Class for the extraction of features for the different Supervised
  * Disambiguation approaches.<br>

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
index 7113f49..c48d950 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java

@@ -42,7 +42,7 @@
     return windowTags;

   }

 

-  public String[] extractSurroundingWords(int index, String[] toks,

+  public String[] extractSurroundingContext(int index, String[] toks,

     String[] lemmas, int windowSize) {

 

     // TODO consider the windowSize

@@ -117,7 +117,7 @@
 

     HashSet<String> surroundingWords = new HashSet<>();

     surroundingWords.addAll(Arrays

-      .asList(extractSurroundingWords(index, tokens, lemmas, windowSize)));

+      .asList(extractSurroundingContext(index, tokens, lemmas, windowSize)));

 

     String[] localCollocations = extractLocalCollocations(index, tokens, ngram);

 


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java
index 6138bde..4ddfb13 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Lesk.java

@@ -53,8 +53,8 @@
 
   /**
    * Initializes the WSDParameters object and sets the input parameters
-   * 
-   * @param Input
+   *
+   * @param params
    *          Parameters
    * @throws InvalidParameterException
    */
@@ -65,8 +65,8 @@
   /**
    * If the parameters are null set the default ones, else only set them if they
    * valid. Invalid parameters will return a exception
-   * 
-   * @param Input
+   *
+   * @param params
    *          parameters
    * @throws InvalidParameterException
    */
@@ -75,7 +75,7 @@
     if (params == null) {
       this.params = new LeskParameters();
     } else {
-      if (params.isValid()) {
+      if (params.areValid()) {
         this.params = (LeskParameters) params;
       } else {
         throw new InvalidParameterException("wrong params");

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java
index 47ac1d3..0e61672 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/LeskParameters.java

@@ -148,7 +148,7 @@
    * 
    * @see opennlp.tools.disambiguator.WSDParameters#isValid()
    */
-  public boolean isValid() {
+  public boolean areValid() {
 
     switch (this.leskType) {
     case LESK_BASIC:

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
index 42b812f..719fad8 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java

@@ -30,7 +30,7 @@
  */

 public class OSCCWSDContextGenerator implements WSDContextGenerator {

 

-  public String[] extractSurroundingContextClusters(int index, String[] toks,

+  public String[] extractSurroundingContext(int index, String[] toks,

     String[] tags, String[] lemmas, int windowSize) {

 

     // TODO consider windowSize

@@ -78,7 +78,7 @@
 

     HashSet<String> surroundingContextClusters = new HashSet<>();

     surroundingContextClusters.addAll(Arrays.asList(

-      extractSurroundingContextClusters(index, toks, tags, lemmas,

+      extractSurroundingContext(index, toks, tags, lemmas,

         windowSize)));

 

     String[] serializedFeatures = new String[model.size()];


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java
new file mode 100644
index 0000000..e65bccb
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDDefaultParameters.java

@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+
+/**
+ * This class contains the parameters for the IMS approach as well as the
+ * directories containing the files used
+ */
+public class WSDDefaultParameters extends WSDParameters {
+
+  protected String languageCode;
+  protected int windowSize;
+  protected int ngram;
+
+  protected String trainingDataDirectory;
+
+  protected static final int DFLT_WIN_SIZE = 3;
+  protected static final int DFLT_NGRAM = 2;
+  protected static final String DFLT_LANG_CODE = "En";
+  protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
+
+  /**
+   * Creates parameters for the given window size, n-gram size, sense source
+   * and training data directory. The default language used is <i>English</i>.
+   *
+   * @param windowSize  the size of the window used to extract Surrounding Words features
+   * @param ngram       the number of words used to extract Local Collocations features
+   * @param senseSource the source of the training data
+   * @param trainingDataDirectory the training data directory; it is created
+   *                    if it does not already exist
+   */
+  public WSDDefaultParameters(int windowSize, int ngram,
+    SenseSource senseSource, String trainingDataDirectory) {
+
+    this.languageCode = DFLT_LANG_CODE;
+    this.windowSize = windowSize;
+    this.ngram = ngram;
+    this.senseSource = senseSource;
+    this.trainingDataDirectory = trainingDataDirectory;
+
+    File folder = new File(trainingDataDirectory);
+    if (!folder.exists())
+      folder.mkdirs();
+  }
+
+  public WSDDefaultParameters(String trainingDataDirectory) {
+    this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, trainingDataDirectory);
+  }
+
+  public String getLanguageCode() {
+    return languageCode;
+  }
+
+  public void setLanguageCode(String languageCode) {
+    this.languageCode = languageCode;
+  }
+
+  public int getWindowSize() {
+    return windowSize;
+  }
+
+  public void setWindowSize(int windowSize) {
+    this.windowSize = windowSize;
+  }
+
+  public int getNgram() {
+    return ngram;
+  }
+
+  public void setNgram(int ngram) {
+    this.ngram = ngram;
+  }
+
+  public String getTrainingDataDirectory() {
+    return trainingDataDirectory;
+  }
+
+  public void setTrainingDataDirectory(String trainingDataDirectory) {
+    this.trainingDataDirectory = trainingDataDirectory;
+  }
+
+  @Override public boolean areValid() {
+    // TODO recheck this pattern
+    return true;
+  }
+
+}

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index 47f4168..4b5b329 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java

@@ -23,7 +23,6 @@
  * Disambiguation Parameters
  *
  */
-// TODO make default params for supervised approaches
 public abstract class WSDParameters {
 
   public static enum SenseSource {
@@ -51,6 +50,6 @@
   /*
    * @return checks if the parameters are valid or not
    */
-  public abstract boolean isValid();
+  public abstract boolean areValid();
 
 }

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index bd98872..f7d516a 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java

@@ -22,8 +22,6 @@
 import java.security.InvalidParameterException;
 import java.util.ArrayList;
 import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSParameters;
 import opennlp.tools.util.Span;
 
 /**
@@ -38,14 +36,9 @@
  * 
  * Otherwise for multiple words, you can set a word span instead of simply one
  * index. For the moment the source of sense definitions is from WordNet. *
- * Please see {@link Lesk} for an un-supervised approach. Please see {@link IMS}
- * {@link OSCC} for a supervised approach.
  * 
  * Examples on how to use each approach are provided in the test section.
- * 
- * @see Lesk
- * @see IMS
- * @see OSCC
+ *
  */
 public abstract class WSDisambiguator {
 
@@ -59,8 +52,7 @@
   }
 
   /**
-   * @param the
-   *          disambiguation implementation specific parameters.
+   * @param params disambiguation implementation specific parameters.
    * @throws InvalidParameterException
    */
   public void setParams(WSDParameters params) throws InvalidParameterException {
@@ -85,8 +77,8 @@
    * 
    * @param tokenizedContext
    * @param tokenTags
+   * @param lemmas
    * @param ambiguousTokenIndexSpan
-   * @param ambiguousTokenLemma
    * @return result as an array of WordNet IDs
    */
   public List<String> disambiguate(String[] tokenizedContext,
@@ -147,7 +139,7 @@
       } else {
 
         if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
-          String sense = IMSParameters.SenseSource.WSDHELPER.name() + " "
+          String sense = WSDParameters.SenseSource.WSDHELPER.name() + " "
               + WSDHelper.getNonRelevWordsDef(tokenTags[i]);
           senses.add(sense);
         } else {
@@ -161,7 +153,7 @@
   }
 
   /**
-   * @param WSDSample
+   * @param sample
    * @return result as an array of WordNet IDs
    */
   public abstract String disambiguate(WSDSample sample);

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
new file mode 100644
index 0000000..096b788
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java

@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.TrainingParameters;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class WSDisambiguatorME extends WSDisambiguator {
+
+  protected WSDModel model;
+
+  protected static WSDContextGenerator cg = new IMSWSDContextGenerator();
+
+  public WSDisambiguatorME(WSDParameters params) {
+    this.params = params;
+  }
+
+  public WSDisambiguatorME(WSDModel model, WSDParameters params) {
+    this.model = model;
+    this.params = params;
+  }
+
+  public WSDModel getModel() {
+    return model;
+  }
+
+  public void setModel(WSDModel model) {
+    this.model = model;
+  }
+
+  public void setParameters(WSDParameters parameters) {
+    this.params = parameters;
+  }
+
+  public static WSDModel train(String lang, ObjectStream<WSDSample> samples,
+    TrainingParameters mlParams, WSDParameters params) throws IOException {
+
+    ArrayList<String> surroundingContext = buildSurroundingContext(samples,
+      ((WSDDefaultParameters) params).getWindowSize());
+
+    HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    MaxentModel meModel = null;
+
+    ArrayList<Event> events = new ArrayList<Event>();
+    ObjectStream<Event> es = null;
+
+    WSDSample sample = samples.read();
+    String wordTag = "";
+    if (sample != null) {
+      wordTag = sample.getTargetWordTag();
+      do {
+        String sense = sample.getSenseIDs()[0];
+        String[] context = cg
+          .getContext(sample, ((WSDDefaultParameters) params).ngram,
+            ((WSDDefaultParameters) params).windowSize, surroundingContext);
+        Event ev = new Event(sense + "", context);
+        events.add(ev);
+      } while ((sample = samples.read()) != null);
+    }
+
+    es = ObjectStreamUtils.createObjectStream(events);
+    EventTrainer trainer = TrainerFactory
+      .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
+
+    meModel = trainer.train(es);
+
+    return new WSDModel(lang, wordTag,
+      ((WSDDefaultParameters) params).windowSize,
+      ((WSDDefaultParameters) params).ngram, meModel, surroundingContext,
+      manifestInfoEntries);
+  }
+
+  public static ArrayList<String> buildSurroundingContext(
+    ObjectStream<WSDSample> samples, int windowSize) throws IOException {
+    IMSWSDContextGenerator contextGenerator = new IMSWSDContextGenerator();
+    ArrayList<String> surroundingWordsModel = new ArrayList<String>();
+    WSDSample sample;
+    while ((sample = samples.read()) != null) {
+      String[] words = contextGenerator
+        .extractSurroundingContext(sample.getTargetPosition(),
+          sample.getSentence(), sample.getLemmas(), windowSize);
+
+      if (words.length > 0) {
+        for (String word : words) {
+          surroundingWordsModel.add(word);
+        }
+      }
+    }
+    samples.reset();
+    return surroundingWordsModel;
+  }
+
+  @Override public String disambiguate(WSDSample sample) {
+    if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
+      String wordTag = sample.getTargetWordTag();
+
+      if (model == null || !model.getWordTag()
+        .equals(sample.getTargetWordTag())) {
+
+        String trainingFile =
+          ((WSDDefaultParameters) this.getParams()).getTrainingDataDirectory()
+            + sample.getTargetWordTag();
+
+        File file = new File(trainingFile + ".wsd.model");
+        if (file.exists() && !file.isDirectory()) {
+          try {
+            setModel(new WSDModel(file));
+
+          } catch (InvalidFormatException e) {
+            e.printStackTrace();
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+
+          String outcome = "";
+
+          String[] context = cg
+            .getContext(sample, ((WSDDefaultParameters) this.params).ngram,
+              ((WSDDefaultParameters) this.params).windowSize,
+              this.model.getContextEntries());
+
+          double[] outcomeProbs = model.getWSDMaxentModel().eval(context);
+          outcome = model.getWSDMaxentModel().getBestOutcome(outcomeProbs);
+
+          if (outcome != null && !outcome.equals("")) {
+
+            return this.getParams().getSenseSource().name() + " " + wordTag
+              .split("\\.")[0] + "%" + outcome;
+
+          } else {
+            MFS mfs = new MFS();
+            return mfs.disambiguate(wordTag);
+          }
+
+        } else {
+
+          MFS mfs = new MFS();
+          return mfs.disambiguate(wordTag);
+        }
+      } else {
+        String outcome = "";
+
+        String[] context = cg
+          .getContext(sample, ((WSDDefaultParameters) this.params).ngram,
+            ((WSDDefaultParameters) this.params).windowSize,
+            this.model.getContextEntries());
+
+        double[] outcomeProbs = model.getWSDMaxentModel().eval(context);
+        outcome = model.getWSDMaxentModel().getBestOutcome(outcomeProbs);
+
+        if (outcome != null && !outcome.equals("")) {
+
+          return this.getParams().getSenseSource().name() + " " + wordTag
+            .split("\\.")[0] + "%" + outcome;
+        } else {
+
+          MFS mfs = new MFS();
+          return mfs.disambiguate(wordTag);
+        }
+      }
+    } else {
+
+      if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
+        return WSDParameters.SenseSource.WSDHELPER.name() + " " + sample
+          .getTargetTag();
+      } else {
+        return null;
+      }
+
+    }
+
+  }
+
+  /**
+   * The IMS disambiguation method for a single word
+   *
+   * @param tokenizedContext : the text containing the word to disambiguate
+   * @param tokenTags        : the tags corresponding to the context
+   * @param lemmas           : the lemmas of ALL the words in the context
+   * @param index            : the index of the word to disambiguate
+   * @return the sense of the word to disambiguate, as a single string
+   */
+  public String disambiguate(String[] tokenizedContext, String[] tokenTags,
+    String[] lemmas, int index) {
+    return disambiguate(
+      new WSDSample(tokenizedContext, tokenTags, lemmas, index));
+  }
+
+}

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java
new file mode 100644
index 0000000..64a2d41
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WTDIMS.java

@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import net.sf.extjwnl.data.POS;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+
+public class WTDIMS {
+
+  // Attributes related to the context
+  protected String[] sentence;
+  protected String[] posTags;
+  protected String[] lemmas;
+  protected int wordIndex;
+  protected int sense;
+  protected String[] senseIDs;
+
+  // Attributes related to IMS features
+  protected String[] posOfSurroundingWords;
+  protected String[] surroundingWords;
+  protected String[] localCollocations;
+  protected String[] features;
+
+  public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+    int wordIndex) {
+    this.sentence = sentence;
+    this.posTags = posTags;
+    this.wordIndex = wordIndex;
+    this.lemmas = lemmas;
+  }
+
+  public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+    int wordIndex, String[] senseIDs) {
+    this.sentence = sentence;
+    this.posTags = posTags;
+    this.wordIndex = wordIndex;
+    this.lemmas = lemmas;
+    this.senseIDs = senseIDs;
+
+  }
+
+  public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
+    String word, String[] senseIDs) {
+    super();
+
+    this.sentence = sentence;
+    this.posTags = posTags;
+    this.lemmas = lemmas;
+
+    for (int i = 0; i < sentence.length; i++) {
+      if (word.equals(sentence[i])) {
+        this.wordIndex = i;
+        break;
+      }
+    }
+
+    this.senseIDs = senseIDs;
+
+  }
+
+  public WTDIMS(WSDSample sample) {
+    this.sentence = sample.getSentence();
+    this.posTags = sample.getTags();
+    this.lemmas = sample.getLemmas();
+    this.wordIndex = sample.getTargetPosition();
+    this.senseIDs = sample.getSenseIDs();
+
+  }
+
+  public String[] getSentence() {
+    return sentence;
+  }
+
+  public void setSentence(String[] sentence) {
+    this.sentence = sentence;
+  }
+
+  public String[] getPosTags() {
+    return posTags;
+  }
+
+  public void setPosTags(String[] posTags) {
+    this.posTags = posTags;
+  }
+
+  public int getWordIndex() {
+    return wordIndex;
+  }
+
+  public void setWordIndex(int wordIndex) {
+    this.wordIndex = wordIndex;
+  }
+
+  public String[] getLemmas() {
+    return lemmas;
+  }
+
+  public void setLemmas(String[] lemmas) {
+    this.lemmas = lemmas;
+  }
+
+  public int getSense() {
+    return sense;
+  }
+
+  public void setSense(int sense) {
+    this.sense = sense;
+  }
+
+  public String[] getSenseIDs() {
+    return senseIDs;
+  }
+
+  public void setSenseIDs(String[] senseIDs) {
+    this.senseIDs = senseIDs;
+  }
+
+  public String getWord() {
+    return this.getSentence()[this.getWordIndex()];
+  }
+
+  public String getWordTag() {
+
+    String wordBaseForm = this.getLemmas()[this.getWordIndex()];
+
+    String ref = "";
+
+    if ((WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) != null)) {
+      if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+        .equals(POS.VERB)) {
+        ref = wordBaseForm + ".v";
+      } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+        .equals(POS.NOUN)) {
+        ref = wordBaseForm + ".n";
+      } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+        .equals(POS.ADJECTIVE)) {
+        ref = wordBaseForm + ".a";
+      } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
+        .equals(POS.ADVERB)) {
+        ref = wordBaseForm + ".r";
+      }
+    }
+
+    return ref;
+  }
+
+  public String[] getPosOfSurroundingWords() {
+    return posOfSurroundingWords;
+  }
+
+  public void setPosOfSurroundingWords(String[] posOfSurroundingWords) {
+    this.posOfSurroundingWords = posOfSurroundingWords;
+  }
+
+  public String[] getSurroundingWords() {
+    return surroundingWords;
+  }
+
+  public void setSurroundingWords(String[] surroundingWords) {
+    this.surroundingWords = surroundingWords;
+  }
+
+  public String[] getLocalCollocations() {
+    return localCollocations;
+  }
+
+  public void setLocalCollocations(String[] localCollocations) {
+    this.localCollocations = localCollocations;
+  }
+
+  public String[] getFeatures() {
+    return this.features;
+  }
+
+  public void setFeatures(String[] features) {
+    this.features = features;
+  }
+
+}

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
deleted file mode 100644
index 2b3fbf7..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
+++ /dev/null

@@ -1,37 +0,0 @@
-/*

- * Licensed to the Apache Software Foundation (ASF) under one

- * or more contributor license agreements.  See the NOTICE file

- * distributed with this work for additional information

- * regarding copyright ownership.  The ASF licenses this file

- * to you under the Apache License, Version 2.0 (the

- * "License"); you may not use this file except in compliance

- * with the License.  You may obtain a copy of the License at

- * 

- *   http://www.apache.org/licenses/LICENSE-2.0

- * 

- * Unless required by applicable law or agreed to in writing,

- * software distributed under the License is distributed on an

- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

- * KIND, either express or implied.  See the License for the

- * specific language governing permissions and limitations

- * under the License.

- */

-

-package opennlp.tools.disambiguator.datareader;

-

-public class ClusterMembership {

-

-  public int clusterID;

-  public double centroidSimilarity;

-  public String phrase;

-  public String[] phraseWords;

-

-  public ClusterMembership(int clusterID, double centroidSimilarity) {

-    this.clusterID = clusterID;

-    this.centroidSimilarity = centroidSimilarity;

-  }

-

-  public ClusterMembership() {

-    this(0, 0.0);

-  }

-}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
deleted file mode 100644
index e8b384e..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
+++ /dev/null

@@ -1,84 +0,0 @@
-/*

- * Licensed to the Apache Software Foundation (ASF) under one

- * or more contributor license agreements.  See the NOTICE file

- * distributed with this work for additional information

- * regarding copyright ownership.  The ASF licenses this file

- * to you under the Apache License, Version 2.0 (the

- * "License"); you may not use this file except in compliance

- * with the License.  You may obtain a copy of the License at

- * 

- *   http://www.apache.org/licenses/LICENSE-2.0

- * 

- * Unless required by applicable law or agreed to in writing,

- * software distributed under the License is distributed on an

- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

- * KIND, either express or implied.  See the License for the

- * specific language governing permissions and limitations

- * under the License.

- */

-

-package opennlp.tools.disambiguator.datareader;

-

-import java.io.BufferedReader;

-import java.io.File;

-import java.io.FileReader;

-import java.io.IOException;

-import java.util.ArrayList;

-import java.util.HashMap;

-

-public class ClustersReader {

-

-  public static String path = "src\\test\\resources\\phraseclusters\\";

-  private static HashMap<String, ArrayList<ClusterMembership>> map = new HashMap<String, ArrayList<ClusterMembership>>();

-

-  public void readFile(String url) {

-

-    File file = new File(url);

-

-    try (BufferedReader clusterList = new BufferedReader(new FileReader(file))) {

-

-      String line;

-

-      // Read the file

-      while ((line = clusterList.readLine()) != null) {

-

-        String[] parts = line.split("\\t");

-        String phraseKey = parts[0];

-        String[] phraseWords = phraseKey.split("\\s");

-

-        System.out.println(phraseKey);

-

-        ArrayList<ClusterMembership> memberships = new ArrayList<ClusterMembership>();

-

-        for (int i = 1; i < parts.length; i += 2) {

-          ClusterMembership membership = new ClusterMembership(

-              Integer.parseInt(parts[i]), Double.parseDouble(parts[i + 1]));

-          membership.phrase = phraseKey;

-          membership.phraseWords = phraseWords;

-

-          memberships.add(membership);

-        }

-        map.put(phraseKey, memberships);

-      }

-    } catch (IOException e) {

-      e.printStackTrace();

-    }

-  }

-

-  public boolean getNgramClusters(String word) {

-

-    File folder = new File(path);

-    if (folder.isDirectory()) {

-      for (File file : folder.listFiles()) {

-        readFile(file.getAbsolutePath());

-      }

-

-    } else {

-      return false;

-    }

-

-    return true;

-

-  }

-

-}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
index c3ddd79..e0decf2 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java

@@ -64,9 +64,9 @@
 

   private static final String ELEMENT_PUNCTUATION = "punc";

 

-  private static String semcorDirectory = "src\\test\\resources\\semcor3.0\\";

+  private static String semcorDirectory = "src/test/resources/semcor3.0/";

   private static String[] folders = { "brown1", "brown2", "brownv" };

-  private static String tagfiles = "\\tagfiles\\";

+  private static String tagfiles = "/tagfiles/";

 

   

   public static String getSemcorDirectory() {


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
index 40884aa..9dfbb94 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java

@@ -47,7 +47,7 @@
  */

 public class SensevalReader {

 

-  protected String sensevalDirectory = "src\\test\\resources\\senseval3\\";

+  protected String sensevalDirectory = "src/test/resources/senseval3/";

 

   protected String data = sensevalDirectory + "EnglishLS.train";

   protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";

@@ -72,7 +72,7 @@
   /**

    * This extracts the equivalent senses. This serves in the case of the

    * coarse-grained disambiguation

-   * 

+   *

    * @param sensemapFile

    *          the file containing the equivalent senses, each set of equivalent

    *          senses per line


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
index 2d04d8d..250b962 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java

@@ -21,6 +21,7 @@
 

 import opennlp.tools.disambiguator.WSDHelper;

 

+// TODO extend Word from Wordnet

 public class Word {

 

   public static enum Type {


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
index 737b8fa..822e9c1 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java

@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -32,7 +32,7 @@
   @Test
   public static void main(String[] args) {
     WSDHelper.print("Evaluation Started");
-    String modelsDir = "src\\test\\resources\\models\\";
+    String modelsDir = "src/test/resources/models/";
     WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
     WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
     WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 2aa3334..0ef0091 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java

@@ -41,7 +41,7 @@
 public class LeskTester {
   // TODO write more tests
 
-  static String modelsDir = "src\\test\\resources\\models\\";
+  static String modelsDir = "src/test/resources/models/";
 
   static Lesk lesk;
 
@@ -131,7 +131,7 @@
     List<String> senses = lesk.disambiguate(sentence2, tags2, lemmas2, span);
 
     assertEquals("Check number of returned words", 5, senses.size());
-    assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01:: 4.8",
+    assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01:: 3.8",
         senses.get(0));
     assertEquals("Check 'radioactive' sense ID",
         "WORDNET radioactive%3:00:00:: 6.0", senses.get(1));

diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
index 0195cae..f7dfc68 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java

@@ -33,7 +33,7 @@
   @Test

   public static void main(String[] args) {

     WSDHelper.print("Evaluation Started");

-    String modelsDir = "src\\test\\resources\\models\\";

+    String modelsDir = "src/test/resources/models/";

     WSDHelper.loadTokenizer(modelsDir + "en-token.bin");

     WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");

     WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index 025261e..c6ca4b0 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java

@@ -41,7 +41,7 @@
   // TODO write more tests

   // TODO modify when we fix the parameter model

 

-  static String modelsDir = "src\\test\\resources\\models\\";

+  static String modelsDir = "src/test/resources/models/";

 

   static MFS mfs;

 


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
deleted file mode 100644
index 16172f8..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ /dev/null

@@ -1,40 +0,0 @@
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-
-public class Tester {
-
-  public static void main(String[] args) {
-
-    String modelsDir = "src\\test\\resources\\models\\";
-    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
-    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
-    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
-    IMSME ims = new IMSME(new IMSParameters("\\"));
-
-    String test3 = "The summer is almost over and I haven't been to the beach even once";
-    String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-    String[] tags3 = WSDHelper.getTagger().tag(sentence3);
-    List<String> tempLemmas3 = new ArrayList<String>();
-    for (int i = 0; i < sentence3.length; i++) {
-      String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i],
-          tags3[i]);
-      tempLemmas3.add(lemma);
-    }
-    String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
-    // output
-    List<String> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
-    for (int i = 0; i < sentence3.length; i++) {
-      System.out.print(sentence3[i] + " : ");
-      WSDHelper.printResults(ims, senses3.get(i));
-      WSDHelper.print("----------");
-    }
-
-  }
-}
\ No newline at end of file

diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
new file mode 100644
index 0000000..3b43d99
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java

@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+// TODO improve the tests and the parameters
+public class WSDEvaluatorTest {
+
+  static SensevalReader seReader;
+
+  static String modelsDir = "src/test/resources/models/";
+  static String trainingDataDirectory = "src/test/resources/supervised/models/";
+
+  static WSDDefaultParameters params = new WSDDefaultParameters("");
+  static WSDisambiguatorME wsdME;
+  static WSDModel model;
+
+  static ArrayList<String> testWords;
+
+  /*
+   * Setup the testing variables
+   */
+  public static void setUpAndTraining() {
+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+    seReader = new SensevalReader();
+    testWords = seReader.getSensevalWords();
+    params = new WSDDefaultParameters("");
+    params.setTrainingDataDirectory(trainingDataDirectory);
+
+    TrainingParameters trainingParams = new TrainingParameters();
+    SemcorReaderExtended sr = new SemcorReaderExtended();
+
+    WSDHelper.print("Training Started");
+    for (String word : testWords) {
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+
+        ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+        if (instances != null && instances.size() > 1) {
+          WSDHelper.print("------------------" + word + "------------------");
+          ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(word);
+
+          WSDModel writeModel = null;
+    /*
+     * Tests training the disambiguator. We test both writing and reading a
+     * model file trained by Semcor.
+     */
+          File outFile;
+          try {
+            writeModel = WSDisambiguatorME
+              .train("en", sampleStream, trainingParams, params);
+            assertNotNull("Checking the model to be written", writeModel);
+            writeModel.writeModel(params.getTrainingDataDirectory() + word);
+            outFile = new File(
+              params.getTrainingDataDirectory() + word + ".wsd.model");
+            model = new WSDModel(outFile);
+            assertNotNull("Checking the read model", model);
+            wsdME = new WSDisambiguatorME(model, params);
+            assertNotNull("Checking the disambiguator", wsdME);
+          } catch (IOException e1) {
+            e1.printStackTrace();
+            fail("Exception in training");
+          }
+        }
+      }
+    }
+  }
+
+  public static void disambiguationEval() {
+
+    WSDHelper.print("Evaluation Started");
+
+    for (String word : testWords) {
+      WSDEvaluator evaluator = new WSDEvaluator(wsdME);
+
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+
+        ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+        if (instances != null && instances.size() > 1) {
+          WSDHelper.print("------------------" + word + "------------------");
+          for (WSDSample instance : instances) {
+            if (instance.getSenseIDs() != null && !instance.getSenseIDs()[0]
+              .equals("null")) {
+              evaluator.evaluateSample(instance);
+            }
+          }
+          WSDHelper.print(evaluator.toString());
+        } else {
+          WSDHelper.print("null instances");
+        }
+      }
+
+    }
+  }
+
+  public static void main(String[] args) {
+    setUpAndTraining();
+    disambiguationEval();
+  }
+}

diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
new file mode 100644
index 0000000..8470928
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java

@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.ObjectStream;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link WSDisambiguatorME}.
+ * <p>
+ * The scope of this test is to make sure that the WSDisambiguatorME code can be
+ * executed. This test cannot detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of
+ * the disambiguator.
+ * <p>
+ * In this test the {@link WSDisambiguatorME} is trained with Semcor
+ * and then the computed model is used to predict sentences
+ * from the training sentences.
+ */
+
+public class WSDTester {
+  // TODO write more tests
+  // TODO modify when we fix the parameter model
+
+  static String modelsDir = "src/test/resources/models/";
+  static String trainingDataDirectory = "src/test/resources/supervised/models/";
+
+  static WSDDefaultParameters params;
+  static WSDisambiguatorME wsdME;
+  static WSDModel model;
+
+  static String test = "please.v";
+  static File outFile;
+
+  static String test1 = "We need to discuss an important topic, please write to me soon.";
+  static String test2 = "The component was highly radioactive to the point that"
+    + " it has been activated the second it touched water";
+  static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+  static String[] sentence1;
+  static String[] sentence2;
+  static String[] sentence3;
+
+  static String[] tags1;
+  static String[] tags2;
+  static String[] tags3;
+
+  static String[] lemmas1;
+  static String[] lemmas2;
+  static String[] lemmas3;
+
+  /*
+   * Setup the testing variables
+   */
+  @BeforeClass public static void setUpAndTraining() {
+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+    sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+    sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+    sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+    tags1 = WSDHelper.getTagger().tag(sentence1);
+    tags2 = WSDHelper.getTagger().tag(sentence2);
+    tags3 = WSDHelper.getTagger().tag(sentence3);
+
+    List<String> tempLemmas1 = new ArrayList<String>();
+    for (int i = 0; i < sentence1.length; i++) {
+      tempLemmas1
+        .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+    }
+    lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+    List<String> tempLemmas2 = new ArrayList<String>();
+    for (int i = 0; i < sentence2.length; i++) {
+      tempLemmas2
+        .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+    }
+    lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+    List<String> tempLemmas3 = new ArrayList<String>();
+    for (int i = 0; i < sentence3.length; i++) {
+      tempLemmas3
+        .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+    }
+    lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+    params = new WSDDefaultParameters("");
+    params.setTrainingDataDirectory(trainingDataDirectory);
+    TrainingParameters trainingParams = new TrainingParameters();
+    SemcorReaderExtended sr = new SemcorReaderExtended();
+    ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+    WSDModel writeModel = null;
+    /*
+     * Tests training the disambiguator. We test both writing and reading a
+     * model file trained by Semcor.
+     */
+
+    try {
+      writeModel = WSDisambiguatorME
+        .train("en", sampleStream, trainingParams, params);
+      assertNotNull("Checking the model to be written", writeModel);
+      writeModel.writeModel(params.getTrainingDataDirectory() + test);
+      outFile = new File(
+        params.getTrainingDataDirectory() + test + ".wsd.model");
+      model = new WSDModel(outFile);
+      assertNotNull("Checking the read model", model);
+      wsdME = new WSDisambiguatorME(model, params);
+      assertNotNull("Checking the disambiguator", wsdME);
+    } catch (IOException e1) {
+      e1.printStackTrace();
+      fail("Exception in training");
+    }
+  }
+
+  /*
+   * Tests disambiguating only one word: the ambiguous word "please".
+   */
+  @Test public void testOneWordDisambiguation() {
+    String sense = wsdME.disambiguate(sentence1, tags1, lemmas1, 8);
+    assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
+  }
+
+  /*
+   * Tests disambiguating a word Span. In this case we test a mix of monosemous
+   * and polysemous words, as well as words that do not need disambiguation,
+   * such as determiners.
+   */
+  @Test public void testWordSpanDisambiguation() {
+    Span span = new Span(3, 7);
+    List<String> senses = wsdME.disambiguate(sentence2, tags2, lemmas2, span);
+
+    assertEquals("Check number of returned words", 5, senses.size());
+    assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
+      senses.get(0));
+    assertEquals("Check 'radioactive' sense ID",
+      "WORDNET radioactive%3:00:00::", senses.get(1));
+    assertEquals("Check preposition", "WSDHELPER to", senses.get(2));
+    assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3));
+  }
+
+  /*
+   * Tests disambiguating all the words
+   */
+  @Test public void testAllWordsDisambiguation() {
+    List<String> senses = wsdME.disambiguate(sentence3, tags3, lemmas3);
+
+    assertEquals("Check number of returned words", 15, senses.size());
+    assertEquals("Check preposition", "WSDHELPER personal pronoun",
+      senses.get(6));
+  }
+
+}
commit	0f08de2f24ab14c52160dfbabcbc7c76852013b2	[log] [tgz]
author	Anthony Beylerian <beylerian@apache.org>	Tue Jun 07 09:23:03 2016 +0000
committer	Anthony Beylerian <beylerian@apache.org>	Tue Jun 07 09:23:03 2016 +0000
tree	f737d2b15120a380a21c2a1ec79a80e855bc82cd
parent	7009f233be93efc73127a7e72cde3bb669d494d2 [diff]