Added unit tests and corrected some mistakes; more unit tests are still needed.
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index 136d5f2..d890ba0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
@@ -27,6 +27,7 @@
protected boolean isCoarseSense;
public static boolean isStemCompare;
+ protected boolean returnMultiple;
public static enum SenseSource {
WORDNET, WSDHELPER, OTHER;
@@ -61,8 +62,17 @@
this.senseSource = senseSource;
}
+ public boolean isReturnMultiple() {
+ return returnMultiple;
+ }
+
+ public void setReturnMultiple(boolean returnMultiple) {
+ this.returnMultiple = returnMultiple;
+ }
+
public WSDParameters() {
this.isCoarseSense = false;
+ this.returnMultiple = false;
}
/**
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index 06451e5..a825e11 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
@@ -150,7 +150,7 @@
if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
- + tokenTags[i];
+ + WSDHelper.getNonRelevWordsDef(tokenTags[i]);
String[] sense = { s };
senses.add(sense);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
index fbf8ba1..14413d6 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
@@ -55,8 +55,9 @@
}
public String[] extractSurroundingWords(int index, String[] toks,
- String[] lemmas) {
+ String[] lemmas, int windowSize) {
+ // TODO consider the windowSize
ArrayList<String> contextWords = new ArrayList<String>();
for (int i = 0; i < toks.length; i++) {
@@ -123,7 +124,7 @@
HashSet<String> surroundingWords = new HashSet<>();
surroundingWords.addAll(Arrays.asList(extractSurroundingWords(index, toks,
- lemmas)));
+ lemmas, windowSize)));
String[] localCollocations = extractLocalCollocations(index, toks, ngram);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
index 55bc0ed..b1e8a18 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
@@ -41,16 +41,17 @@
protected static IMSContextGenerator cg = new DefaultIMSContextGenerator();
- public IMSME(IMSParameters params){
+ public IMSME(IMSParameters params) {
this.params = params;
}
-
+
public IMSME(IMSModel model, IMSParameters params) {
this.imsModel = model;
this.params = params;
-
-// Assert.assertEquals(model.getWindowSize(),params.getWindowSize());
-// Assert.assertEquals(model.getNgram(),params.getNgram());
+ }
+
+ public IMSModel getModel() {
+ return imsModel;
}
public void setModel(IMSModel model) {
@@ -65,7 +66,7 @@
TrainingParameters mlParams, IMSParameters imsParams,
IMSFactory imsfactory) throws IOException {
- ArrayList<String> surroundingWordModel = buildSurroundingWords(samples);
+ ArrayList<String> surroundingWordModel = buildSurroundingWords(samples, imsParams.getWindowSize());
HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
@@ -88,13 +89,13 @@
events.add(ev);
- es = ObjectStreamUtils.createObjectStream(events);
-
} while ((sample = samples.read()) != null);
}
- EventTrainer trainer = TrainerFactory.getEventTrainer(
- mlParams.getSettings(), manifestInfoEntries);
+ es = ObjectStreamUtils.createObjectStream(events);
+
+ EventTrainer trainer = TrainerFactory
+ .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
imsModel = trainer.train(es);
return new IMSModel(lang, wordTag, imsParams.windowSize, imsParams.ngram,
@@ -102,13 +103,13 @@
}
public static ArrayList<String> buildSurroundingWords(
- ObjectStream<WSDSample> samples) throws IOException {
+ ObjectStream<WSDSample> samples, int windowSize) throws IOException {
DefaultIMSContextGenerator imsCG = new DefaultIMSContextGenerator();
ArrayList<String> surroundingWordsModel = new ArrayList<String>();
WSDSample sample;
while ((sample = samples.read()) != null) {
- String[] words = imsCG.extractSurroundingWords(
- sample.getTargetPosition(), sample.getSentence(), sample.getLemmas());
+ String[] words = imsCG.extractSurroundingWords(sample.getTargetPosition(),
+ sample.getSentence(), sample.getLemmas(), windowSize);
if (words.length > 0) {
for (String word : words) {
@@ -125,10 +126,11 @@
if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
String wordTag = sample.getTargetWordTag();
- String trainingFile = ((IMSParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
+ if (imsModel == null
+ || !imsModel.getWordTag().equals(sample.getTargetWordTag())) {
- if (imsModel==null || !imsModel.getWordTag().equals(sample.getTargetWordTag())) {
+ String trainingFile = ((IMSParameters) this.getParams())
+ .getTrainingDataDirectory() + sample.getTargetWordTag();
File file = new File(trainingFile + ".ims.model");
if (file.exists() && !file.isDirectory()) {
@@ -167,11 +169,11 @@
}
} else {
-
MFS mfs = new MFS();
return mfs.disambiguate(wordTag);
}
} else {
+
String outcome = "";
String[] context = cg.getContext(sample,
@@ -226,8 +228,8 @@
*/
public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
String[] lemmas, int index) {
- return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
- index));
+ return disambiguate(
+ new WSDSample(tokenizedContext, tokenTags, lemmas, index));
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
index af85582..1e540cf 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
@@ -20,6 +20,7 @@
package opennlp.tools.disambiguator.ims;
import java.io.File;
+import java.security.InvalidParameterException;
import opennlp.tools.disambiguator.WSDParameters;
@@ -34,6 +35,11 @@
protected int ngram;
protected String trainingDataDirectory;
+
+ protected static final int DFLT_WIN_SIZE = 3;
+ protected static final int DFLT_NGRAM = 2;
+ protected static final String DFLT_LANG_CODE = "En";
+ protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
/**
* This constructor takes only two parameters. The default language used is
@@ -49,8 +55,9 @@
* the source of the training data
*/
public IMSParameters(int windowSize, int ngram, SenseSource senseSource,
- String trainingDataDirectory) {
- this.languageCode = "En";
+ String trainingDataDirectory){
+
+ this.languageCode = DFLT_LANG_CODE;
this.windowSize = windowSize;
this.ngram = ngram;
this.senseSource = senseSource;
@@ -63,19 +70,7 @@
}
public IMSParameters(String trainingDataDirectory) {
- this(3, 2, SenseSource.WORDNET, trainingDataDirectory);
-
- File folder = new File(trainingDataDirectory);
- if (!folder.exists())
- folder.mkdirs();
- }
-
- public IMSParameters() {
- this(3, 2, SenseSource.WORDNET, null);
- }
-
- public IMSParameters(int windowSize, int ngram) {
- this(windowSize, ngram, SenseSource.WORDNET, null);
+ this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, trainingDataDirectory);
}
public String getLanguageCode() {
@@ -109,7 +104,6 @@
* Creates the context generator of IMS
*/
public IMSContextGenerator createContextGenerator() {
-
return new DefaultIMSContextGenerator();
}
@@ -123,7 +117,7 @@
@Override
public boolean isValid() {
- // TODO Auto-generated method stub
+ // TODO recheck this pattern switch to maps
return true;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
index e286658..fe82987 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
@@ -194,9 +194,12 @@
WordPOS wordPOS = new WordPOS(word, pos);
ArrayList<Synset> synsets = wordPOS.getSynsets();
-
- int size = synsets.size();
-
+ int size;
+ if (this.parameters.isReturnMultiple()) {
+ size = synsets.size();
+ } else {
+ size = 1;
+ }
String[] senses = new String[size];
for (int i = 0; i < size; i++) {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
index 9584487..b935c45 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
@@ -39,6 +39,7 @@
public String[] extractSurroundingContextClusters(int index, String[] toks,
String[] tags, String[] lemmas, int windowSize) {
+ // TODO consider windowSize
ArrayList<String> contextClusters = new ArrayList<String>();
for (int i = 0; i < toks.length; i++) {
@@ -49,19 +50,19 @@
String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
.trim();
-
- WordPOS word = new WordPOS(lemma, tags[i]);
- // TODO check fix for "_" and null pointers
- if (lemma.length() > 1 && !lemma.contains("_")) {
- try{
- ArrayList<Synset> synsets = word.getSynsets();
- if (synsets!=null && synsets.size() > 0 ){
- contextClusters.add(synsets.get(0).getOffset() + "");
- }
- }catch(NullPointerException ex)
- {
- //TODO tagger mistake add proper exception
+ WordPOS word = new WordPOS(lemma, tags[i]);
+
+ if (lemma.length() > 1) {
+ try {
+ ArrayList<Synset> synsets = word.getSynsets();
+ if (synsets != null && synsets.size() > 0) {
+ for (Synset syn : synsets){
+ contextClusters.add(syn.getOffset() + "");
+ }
+ }
+ } catch (NullPointerException ex) {
+ // TODO tagger mistake add proper exception
}
}
@@ -80,30 +81,32 @@
*/
@Override
public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int windowSize) {
+ String[] lemmas, int windowSize, ArrayList<String> model) {
HashSet<String> surroundingContextClusters = new HashSet<>();
- surroundingContextClusters.addAll(Arrays
- .asList(extractSurroundingContextClusters(index, toks, tags, lemmas,
- windowSize)));
+ surroundingContextClusters
+ .addAll(Arrays.asList(extractSurroundingContextClusters(index, toks,
+ tags, lemmas, windowSize)));
- String[] serializedFeatures = new String[surroundingContextClusters.size()];
+ String[] serializedFeatures = new String[model.size()];
int i = 0;
-
- for (String feature : surroundingContextClusters) {
- serializedFeatures[i] = "F" + i + "=" + feature;
+ for (String word : model) {
+ if (surroundingContextClusters.contains(word.toString())) {
+ serializedFeatures[i] = "F" + i + "=1";
+ } else {
+ serializedFeatures[i] = "F" + i + "=0";
+ }
i++;
}
return serializedFeatures;
-
}
- public String[] getContext(WSDSample sample, int windowSize) {
+ public String[] getContext(WSDSample sample, int windowSize, ArrayList<String> model) {
return getContext(sample.getTargetPosition(), sample.getSentence(),
- sample.getTags(), sample.getLemmas(), windowSize);
+ sample.getTags(), sample.getLemmas(), windowSize, model);
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
index 9c0055f..4e79c38 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
@@ -19,6 +19,8 @@
package opennlp.tools.disambiguator.oscc;
+import java.util.ArrayList;
+
import opennlp.tools.disambiguator.WSDSample;
/**
@@ -27,7 +29,7 @@
public interface OSCCContextGenerator {
String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
- int windowSize);
+ int windowSize, ArrayList<String> model);
- String[] getContext(WSDSample sample, int windowSize);
+ String[] getContext(WSDSample sample, int windowSize, ArrayList<String> model);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
index 7202680..f2c67ba 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
@@ -22,7 +22,6 @@
import java.util.ArrayList;
import java.util.HashMap;
-import junit.framework.Assert;
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDisambiguator;
@@ -53,11 +52,11 @@
* Please see {@link DefaultOSCCContextGenerator}
*
* The approach finds the context clusters surrounding the target and uses a
- * classifier to judge on the best case.
+ * classifier to judge on the best case.
*
* Here an ME classifier is used.
*
-*/
+ */
public class OSCCME extends WSDisambiguator {
protected OSCCModel osccModel;
@@ -69,10 +68,12 @@
}
public OSCCME(OSCCModel model, OSCCParameters params) {
- this.osccModel = osccModel;
+ this.osccModel = model;
this.params = params;
+ }
- Assert.assertEquals(model.getWindowSize(), params.getWindowSize());
+ public OSCCModel getModel() {
+ return osccModel;
}
public void setModel(OSCCModel model) {
@@ -85,7 +86,10 @@
public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,
TrainingParameters mlParams, OSCCParameters osccParams,
- OSCCFactory imsfactory) throws IOException {
+ OSCCFactory osccFactory) throws IOException {
+
+ ArrayList<String> surroundingClusterModel = buildSurroundingClusters(
+ samples, osccParams.getWindowSize());
HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
@@ -99,39 +103,57 @@
if (sample != null) {
wordTag = sample.getTargetWordTag();
do {
-
String sense = sample.getSenseIDs().get(0);
-
- String[] context = cg.getContext(sample, osccParams.windowSize);
+ String[] context = cg.getContext(sample, osccParams.windowSize,
+ surroundingClusterModel);
Event ev = new Event(sense + "", context);
-
events.add(ev);
-
- es = ObjectStreamUtils.createObjectStream(events);
-
} while ((sample = samples.read()) != null);
}
- EventTrainer trainer = TrainerFactory.getEventTrainer(
- mlParams.getSettings(), manifestInfoEntries);
+ es = ObjectStreamUtils.createObjectStream(events);
+ EventTrainer trainer = TrainerFactory
+ .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
+
osccModel = trainer.train(es);
- return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel, manifestInfoEntries, imsfactory);
+ return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel,
+ surroundingClusterModel, manifestInfoEntries, osccFactory);
}
+ public static ArrayList<String> buildSurroundingClusters(
+ ObjectStream<WSDSample> samples, int windowSize) throws IOException {
+ // TODO modify to clusters
+ DefaultOSCCContextGenerator osccCG = new DefaultOSCCContextGenerator();
+ ArrayList<String> surroundingWordsModel = new ArrayList<String>();
+ WSDSample sample;
+ while ((sample = samples.read()) != null) {
+ String[] words = osccCG.extractSurroundingContextClusters(
+ sample.getTargetPosition(), sample.getSentence(), sample.getTags(),
+ sample.getLemmas(), windowSize);
+
+ if (words.length > 0) {
+ for (String word : words) {
+ surroundingWordsModel.add(word);
+ }
+ }
+ }
+ samples.reset();
+ return surroundingWordsModel;
+ }
@Override
public String[] disambiguate(WSDSample sample) {
if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
String wordTag = sample.getTargetWordTag();
- String trainingFile = ((OSCCParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
-
if (osccModel == null
|| !osccModel.getWordTag().equals(sample.getTargetWordTag())) {
- File file = new File(trainingFile + ".ims.model");
+ String trainingFile = ((OSCCParameters) this.getParams())
+ .getTrainingDataDirectory() + sample.getTargetWordTag();
+
+ File file = new File(trainingFile + ".oscc.model");
if (file.exists() && !file.isDirectory()) {
try {
setModel(new OSCCModel(file));
@@ -147,7 +169,8 @@
String outcome = "";
String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize);
+ ((OSCCParameters) this.params).windowSize,
+ osccModel.getContextClusters());
double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
@@ -174,7 +197,8 @@
String outcome = "";
String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize);
+ ((OSCCParameters) this.params).windowSize,
+ osccModel.getContextClusters());
double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
@@ -223,8 +247,8 @@
*/
public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
String[] lemmas, int index) {
- return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
- index));
+ return disambiguate(
+ new WSDSample(tokenizedContext, tokenTags, lemmas, index));
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
index f3b28ab..19069c3 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
@@ -21,6 +21,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Map;
import java.util.Properties;
import java.net.URL;
@@ -43,13 +44,13 @@
private static final String WINSIZE = "winsize";
private static final String CONTEXTCLUSTERS = "contextclusters";
- //private ArrayList<String> contextClusters = new ArrayList<String>();
+ private ArrayList<String> contextClusters = new ArrayList<String>();
private String wordTag;
private int windowSize;
- /*public ArrayList<String> getContextClusters() {
+ public ArrayList<String> getContextClusters() {
return contextClusters;
- }*/
+ }
public int getWindowSize() {
return windowSize;
@@ -59,9 +60,9 @@
this.windowSize = windowSize;
}
- /* public void setContextClusters(ArrayList<String> contextClusters) {
+ public void setContextClusters(ArrayList<String> contextClusters) {
this.contextClusters = contextClusters;
- }*/
+ }
public String getWordTag() {
return wordTag;
@@ -72,7 +73,7 @@
}
public OSCCModel(String languageCode, String wordTag, int windowSize,
- MaxentModel osccModel,
+ MaxentModel osccModel, ArrayList<String> contextClusters,
Map<String, String> manifestInfoEntries, OSCCFactory factory) {
super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
@@ -80,17 +81,17 @@
this.setManifestProperty(WORDTAG, wordTag);
this.setManifestProperty(WINSIZE, windowSize + "");
-// this.setManifestProperty(CONTEXTCLUSTERS,
-// StringUtils.join(contextClusters, ","));
+ this.setManifestProperty(CONTEXTCLUSTERS,
+ StringUtils.join(contextClusters, ","));
- //this.contextClusters = contextClusters;
+ this.contextClusters = contextClusters;
checkArtifactMap();
}
public OSCCModel(String languageCode, String wordTag, int windowSize,
- int ngram, MaxentModel osccModel,
+ int ngram, MaxentModel osccModel, ArrayList<String> contextClusters,
OSCCFactory factory) {
- this(languageCode, wordTag, windowSize, osccModel,
+ this(languageCode, wordTag, windowSize, osccModel, contextClusters,
null, factory);
}
@@ -135,10 +136,10 @@
public void updateAttributes() {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
- //String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
+ String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
- /* this.contextClusters = new ArrayList(
- Arrays.asList(contextClusters.split(",")));*/
+ this.contextClusters = new ArrayList(
+ Arrays.asList(contextClusters.split(",")));
this.wordTag = (String) manifest.get(WORDTAG);
this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
index 42a7742..15f1004 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
@@ -70,13 +70,11 @@
}
public OSCCParameters() {
- // TODO change the "" into null ??
- this(DFLT_WIN_SIZE, DFLT_SOURCE, "");
+ this(DFLT_WIN_SIZE, DFLT_SOURCE, null);
}
public OSCCParameters(int windowSize) {
- // TODO change the "" into null ??
- this(windowSize, DFLT_SOURCE, "");
+ this(windowSize, DFLT_SOURCE, null);
}
public String getLanguageCode() {
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
new file mode 100644
index 0000000..ce0f86e
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.ims.IMSFactory;
+import opennlp.tools.disambiguator.ims.IMSME;
+import opennlp.tools.disambiguator.ims.IMSModel;
+import opennlp.tools.disambiguator.ims.IMSParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link IMSME}.
+ *
+ * The scope of this test is to make sure that the IMS disambiguator code can be
+ * executed. This test cannot detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
+ *
+ * In this test the {@link IMSME} is trained with Semcor and then the computed
+ * model is used to predict sentences from the training sentences.
+ */
+public class IMSMETester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
+
+ static String modelsDir = "src\\test\\resources\\models\\";
+ static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
+
+ static IMSParameters IMSParams;
+ static IMSME ims;
+ static IMSFactory IMSFactory;
+ static IMSModel model;
+
+ static String test = "please.v";
+ static File outFile;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
+ List<String> tempLemmas1 = new ArrayList<String>();
+ for (int i = 0; i < sentence1.length; i++) {
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+ }
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+ List<String> tempLemmas2 = new ArrayList<String>();
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+ }
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+ }
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ IMSParams = new IMSParameters("");
+ IMSParams.setTrainingDataDirectory(trainingDataDirectory);
+ IMSFactory = new IMSFactory();
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+ IMSModel writeModel = null;
+ /*
+ * Tests training the disambiguator We test both writing and reading a model
+ * file trained by semcor
+ */
+
+ try {
+ writeModel = IMSME.train("en", sampleStream, trainingParams, IMSParams,
+ IMSFactory);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(IMSParams.getTrainingDataDirectory() + test);
+ outFile = new File(
+ IMSParams.getTrainingDataDirectory() + test + ".ims.model");
+ model = new IMSModel(outFile);
+ assertNotNull("Checking the read model", model);
+ ims = new IMSME(model, IMSParams);
+ assertNotNull("Checking the disambiguator", ims);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+
+ /*
+ * Tests disambiguating only one word : The ambiguous word "please"
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = ims.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = ims.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = ims.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check preposition", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
+
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
deleted file mode 100644
index c832156..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-import opennlp.tools.util.Span;
-
-/**
- * This is a typical example of how to call the disambiguation function in the
- * IMS class.
- * <ul>
- * <li>In the 2 first examples, the training data exist, therefore the IMS
- * approach is used.</li>
- * <li>In the 3rd example, the training data for the word to disambiguate are
- * absent, therefore the Most Frequent Sents (MFS) is returend</li>
- * </ul>
- */
-public class IMSTester {
-
- public static void main(String[] args) {
-
- // TODO write unit test
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- IMSParameters params = new IMSParameters("");
-
- WSDHelper.print(params.getTrainingDataDirectory());
-
- IMSME ims = new IMSME(params);
-
-
- // This is how to make the context for one-word-disambiguation using IMS
-
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
- }
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- // output
- String[] senses1 = ims.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- // This is how to make the context for disambiguation of span of words
-
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
- }
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
-
- // output
- List<String[]> senses2 = ims.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
-
- // This is how to make the context for all-words-disambiguation
-
- String test3 = "The summer almost over and I not to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
- }
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- // output
- List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- String[] senses = senses3.get(i);
- System.out.print(lemmas3[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
- }
-
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 13c959b..edb1346 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -19,81 +19,137 @@
package opennlp.tools.disambiguator;
+import static org.junit.Assert.assertEquals;
+
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
import opennlp.tools.disambiguator.lesk.LeskParameters.LESK_TYPE;
+import opennlp.tools.util.Span;
+import org.junit.BeforeClass;
import org.junit.Test;
+/**
+ * This is the test class for {@link Lesk}.
+ *
+ * The scope of this test is to make sure that the Lesk disambiguator code can be
+ * executed. This test can not detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
+ */
public class LeskTester {
- @Test
- public static void main(String[] args) {
+ // TODO write more tests
- Lesk lesk = new Lesk();
- LeskParameters params = new LeskParameters();
- params.setLeskType(LESK_TYPE.LESK_EXT);
- boolean a[] = { true, true, true, true, true, true, true, true, true, true };
- params.setFeatures(a);
- lesk.setParams(params);
- String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src\\test\\resources\\models\\";
+
+ static Lesk lesk;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUp() {
+
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- String test1 = "I went to the bank to deposit money.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- int targetWordIndex1 = 5;
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
List<String> tempLemmas1 = new ArrayList<String>();
for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
}
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- String[] results1 = lesk.disambiguate(sentence1, tags1, lemmas1,
- targetWordIndex1);
- WSDHelper.print(results1);
- WSDHelper.printResults(lesk, results1);
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- WSDHelper.print("----------------------------------------");
-
- String test2 = "it was a strong argument that his hypothesis was true";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- int targetWordIndex2 = 4;
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
}
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- String[] results2 = lesk.disambiguate(sentence2, tags2, lemmas2,
- targetWordIndex2);
- WSDHelper.print(results2);
- WSDHelper.printResults(lesk, results2);
- WSDHelper.print("----------------------------------------");
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- int targetWordIndex3 = 3;
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
List<String> tempLemmas3 = new ArrayList<String>();
for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
}
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
- String[] results3 = lesk.disambiguate(sentence3, tags3, lemmas3,
- targetWordIndex3);
- WSDHelper.print(results3);
- WSDHelper.printResults(lesk, results3);
- WSDHelper.print("----------------------------------------");
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ lesk = new Lesk();
+
+ LeskParameters params = new LeskParameters();
+ params.setLeskType(LESK_TYPE.LESK_EXT);
+ boolean a[] = { true, true, true, true, true, true, true, true, true,
+ true };
+ params.setFeatures(a);
+ lesk.setParams(params);
+ }
+
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = lesk.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = lesk.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 3, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = lesk.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
}
}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index f74faad..a675268 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -19,96 +19,128 @@
package opennlp.tools.disambiguator;
+import static org.junit.Assert.assertEquals;
+
import java.util.ArrayList;
import java.util.List;
+import org.junit.BeforeClass;
+import org.junit.Test;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.util.Span;
/**
- * This is a typical example of how to call the disambiguation function in the
- * MFS class.
+ * This is the test class for {@link MFS}.
+ *
+ * The scope of this test is to make sure that the MFS disambiguator code can be
+ * executed. This test can not detect mistakes which lead to incorrect feature
+ * generation or other mistakes which decrease the disambiguation performance of the
+ * disambiguator.
*/
public class MFSTester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
- public static void main(String[] args) {
- String modelsDir = "src\\test\\resources\\models\\";
+ static String modelsDir = "src\\test\\resources\\models\\";
+
+ static MFS mfs;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- MFS mfs = new MFS();
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- /**
- * This is how to make the context for one-word-disambiguation using IMS
- */
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
List<String> tempLemmas1 = new ArrayList<String>();
for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
}
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
- // output
- String[] senses1 = mfs.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for disambiguation of span of words
- */
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
List<String> tempLemmas2 = new ArrayList<String>();
for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
}
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- // output
- List<String[]> senses2 = mfs.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for all-words-disambiguation
- */
- String test3 = "The summer is almost over and I have not been to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
List<String> tempLemmas3 = new ArrayList<String>();
for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
}
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
- // output
- List<String[]> senses3 = mfs.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- String[] senses = senses3.get(i);
- System.out.print(lemmas3[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
+ mfs = new MFS();
}
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = mfs.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = mfs.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = mfs.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
new file mode 100644
index 0000000..d6f55a6
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.oscc.OSCCFactory;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCModel;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * This is the test class for {@link OSCCME}.
+ *
+ * The scope of this test is to make sure that the OSCC disambiguator code can
+ * be executed. This test can not detect mistakes which lead to incorrect
+ * feature generation or other mistakes which decrease the disambiguation
+ * performance of the disambiguator.
+ *
+ * In this test the {@link OSCCME} is trained with Semcor and then the computed
+ * model is used to predict sentences from the training sentences.
+ */
+public class OSCCMETester {
+ // TODO write more tests
+ // TODO modify when we fix the parameter model
+
+ static String modelsDir = "src\\test\\resources\\models\\";
+ static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
+
+ static OSCCParameters OSCCParams;
+ static OSCCME oscc;
+ static OSCCFactory osccFactory;
+ static OSCCModel model;
+
+ static String test = "please.v";
+ static File outFile;
+
+ static String test1 = "We need to discuss an important topic, please write to me soon.";
+ static String test2 = "The component was highly radioactive to the point that"
+ + " it has been activated the second it touched water";
+ static String test3 = "The summer is almost over and I did not go to the beach even once";
+
+ static String[] sentence1;
+ static String[] sentence2;
+ static String[] sentence3;
+
+ static String[] tags1;
+ static String[] tags2;
+ static String[] tags3;
+
+ static String[] lemmas1;
+ static String[] lemmas2;
+ static String[] lemmas3;
+
+ /*
+ * Setup the testing variables, train the model, and read it back from disk
+ */
+ @BeforeClass
+ public static void setUpAndTraining() {
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+
+ tags1 = WSDHelper.getTagger().tag(sentence1);
+ tags2 = WSDHelper.getTagger().tag(sentence2);
+ tags3 = WSDHelper.getTagger().tag(sentence3);
+
+ List<String> tempLemmas1 = new ArrayList<String>();
+ for (int i = 0; i < sentence1.length; i++) {
+ tempLemmas1
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
+ }
+ lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+ List<String> tempLemmas2 = new ArrayList<String>();
+ for (int i = 0; i < sentence2.length; i++) {
+ tempLemmas2
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
+ }
+ lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ tempLemmas3
+ .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
+ }
+ lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ OSCCParams = new OSCCParameters("");
+ OSCCParams.setTrainingDataDirectory(trainingDataDirectory);
+ osccFactory = new OSCCFactory();
+ TrainingParameters trainingParams = new TrainingParameters();
+ SemcorReaderExtended sr = new SemcorReaderExtended();
+ ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+ OSCCModel writeModel = null;
+ /*
+ * Tests training the disambiguator. We test both writing and reading a model
+ * file trained by Semcor.
+ */
+
+ try {
+ writeModel = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
+ osccFactory);
+ assertNotNull("Checking the model to be written", writeModel);
+ writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + test);
+ outFile = new File(
+ OSCCParams.getTrainingDataDirectory() + test + ".oscc.model");
+ model = new OSCCModel(outFile);
+ assertNotNull("Checking the read model", model);
+ oscc = new OSCCME(model, OSCCParams);
+ assertNotNull("Checking the disambiguator", oscc);
+ } catch (IOException e1) {
+ e1.printStackTrace();
+ fail("Exception in training");
+ }
+ }
+
+ /*
+ * Tests disambiguating only one word: the ambiguous word "please".
+ */
+ @Test
+ public void testOneWordDisambiguation() {
+ String[] senses = oscc.disambiguate(sentence1, tags1, lemmas1, 8);
+
+ assertEquals("Check number of senses", 1, senses.length);
+ }
+
+ /*
+ * Tests disambiguating a word Span. In this case we test a mix of monosemous
+ * and polysemous words as well as words that do not need disambiguation such
+ * as determiners
+ */
+ @Test
+ public void testWordSpanDisambiguation() {
+ Span span = new Span(3, 7);
+ List<String[]> senses = oscc.disambiguate(sentence2, tags2, lemmas2, span);
+
+ assertEquals("Check number of returned words", 5, senses.size());
+ assertEquals("Check number of senses", 1, senses.get(0).length);
+ assertEquals("Check monosemous word", 1, senses.get(1).length);
+ assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
+ assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
+ }
+
+ /*
+ * Tests disambiguating all the words
+ */
+ @Test
+ public void testAllWordsDisambiguation() {
+ List<String[]> senses = oscc.disambiguate(sentence3, tags3, lemmas3);
+
+ assertEquals("Check number of returned words", 15, senses.size());
+ assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
+ senses.get(6)[0]);
+ }
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
deleted file mode 100644
index ec6377d..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.oscc.OSCCFactory;
-import opennlp.tools.disambiguator.oscc.OSCCME;
-import opennlp.tools.disambiguator.oscc.OSCCModel;
-import opennlp.tools.disambiguator.oscc.OSCCParameters;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-
-public class OSCCTester {
-
- public static void main(String[] args) {
-
- SemcorReaderExtended sr = new SemcorReaderExtended();
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- String test = "write.v";
- TrainingParameters trainingParams = new TrainingParameters();
- OSCCParameters OSCCParams = new OSCCParameters("");
- OSCCFactory OSCCFactory = new OSCCFactory();
-
- ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
-
- OSCCModel model = null;
- OSCCModel readModel = null;
- try {
- model = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
- OSCCFactory);
- model.writeModel(test);
- File outFile = new File(test + ".OSCC.model");
- readModel = new OSCCModel(outFile);
-
- } catch (IOException e1) {
- // TODO Auto-generated catch block
- e1.printStackTrace();
- }
- OSCCME OSCC = new OSCCME(readModel, OSCCParams);
-
- /**
- * This is how to make the context for one-word-disambiguation using OSCC
- */
- String test1 = "We need to discuss important topic, please write to me soon.";
- String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- String[] tags1 = WSDHelper.getTagger().tag(sentence1);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence1[i], tags1[i]);
- tempLemmas1.add(lemma);
- }
- String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- // output
- String[] senses1 = OSCC.disambiguate(sentence1, tags1, lemmas1, 8);
- System.out.print(lemmas1[8] + " :\t");
- WSDHelper.print(senses1);
- WSDHelper.print("*****************************");
-
- /**
- * This is how to make the context for disambiguation of span of words
- */
- String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- String[] tags2 = WSDHelper.getTagger().tag(sentence2);
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence2[i], tags2[i]);
- tempLemmas2.add(lemma);
- }
- String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
- Span span = new Span(3, 7);
-
- // output
- List<String[]> senses2 = OSCC.disambiguate(sentence2, tags2, lemmas2, span);
- for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i - span.getStart());
- System.out.print(lemmas2[i] + " :\t");
- WSDHelper.print(senses);
- WSDHelper.print("----------");
- }
-
- WSDHelper.print("*****************************");
- }
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
index 3adcd7d..d657f56 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
@@ -1,36 +1,40 @@
package opennlp.tools.disambiguator;
+import java.util.ArrayList;
+import java.util.List;
+import opennlp.tools.disambiguator.ims.IMSME;
+import opennlp.tools.disambiguator.ims.IMSParameters;
public class Tester {
public static void main(String[] args) {
-//
-// String modelsDir = "src\\test\\resources\\models\\";
-// WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
-// WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
-// WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-//
-// IMSME ims = new IMSME();
-//
-// String test3 = "The summer is almost over and I haven't been to the beach even once";
-// String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-// String[] tags3 = WSDHelper.getTagger().tag(sentence3);
-// List<String> tempLemmas3 = new ArrayList<String>();
-// for (int i = 0; i < sentence3.length; i++) {
-// String lemma = WSDHelper.getLemmatizer()
-// .lemmatize(sentence3[i], tags3[i]);
-// tempLemmas3.add(lemma);
-// }
-// String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-//
-// // output
-// List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
-// for (int i = 0; i < sentence3.length; i++) {
-// System.out.print(sentence3[i] + " : ");
-// WSDHelper.printResults(ims, senses3.get(i));
-// WSDHelper.print("----------");
-// }
+
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ IMSME ims = new IMSME(new IMSParameters("\\"));
+
+ String test3 = "The summer is almost over and I haven't been to the beach even once";
+ String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+ String[] tags3 = WSDHelper.getTagger().tag(sentence3);
+ List<String> tempLemmas3 = new ArrayList<String>();
+ for (int i = 0; i < sentence3.length; i++) {
+ String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i],
+ tags3[i]);
+ tempLemmas3.add(lemma);
+ }
+ String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+
+ // output
+ List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
+ for (int i = 0; i < sentence3.length; i++) {
+ System.out.print(sentence3[i] + " : ");
+ WSDHelper.printResults(ims, senses3.get(i));
+ WSDHelper.print("----------");
+ }
}
}
\ No newline at end of file