OPENNLP-802 The WSDisambiguator needs a baseline to compare the implemented approaches with. Lesk presents a good baseline, however Senseval and Semeval workshops demonstrated that MFS presents a better and more challenging baseline. Thanks to Mondher Bouazizi for providing a patch!

commit: 77f56ceb795ff62c53c07880ea607624dc85a6e2 [log] [tgz]
author: Jörn Kottmann <joern@apache.org> Mon Aug 03 08:11:04 2015 +0000
committer: Jörn Kottmann <joern@apache.org> Mon Aug 03 08:11:04 2015 +0000
tree: 2776e7d3b93f4fbdca0ca9811ed52d6e98ad898b
parent: ffafc92392ee89527acdb0920c395636975929c3 [diff]
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
index 0d6bfd7..d12ebb7 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java

@@ -25,6 +25,8 @@
 import java.io.IOException;

 import java.util.ArrayList;

 import java.util.HashMap;

+import java.util.Collections;

+import java.util.Arrays;

 

 import javax.xml.parsers.DocumentBuilder;

 import javax.xml.parsers.DocumentBuilderFactory;

@@ -209,6 +211,8 @@
                 ArrayList<String> answers = new ArrayList<String>();

                 String sentence = "";

                 String rawWord = "";

+                String[] finalText = null;

+                int index = 0;

 

                 NodeList nChildren = nInstance.getChildNodes();

 

@@ -230,18 +234,46 @@
                     sentence = ((Element) nChild).getTextContent();

 

                     if (nChild.hasChildNodes()) {

-                      // textbefore =

-                      // nChild.getChildNodes().item(0).getTextContent();

+                      String textBefore = nChild.getChildNodes().item(0)

+                          .getTextContent();

                       rawWord = nChild.getChildNodes().item(1).getTextContent();

-                      // textAfter =

-                      // nChild.getChildNodes().item(2).getTextContent();

+                      String textAfter = nChild.getChildNodes().item(2)

+                          .getTextContent();

+

+                      ArrayList<String> textBeforeTokenzed = new ArrayList<String>(

+                          Arrays.asList(textBefore.split("\\s")));

+                      ArrayList<String> textAfterTokenzed = new ArrayList<String>(

+                          Arrays.asList(textAfter.split("\\s")));

+

+                      textBeforeTokenzed.removeAll(Collections.singleton(null));

+                      textBeforeTokenzed.removeAll(Collections.singleton(""));

+

+                      textAfterTokenzed.removeAll(Collections.singleton(null));

+                      textAfterTokenzed.removeAll(Collections.singleton(""));

+

+                      finalText = new String[textBeforeTokenzed.size() + 1

+                          + textAfterTokenzed.size()];

+

+                      int l = 0;

+                      for (String tempWord : textBeforeTokenzed) {

+                        finalText[l] = tempWord;

+                        l++;

+                      }

+                      index = l;

+                      finalText[l] = rawWord.toLowerCase();

+                      l++;

+                      for (String tempWord : textAfterTokenzed) {

+                        finalText[l] = tempWord;

+                        l++;

+                      }

+

                     }

                   }

 

                 }

 

-                WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,

-                    rawWord);

+                WTDIMS wordToDisambiguate = new WTDIMS(finalText, index,

+                    answers);

                 setInstances.add(wordToDisambiguate);

               }

             }


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
index 34044af..e2580be 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java

@@ -67,6 +67,11 @@
     super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
     this.senseIDs = wtd.getSenseIDs();
   }
+  
+  /**
+   * Creates an instance from a tokenized sentence, the index of the target
+   * word in it, and the sense IDs attached to that word. The sentence and the
+   * index are handed to the superclass constructor.
+   *
+   * @param sentence
+   *          the tokens of the sentence
+   * @param wordIndex
+   *          the index of the word to disambiguate in {@code sentence}
+   * @param senseIDs
+   *          the sense IDs of the word; the list is stored as is, not copied
+   */
+  public WTDIMS(String[] sentence, int wordIndex, ArrayList<String> senseIDs) {
+    super(sentence, wordIndex);
+    this.senseIDs = senseIDs;
+  }
 
   public String[] getPosOfSurroundingWords() {
     return posOfSurroundingWords;

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
new file mode 100644
index 0000000..e20bd6d
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java

@@ -0,0 +1,191 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator.mfs;

+

+import java.security.InvalidParameterException;

+import java.util.ArrayList;

+

+import net.sf.extjwnl.JWNLException;

+import net.sf.extjwnl.data.POS;

+import net.sf.extjwnl.data.Synset;

+import net.sf.extjwnl.data.Word;

+import opennlp.tools.disambiguator.Constants;

+import opennlp.tools.disambiguator.WSDParameters;

+import opennlp.tools.disambiguator.WSDisambiguator;

+import opennlp.tools.disambiguator.WordPOS;

+import opennlp.tools.disambiguator.WordToDisambiguate;

+import opennlp.tools.util.Span;

+

+/**

+ * Implementation of the <b>Most Frequent Sense</b> (MFS) baseline approach.

+ * The senses of the target word are returned in the order in which they are

+ * retrieved from WordNet; the first one is supposed to be the most frequent

+ * sense. This class only reads the target word and its PoS tag.

+ * <p>

+ * NOTE(review): this comment used to list "PoS-tags of the surrounding

+ * words", "Local collocations" and "Surrounding words" and to point to

+ * <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a>

+ * for details; that looks copied from the IMS implementation - confirm.

+ */

+public class MFS implements WSDisambiguator {

+

+  /** Parameters used by this disambiguator. */

+  public WSDParameters parameters;

+

+  /**

+   * Creates an MFS disambiguator with the given parameters.

+   *

+   * @param parameters

+   *          the parameters to use

+   */

+  public MFS(WSDParameters parameters) {

+    super();

+    this.parameters = parameters;

+  }

+

+  /**

+   * Creates an MFS disambiguator with a fresh {@link MFSParameters} instance.

+   */

+  public MFS() {

+    super();

+    this.parameters = new MFSParameters();

+  }

+

+  /**

+   * Collects the WordNet sense keys of the given word, one entry per synset,

+   * each one prefixed with "WordNet ".

+   *

+   * @param wordToDisambiguate

+   *          the word to disambiguate

+   * @return one entry per synset, in the order of

+   *         {@link WordPOS#getSynsets()} (an entry stays {@code null} when no

+   *         word of the synset has the target lemma), or {@code null} if

+   *         {@link Constants#getPOS} yields no PoS for the word's PoS tag

+   */

+  private String[] getMostFrequentSense(WordToDisambiguate wordToDisambiguate) {

+

+    String word = wordToDisambiguate.getRawWord().toLowerCase();

+    POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());

+

+    if (pos == null) {

+      System.out.println("The word has no definitions in WordNet !");

+      return null;

+    }

+

+    // NOTE(review): the lemma is taken from the raw word as is, while the

+    // WordNet lookup above uses the lower-cased form - confirm this is

+    // intended.

+    String lemma = wordToDisambiguate.getRawWord().split("\\.")[0];

+

+    ArrayList<Synset> synsets = new WordPOS(word, pos).getSynsets();

+    String[] senses = new String[synsets.size()];

+

+    for (int i = 0; i < senses.length; i++) {

+      for (Word wd : synsets.get(i).getWords()) {

+        if (wd.getLemma().equals(lemma)) {

+          senses[i] = "WordNet " + readSenseKey(wd);

+          break;

+        }

+      }

+    }

+    return senses;

+  }

+

+  /**

+   * Returns the WordNet sense keys of a word given as a "word.p" tag. It

+   * serves for a quick check of the most frequent sense without any need to

+   * create a {@link WordToDisambiguate} instance.

+   *

+   * @param wordTag

+   *          the word to disambiguate. It should be written in the format

+   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")

+   * @return one sense key per synset, in the order of

+   *         {@link WordPOS#getSynsets()} (the first one is supposed to be the

+   *         most frequent sense; an entry stays {@code null} when no word of

+   *         the synset has the given lemma), or {@code null} if the PoS letter

+   *         is not one of a, r, n, v

+   */

+  public String[] getMostFrequentSense(String wordTag) {

+

+    String[] parts = wordTag.split("\\.");

+    String word = parts[0];

+    String tag = parts[1];

+

+    POS pos;

+

+    if (tag.equalsIgnoreCase("a")) {

+      pos = POS.ADJECTIVE;

+    } else if (tag.equalsIgnoreCase("r")) {

+      pos = POS.ADVERB;

+    } else if (tag.equalsIgnoreCase("n")) {

+      pos = POS.NOUN;

+    } else if (tag.equalsIgnoreCase("v")) {

+      // verbs are tagged "v" (e.g. "write.v")

+      pos = POS.VERB;

+    } else {

+      pos = null;

+    }

+

+    if (pos == null) {

+      System.out.println("The word has no definitions in WordNet !");

+      return null;

+    }

+

+    ArrayList<Synset> synsets = new WordPOS(word, pos).getSynsets();

+    String[] senses = new String[synsets.size()];

+

+    for (int i = 0; i < senses.length; i++) {

+      for (Word wd : synsets.get(i).getWords()) {

+        if (wd.getLemma().equals(word)) {

+          senses[i] = readSenseKey(wd);

+          break;

+        }

+      }

+    }

+    return senses;

+  }

+

+  /**

+   * Reads the sense key of a WordNet word.

+   *

+   * @param wd

+   *          the WordNet word

+   * @return the sense key, or {@code null} if reading it fails with a

+   *         {@link JWNLException} (the stack trace is printed)

+   */

+  private static String readSenseKey(Word wd) {

+    try {

+      return wd.getSenseKey();

+    } catch (JWNLException e) {

+      e.printStackTrace();

+      return null;

+    }

+  }

+

+  /**

+   * @return the parameters currently used by this disambiguator

+   */

+  @Override

+  public WSDParameters getParams() {

+    return this.parameters;

+  }

+

+  /**

+   * Replaces the parameters of this disambiguator. The given value is stored

+   * without any validation.

+   *

+   * @param params

+   *          the new parameters

+   */

+  @Override

+  public void setParams(WSDParameters params) throws InvalidParameterException {

+    this.parameters = params;

+  }

+

+  /**

+   * Disambiguates a single token of a tokenized context.

+   *

+   * @param tokenizedContext

+   *          the tokens of the context

+   * @param ambiguousTokenIndex

+   *          the index of the token to disambiguate

+   * @return the senses of the token, each prefixed with "WordNet ", or

+   *         {@code null} if no WordNet PoS is found for its PoS tag

+   */

+  @Override

+  public String[] disambiguate(String[] tokenizedContext,

+      int ambiguousTokenIndex) {

+    WordToDisambiguate wtd = new WordToDisambiguate(tokenizedContext,

+        ambiguousTokenIndex);

+    return getMostFrequentSense(wtd);

+  }

+

+  /**

+   * Not implemented yet: always returns {@code null}.

+   */

+  @Override

+  public String[][] disambiguate(String[] tokenizedContext,

+      Span[] ambiguousTokenIndexSpans) {

+    // TODO Auto-generated method stub

+    return null;

+  }

+

+}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
new file mode 100644
index 0000000..52bd4af
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java

@@ -0,0 +1,61 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator.mfs;

+

+import opennlp.tools.disambiguator.WSDParameters;

+

+/**

+ * Parameters of the MFS disambiguator. Besides what is inherited from

+ * {@link WSDParameters}, it holds the {@link Source} the senses are taken from.

+ */

+public class MFSParameters extends WSDParameters {

+

+  /**

+   * The available sense sources, each with a numeric code and a name.

+   */

+  public enum Source {

+    WORDNET(1, "wordnet");

+

+    public int code;

+    public String src;

+

+    Source(int code, String src) {

+      this.code = code;

+      this.src = src;

+    }

+  }

+

+  /** The source currently selected. */

+  protected Source source;

+

+  /**

+   * Creates parameters with {@code isCoarseSense} set to {@code false} and

+   * {@link Source#WORDNET} as source.

+   */

+  public MFSParameters() {

+    this.isCoarseSense = false;

+    this.source = Source.WORDNET;

+  }

+

+  /**

+   * @return the source currently selected

+   */

+  public Source getSource() {

+    return source;

+  }

+

+  /**

+   * @param source

+   *          the source to select

+   */

+  public void setSource(Source source) {

+    this.source = source;

+  }

+

+  /**

+   * @return {@code true} exactly when the code of the selected source is 1

+   */

+  @Override

+  public boolean isValid() {

+    return this.source.code == 1;

+  }

+

+}


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
new file mode 100644
index 0000000..3e6f94d
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java

@@ -0,0 +1,102 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator;

+

+import java.util.ArrayList;

+

+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;

+import opennlp.tools.disambiguator.ims.WTDIMS;

+import opennlp.tools.disambiguator.mfs.MFS;

+import opennlp.tools.disambiguator.mfs.MFSParameters;

+

+import org.junit.Test;

+

+/**

+ * Evaluates the {@link MFS} disambiguator on the words provided by the

+ * {@link SensevalReader}. The evaluation can be run as a JUnit test or from

+ * the command line.

+ */

+public class MFSEvaluatorTest {

+

+  static SensevalReader seReader = new SensevalReader();

+

+  /**

+   * Command-line entry point; runs {@link #testEvaluation()}.

+   *

+   * @param args

+   *          not used

+   */

+  public static void main(String[] args) {

+    new MFSEvaluatorTest().testEvaluation();

+  }

+

+  /**

+   * Evaluates MFS on every word that is not a verb and prints the evaluator

+   * output word by word. JUnit 4 only accepts public, non-static test methods

+   * without parameters, which is why the annotation is on this method and not

+   * on {@code main}.

+   */

+  @Test

+  public void testEvaluation() {

+    Constants.print("Evaluation Started");

+

+    MFS mfs = new MFS();

+    MFSParameters mfsParams = new MFSParameters();

+    mfs.setParams(mfsParams);

+

+    ArrayList<String> words = seReader.getSensevalWords();

+

+    for (String word : words) {

+

+      // don't take verbs because they are not from WordNet

+      if (word.split("\\.")[1].equals("v")) {

+        continue;

+      }

+

+      WSDEvaluator evaluator = new WSDEvaluator(mfs);

+      ArrayList<WTDIMS> instances = getTestData(word);

+

+      Constants.print("------------------" + word + "------------------");

+      for (WordToDisambiguate instance : instances) {

+        if (instance.getSenseIDs() != null

+            && !instance.getSenseIDs().get(0).equals("null")) {

+          evaluator.evaluateSample(instance);

+        }

+      }

+      Constants.print(evaluator.toString());

+    }

+  }

+

+  /**

+   * For a specific word, returns the corresponding Senseval instances in form

+   * of {@link WTDIMS}. Instances without sense IDs, or whose first sense ID is

+   * {@code null} or "U" (in any case), are left out.

+   *

+   * @param wordTag

+   *          the word of which the instances are to be collected. wordTag has

+   *          to be in the format "word.POS" (e.g., "activate.v", "smart.a",

+   *          etc.)

+   * @return list of {@link WTDIMS} instances of the wordTag, never

+   *         {@code null}

+   */

+  protected static ArrayList<WTDIMS> getTestData(String wordTag) {

+

+    ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();

+    for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {

+      WTDIMS wtdims = new WTDIMS(wtd);

+      // skip instances that carry no usable sense ID instead of failing on

+      // get(0)

+      if (wtdims.getSenseIDs() != null && !wtdims.getSenseIDs().isEmpty()

+          && wtdims.getSenseIDs().get(0) != null

+          && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {

+        instances.add(wtdims);

+      }

+    }

+

+    return instances;

+  }

+

+}


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
new file mode 100644
index 0000000..5b2f7cb
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java

@@ -0,0 +1,48 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator;

+

+import opennlp.tools.disambiguator.mfs.MFS;

+

+/**

+ * Small example program showing how the disambiguation function of the

+ * {@link MFS} class is called on tokenized sentences.

+ */

+public class MFSTester {

+

+  /**

+   * Disambiguates one word in each of three sample sentences and prints the

+   * result.

+   *

+   * @param args

+   *          not used

+   */

+  public static void main(String[] args) {

+

+    MFS mfs = new MFS();

+

+    disambiguateAndPrint(mfs, "Please write to me soon.", 1);

+

+    disambiguateAndPrint(mfs,

+        "it was a strong argument that his hypothesis was true", 3);

+

+    disambiguateAndPrint(mfs,

+        "the component was highly radioactive to the point that it has been activated the second it touched water",

+        12);

+

+  }

+

+  /**

+   * Tokenizes the text with the tokenizer from {@link Loader}, disambiguates

+   * the token at the given index and prints what MFS returns.

+   */

+  private static void disambiguateAndPrint(MFS mfs, String text,

+      int targetIndex) {

+    String[] tokens = Loader.getTokenizer().tokenize(text);

+    Constants.print(mfs.disambiguate(tokens, targetIndex));

+  }

+

+}
commit	77f56ceb795ff62c53c07880ea607624dc85a6e2	[log] [tgz]
author	Jörn Kottmann <joern@apache.org>	Mon Aug 03 08:11:04 2015 +0000
committer	Jörn Kottmann <joern@apache.org>	Mon Aug 03 08:11:04 2015 +0000
tree	2776e7d3b93f4fbdca0ca9811ed52d6e98ad898b
parent	ffafc92392ee89527acdb0920c395636975929c3 [diff]