OPENNLP-802 The WSDisambiguator needs a baseline to compare the implemented approaches with.
Lesk presents a good baseline; however, the Senseval and SemEval workshops demonstrated that MFS (Most Frequent Sense) presents a better and more challenging baseline.
Thanks to Mondher Bouazizi for providing a patch!
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
index 0d6bfd7..d12ebb7 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
@@ -25,6 +25,8 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Collections;
+import java.util.Arrays;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
@@ -209,6 +211,8 @@
ArrayList<String> answers = new ArrayList<String>();
String sentence = "";
String rawWord = "";
+ String[] finalText = null;
+ int index = 0;
NodeList nChildren = nInstance.getChildNodes();
@@ -230,18 +234,46 @@
sentence = ((Element) nChild).getTextContent();
if (nChild.hasChildNodes()) {
- // textbefore =
- // nChild.getChildNodes().item(0).getTextContent();
+ String textBefore = nChild.getChildNodes().item(0)
+ .getTextContent();
rawWord = nChild.getChildNodes().item(1).getTextContent();
- // textAfter =
- // nChild.getChildNodes().item(2).getTextContent();
+ String textAfter = nChild.getChildNodes().item(2)
+ .getTextContent();
+
+ ArrayList<String> textBeforeTokenzed = new ArrayList<String>(
+ Arrays.asList(textBefore.split("\\s")));
+ ArrayList<String> textAfterTokenzed = new ArrayList<String>(
+ Arrays.asList(textAfter.split("\\s")));
+
+ textBeforeTokenzed.removeAll(Collections.singleton(null));
+ textBeforeTokenzed.removeAll(Collections.singleton(""));
+
+ textAfterTokenzed.removeAll(Collections.singleton(null));
+ textAfterTokenzed.removeAll(Collections.singleton(""));
+
+ finalText = new String[textBeforeTokenzed.size() + 1
+ + textAfterTokenzed.size()];
+
+ int l = 0;
+ for (String tempWord : textBeforeTokenzed) {
+ finalText[l] = tempWord;
+ l++;
+ }
+ index = l;
+ finalText[l] = rawWord.toLowerCase();
+ l++;
+ for (String tempWord : textAfterTokenzed) {
+ finalText[l] = tempWord;
+ l++;
+ }
+
}
}
}
- WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
- rawWord);
+ WTDIMS wordToDisambiguate = new WTDIMS(finalText, index,
+ answers);
setInstances.add(wordToDisambiguate);
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
index 34044af..e2580be 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
@@ -67,6 +67,11 @@
super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
this.senseIDs = wtd.getSenseIDs();
}
+
+ public WTDIMS(String[] sentence, int wordIndex, ArrayList<String> senseIDs) {
+ super(sentence, wordIndex); // the sentence tokens and the target index are passed to the superclass
+ this.senseIDs = senseIDs; // the list is stored as given, without copying
+ }
public String[] getPosOfSurroundingWords() {
return posOfSurroundingWords;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
new file mode 100644
index 0000000..e20bd6d
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.mfs;
+
+import java.security.InvalidParameterException;
+import java.util.ArrayList;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.data.Word;
+import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.WSDParameters;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.WordPOS;
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.util.Span;
+
+/**
+ * Implementation of the <b>Most Frequent Sense</b> (MFS) baseline approach.
+ * <p>
+ * The synsets of the target word are looked up through {@link WordPOS} and, for
+ * each synset, the sense key of the word whose lemma matches the target is
+ * collected. The senses are reported in the order of that lookup; the first
+ * sense retrieved from WordNet is supposed to be the most frequent one.
+ * <p>
+ * The class is meant as a baseline to compare the other disambiguation
+ * approaches with. The span-based {@code disambiguate} overload is not
+ * implemented yet.
+ */
+public class MFS implements WSDisambiguator {
+
+  /** Message printed when a word cannot be mapped to a WordNet part of speech. */
+  private static final String NO_DEFINITION_MESSAGE =
+      "The word has no definitions in WordNet !";
+
+  /** Parameters of this disambiguator, exposed through the params accessors. */
+  public WSDParameters parameters;
+
+  /** Creates an MFS disambiguator that stores the given parameters as they are. */
+  public MFS(WSDParameters parameters) {
+    super();
+    this.parameters = parameters;
+  }
+
+  /** Creates an MFS disambiguator with a default {@link MFSParameters}. */
+  public MFS() {
+    super();
+    this.parameters = new MFSParameters();
+  }
+
+  /**
+   * Collects the senses of an already built {@link WordToDisambiguate}.
+   *
+   * @param wordToDisambiguate
+   *          the target word; its raw word and PoS tag are read
+   * @return one entry per synset, each {@code "WordNet " + senseKey} (an entry
+   *         stays {@code null} when no word of the synset matches), or
+   *         {@code null} when the PoS tag maps to no WordNet part of speech
+   */
+  private String[] getMostFrequentSense(WordToDisambiguate wordToDisambiguate) {
+    String rawWord = wordToDisambiguate.getRawWord();
+    String word = rawWord.toLowerCase();
+    POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
+    if (pos == null) {
+      System.out.println(NO_DEFINITION_MESSAGE);
+      return null;
+    }
+    // Lemmas are compared with the part of the raw word before the first '.'.
+    int dot = rawWord.indexOf('.');
+    String lemma = dot < 0 ? rawWord : rawWord.substring(0, dot);
+    return collectSenseKeys(new WordPOS(word, pos), lemma, "WordNet ");
+  }
+
+  /**
+   * Returns the senses of a word given as a "word.p" tag. It serves as a quick
+   * check of the most frequent sense without any need to create a
+   * {@link WordToDisambiguate} instance.
+   *
+   * @param wordTag
+   *          the word to disambiguate, in the format "word.p" where p is one of
+   *          a, r, n or v (e.g. "write.v", "well.r", "smart.a", "go.v")
+   * @return one sense key per synset (an entry stays {@code null} when no word
+   *         of the synset matches), or {@code null} when the tag is missing or
+   *         is not one of the supported letters
+   */
+  public String[] getMostFrequentSense(String wordTag) {
+    String[] parts = wordTag.split("\\.");
+    // A wordTag without a ".p" suffix is treated like an unknown tag.
+    POS pos = parts.length < 2 ? null : toPOS(parts[1]);
+    if (pos == null) {
+      System.out.println(NO_DEFINITION_MESSAGE);
+      return null;
+    }
+    return collectSenseKeys(new WordPOS(parts[0], pos), parts[0], null);
+  }
+
+  /** Maps a, r, n or v (case-insensitive) to its {@link POS}; null otherwise. */
+  private static POS toPOS(String tag) {
+    if (tag.equalsIgnoreCase("a")) {
+      return POS.ADJECTIVE;
+    } else if (tag.equalsIgnoreCase("r")) {
+      return POS.ADVERB;
+    } else if (tag.equalsIgnoreCase("n")) {
+      return POS.NOUN;
+    } else if (tag.equalsIgnoreCase("v")) {
+      // This branch used to test "a" a second time, so verbs were never matched.
+      return POS.VERB;
+    }
+    return null;
+  }
+
+  /**
+   * Builds the sense array shared by both lookups: for each synset, the sense
+   * key of the first word whose lemma equals {@code lemma}.
+   *
+   * @param wordPOS
+   *          the word and part of speech whose synsets are inspected
+   * @param lemma
+   *          the lemma a synset word has to match
+   * @param prefix
+   *          text put in front of every sense key, or {@code null} for none
+   * @return an array with one slot per synset
+   */
+  private static String[] collectSenseKeys(WordPOS wordPOS, String lemma,
+      String prefix) {
+    ArrayList<Synset> synsets = wordPOS.getSynsets();
+    String[] senses = new String[synsets.size()];
+    for (int i = 0; i < senses.length; i++) {
+      for (Word wd : synsets.get(i).getWords()) {
+        if (wd.getLemma().equals(lemma)) {
+          String senseKey = null;
+          try {
+            senseKey = wd.getSenseKey();
+          } catch (JWNLException e) {
+            // Best effort: the key stays null and the lookup goes on.
+            e.printStackTrace();
+          }
+          senses[i] = prefix == null ? senseKey : prefix + senseKey;
+          break;
+        }
+      }
+    }
+    return senses;
+  }
+
+  @Override
+  public WSDParameters getParams() {
+    return this.parameters;
+  }
+
+  /** Stores the parameters as given; no validation is performed here. */
+  @Override
+  public void setParams(WSDParameters params) throws InvalidParameterException {
+    this.parameters = params;
+  }
+
+  /** Returns the senses of the token at {@code ambiguousTokenIndex}. */
+  @Override
+  public String[] disambiguate(String[] tokenizedContext,
+      int ambiguousTokenIndex) {
+    WordToDisambiguate wtd = new WordToDisambiguate(tokenizedContext,
+        ambiguousTokenIndex);
+    return getMostFrequentSense(wtd);
+  }
+
+  /** Not implemented yet: always returns {@code null}. */
+  @Override
+  public String[][] disambiguate(String[] tokenizedContext,
+      Span[] ambiguousTokenIndexSpans) {
+    // TODO Auto-generated method stub
+    return null;
+  }
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
new file mode 100644
index 0000000..52bd4af
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.mfs;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+public class MFSParameters extends WSDParameters {
+
+  /** Sources selectable through {@link #setSource(Source)}. */
+  public static enum Source {
+    WORDNET(1, "wordnet");
+
+    public int code;
+    public String src;
+
+    private Source(int code, String src) {
+      this.code = code;
+      this.src = src;
+    }
+  }
+
+  /** The source currently selected. */
+  protected Source source;
+
+  /** Creates parameters with coarse senses disabled and WordNet as the source. */
+  public MFSParameters() {
+    this.isCoarseSense = false;
+    this.source = Source.WORDNET;
+  }
+
+  public Source getSource() {
+    return source;
+  }
+
+  public void setSource(Source source) {
+    this.source = source;
+  }
+
+  /** Valid only for the WordNet source; a null source is invalid, not an error. */
+  @Override
+  public boolean isValid() {
+    return this.source == Source.WORDNET;
+  }
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
new file mode 100644
index 0000000..3e6f94d
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+import opennlp.tools.disambiguator.mfs.MFS;
+import opennlp.tools.disambiguator.mfs.MFSParameters;
+
+import org.junit.Test;
+
+public class MFSEvaluatorTest {
+
+  static SensevalReader seReader = new SensevalReader();
+
+  /** Command-line entry point; runs the same evaluation as the JUnit test. */
+  public static void main(String[] args) {
+    new MFSEvaluatorTest().testEvaluation();
+  }
+
+  /**
+   * Evaluates the MFS baseline on every non-verb Senseval word and prints the
+   * evaluator's report for each of them. JUnit 4 only runs {@code public void}
+   * instance methods without parameters, so the annotation is not on main.
+   */
+  @Test
+  public void testEvaluation() {
+    Constants.print("Evaluation Started");
+    MFS mfs = new MFS();
+    MFSParameters mfsParams = new MFSParameters();
+    mfs.setParams(mfsParams);
+    ArrayList<String> words = seReader.getSensevalWords();
+    for (String word : words) {
+      WSDEvaluator evaluator = new WSDEvaluator(mfs);
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+        ArrayList<WTDIMS> instances = getTestData(word);
+        if (instances != null) {
+          Constants.print("------------------" + word + "------------------");
+          for (WordToDisambiguate instance : instances) {
+            if (hasFirstSense(instance)
+                && !instance.getSenseIDs().get(0).equals("null")) {
+              evaluator.evaluateSample(instance);
+            }
+          }
+          Constants.print(evaluator.toString());
+        } else {
+          Constants.print("null instances");
+        }
+      }
+    }
+  }
+
+  /**
+   * For a specific word, return the corresponding Senseval instances in form
+   * of {@link WTDIMS}. Instances whose first sense ID is missing or "U"
+   * (case-insensitive) are left out.
+   *
+   * @param wordTag
+   *          the word of which the instances are to be collected. wordTag has
+   *          to be in the format "word.POS" (e.g., "activate.v", "smart.a",
+   *          etc.)
+   * @return list of {@link WTDIMS} instances of the wordTag
+   */
+  protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+    ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+    for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+      WTDIMS wtdims = new WTDIMS(wtd);
+      if (hasFirstSense(wtdims)
+          && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {
+        instances.add(wtdims);
+      }
+    }
+    return instances;
+  }
+
+  /** Tells whether the sense ID list exists and starts with a non-null entry. */
+  private static boolean hasFirstSense(WordToDisambiguate instance) {
+    return instance.getSenseIDs() != null
+        && !instance.getSenseIDs().isEmpty()
+        && instance.getSenseIDs().get(0) != null;
+  }
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
new file mode 100644
index 0000000..5b2f7cb
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import opennlp.tools.disambiguator.mfs.MFS;
+
+/**
+ * Minimal usage example for {@link MFS}: tokenizes a few sentences and prints
+ * the senses returned for one target token of each.
+ */
+public class MFSTester {
+
+  /** Runs the examples one after the other, in the order they are listed. */
+  public static void main(String[] args) {
+    MFS mfs = new MFS();
+
+    // Each sentence is paired with the index of the token to disambiguate.
+    String[] examples = {
+        "Please write to me soon.",
+        "it was a strong argument that his hypothesis was true",
+        "the component was highly radioactive to the point that it has been activated the second it touched water"
+    };
+    int[] targetIndexes = { 1, 3, 12 };
+
+    for (int i = 0; i < examples.length; i++) {
+      String[] tokens = Loader.getTokenizer().tokenize(examples[i]);
+      Constants.print(mfs.disambiguate(tokens, targetIndexes[i]));
+    }
+  }
+
+}