OPENNLP-758 Updated Lesk with new data readers and added a most-frequent-sense (MFS) fallback for when no overlaps are found (similar to the simplified version). Thanks to Anthony Beylerian for providing a patch.
diff --git a/opennlp-wsd/.project b/opennlp-wsd/.project
new file mode 100644
index 0000000..a15cd73
--- /dev/null
+++ b/opennlp-wsd/.project
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>opennlp-wsd</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ </buildSpec>
+ <natures>
+ </natures>
+</projectDescription>
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
index 3fabe96..934f033 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
@@ -138,33 +138,48 @@
"you're", "yours", "yourself", "yourselves", "you've", "zero"));
// Print a text in the console
- public static void printResults(WSDisambiguator disambiguator,
- String[] results) {
+// Print the disambiguation results to the console
+ public static void printResults(WSDisambiguator disambiguator,
+ String[] results) {
- if (results != null) {
+ if (results != null) {
- if (disambiguator instanceof Lesk) {
- POS pos;
- long offset;
- double score;
- String[] parts;
+ String[] parts;
+ String sensekey;
+ if (disambiguator instanceof Lesk) {
- for (String result : results) {
- parts = result.split("@");
- pos = POS.getPOSForKey(parts[0]);
- offset = Long.parseLong(parts[1]);
- score = Double.parseDouble(parts[3]);
- try {
- Constants.print("score : " + score + " for : "
- + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
- }
- }
+ Double score;
- }
+ for (String result : results) {
+ parts = result.split(" ");
+ sensekey = parts[1];
+ score = Double.parseDouble(parts[2]);
+ try {
+ Constants.print("score : "
+ + score
+ + " for : "
+ + Loader.getDictionary().getWordBySenseKey(sensekey)
+ .getSynset().getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ } else {
+ for (String result : results) {
+ parts = result.split(" ");
+ sensekey = parts[1];
+ try {
+ Constants.print("sense : "
+ + Loader.getDictionary().getWordBySenseKey(sensekey)
+ .getSynset().getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ }
public static void print(Object in) {
if (in == null) {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
index f91a4ed..3d9ddae 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
@@ -30,7 +30,7 @@
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;
import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.lemmatizer.SimpleLemmatizer;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
similarity index 75%
rename from opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
rename to opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
index b600384..6e632a8 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
@@ -17,27 +17,27 @@
* under the License.
*/
-package opennlp.tools.disambiguator.DatasetsReader;
+package opennlp.tools.disambiguator.datareader;
import java.util.ArrayList;
-public class IParagraph {
+public class Paragraph {
protected int pnum;
- protected ArrayList<ISentence> isentences;
+ protected ArrayList<Sentence> isentences;
- public IParagraph() {
+ public Paragraph() {
super();
- this.isentences = new ArrayList<ISentence>();
+ this.isentences = new ArrayList<Sentence>();
}
- public IParagraph(int pnum) {
+ public Paragraph(int pnum) {
super();
this.pnum = pnum;
- this.isentences = new ArrayList<ISentence>();
+ this.isentences = new ArrayList<Sentence>();
}
- public IParagraph(int pnum, ArrayList<ISentence> sentences) {
+ public Paragraph(int pnum, ArrayList<Sentence> sentences) {
super();
this.pnum = pnum;
this.isentences = sentences;
@@ -51,15 +51,15 @@
this.pnum = pnum;
}
- public ArrayList<ISentence> getSsentences() {
+ public ArrayList<Sentence> getSsentences() {
return isentences;
}
- public void setIsentences(ArrayList<ISentence> isentences) {
+ public void setIsentences(ArrayList<Sentence> isentences) {
this.isentences = isentences;
}
- public void addIsentence(ISentence isentence) {
+ public void addIsentence(Sentence isentence) {
this.isentences.add(isentence);
}
@@ -84,8 +84,8 @@
*/
public boolean contains(String wordTag) {
- for (ISentence isentence : this.getSsentences()) {
- for (IWord iword : isentence.getIwords()) {
+ for (Sentence isentence : this.getSsentences()) {
+ for (Word iword : isentence.getIwords()) {
if (iword.equals(iword))
return true;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
similarity index 91%
rename from opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
rename to opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
index 4d08564..efd8603 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package opennlp.tools.disambiguator.DatasetsReader;
+package opennlp.tools.disambiguator.datareader;
import java.io.File;
import java.util.ArrayList;
@@ -72,9 +72,9 @@
/**
* This serves to read one Semcor XML file
*/
- public ArrayList<ISentence> readFile(String file) {
+ public ArrayList<Sentence> readFile(String file) {
- ArrayList<ISentence> result = new ArrayList<ISentence>();
+ ArrayList<Sentence> result = new ArrayList<Sentence>();
try {
@@ -109,7 +109,7 @@
// THE SENTENCE ID
int sentenceID = Integer.parseInt(eSentence
.getAttribute(ATTRIBUTE_SENTENCENUM));
- ISentence isentence = new ISentence(paragraphID, sentenceID);
+ Sentence isentence = new Sentence(paragraphID, sentenceID);
NodeList nWords = nSentence.getChildNodes();
@@ -132,8 +132,8 @@
String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
- IWord iword = new IWord(paragraphID, sentenceID, wnum,
- IWord.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn);
+ Word iword = new Word(paragraphID, sentenceID, wnum,
+ Word.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn);
isentence.addIword(iword);
wnum++;
@@ -145,8 +145,8 @@
String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
String pos = eWord.getAttribute(ATTRIBUTE_POS);
- IWord iword = new IWord(paragraphID, sentenceID, wnum,
- IWord.Type.WORD, word, cmd, pos);
+ Word iword = new Word(paragraphID, sentenceID, wnum,
+ Word.Type.WORD, word, cmd, pos);
isentence.addIword(iword);
wnum++;
}
@@ -154,8 +154,8 @@
} else if (nWord.getNodeName().equals(ELEMENT_PUNCTUATION)) {
Element eWord = (Element) nWord;
String word = eWord.getTextContent();
- IWord iword = new IWord(paragraphID, sentenceID, wnum,
- IWord.Type.PUNCTUATIONMARK, word);
+ Word iword = new Word(paragraphID, sentenceID, wnum,
+ Word.Type.PUNCTUATIONMARK, word);
isentence.addIword(iword);
wnum++;
}
@@ -182,12 +182,12 @@
try {
- ArrayList<ISentence> isentences = readFile(file);
+ ArrayList<Sentence> isentences = readFile(file);
for (int j = 0; j < isentences.size(); j++) {
- ISentence isentence = isentences.get(j);
- ArrayList<IWord> iwords = isentence.getIwords();
+ Sentence isentence = isentences.get(j);
+ ArrayList<Word> iwords = isentence.getIwords();
for (int k = 0; k < iwords.size(); k++) {
- IWord iword = iwords.get(k);
+ Word iword = iwords.get(k);
if (iword.isInstanceOf(wordTag)) {
String sentence;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
similarity index 99%
rename from opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
rename to opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
index d12ebb7..19325d8 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package opennlp.tools.disambiguator.DatasetsReader;
+package opennlp.tools.disambiguator.datareader;
import java.io.BufferedReader;
import java.io.File;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
similarity index 77%
rename from opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
rename to opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
index 3b480e4..a2eefd9 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
@@ -17,29 +17,29 @@
* under the License.
*/
-package opennlp.tools.disambiguator.DatasetsReader;
+package opennlp.tools.disambiguator.datareader;
import java.util.ArrayList;
-public class ISentence {
+public class Sentence {
protected int pnum;
protected int snum;
- protected ArrayList<IWord> iwords;
+ protected ArrayList<Word> iwords;
- public ISentence() {
+ public Sentence() {
super();
- this.iwords = new ArrayList<IWord>();
+ this.iwords = new ArrayList<Word>();
}
- public ISentence(int pnum, int snum) {
+ public Sentence(int pnum, int snum) {
super();
this.pnum = pnum;
this.snum = snum;
- this.iwords = new ArrayList<IWord>();
+ this.iwords = new ArrayList<Word>();
}
- public ISentence(int pnum, int snum, ArrayList<IWord> iwords) {
+ public Sentence(int pnum, int snum, ArrayList<Word> iwords) {
super();
this.pnum = pnum;
this.snum = snum;
@@ -62,15 +62,15 @@
this.snum = snum;
}
- public ArrayList<IWord> getIwords() {
+ public ArrayList<Word> getIwords() {
return iwords;
}
- public void setIwords(ArrayList<IWord> iwords) {
+ public void setIwords(ArrayList<Word> iwords) {
this.iwords = iwords;
}
- public void addIword(IWord iword) {
+ public void addIword(Word iword) {
this.iwords.add(iword);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
similarity index 92%
rename from opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
rename to opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
index 248801d..13a93c6 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
@@ -17,11 +17,11 @@
* under the License.
*/
-package opennlp.tools.disambiguator.DatasetsReader;
+package opennlp.tools.disambiguator.datareader;
import opennlp.tools.disambiguator.Constants;
-public class IWord {
+public class Word {
public static enum Type {
WORD(1, "word"), PUNCTUATIONMARK(2, "pm");
@@ -49,11 +49,11 @@
protected String wnsn;
protected String lexsn;
- public IWord() {
+ public Word() {
super();
}
- public IWord(String lemma, String pos) {
+ public Word(String lemma, String pos) {
super();
this.word = lemma;
this.lemma = lemma;
@@ -88,7 +88,7 @@
* should be linked
*
*/
- public IWord(int pnum, int snum, int wnum, Type type, String word,
+ public Word(int pnum, int snum, int wnum, Type type, String word,
String cmd, String pos, String lemma, String wnsn, String lexsn) {
super();
this.pnum = pnum;
@@ -121,7 +121,7 @@
* The PoS Tag of the word
*
*/
- public IWord(int pnum, int snum, int wnum, Type type, String word,
+ public Word(int pnum, int snum, int wnum, Type type, String word,
String cmd, String pos) {
super();
this.wnum = wnum;
@@ -141,7 +141,7 @@
* @param word
* The punctuation mark, as it appears in the sentence
*/
- public IWord(int pnum, int snum, int wnum, Type type, String word) {
+ public Word(int pnum, int snum, int wnum, Type type, String word) {
super();
this.pnum = pnum;
this.snum = snum;
@@ -236,12 +236,12 @@
public boolean equals(Object oword) {
- if (!(oword instanceof IWord))
+ if (!(oword instanceof Word))
return false;
if (oword == this)
return true;
- IWord iword = (IWord) oword;
+ Word iword = (Word) oword;
if (this.lemma != null && iword.getLemma() != null) {
if (iword.getLemma().equals(this.getLemma())
@@ -278,12 +278,12 @@
public boolean senseEquals(Object oword) {
- if (!(oword instanceof IWord))
+ if (!(oword instanceof Word))
return false;
if (oword == this)
return true;
- IWord iword = (IWord) oword;
+ Word iword = (Word) oword;
if (iword.getLemma().equals(this.getLemma())
&& Constants.getPOS(iword.getPos()).equals(
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
index d7a77d5..0ab637a 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
@@ -58,8 +58,8 @@
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WSDisambiguator;
import opennlp.tools.disambiguator.WordToDisambiguate;
-import opennlp.tools.disambiguator.DatasetsReader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
/**
* Implementation of the <b>It Makes Sense</b> approach originally proposed in
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
index cfd28e2..dce333b 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
@@ -33,6 +33,7 @@
import opennlp.tools.disambiguator.WordSense;
import opennlp.tools.util.Span;
import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.data.Word;
@@ -95,6 +96,36 @@
return params;
}
+ /*
+ * @return the sense key of the most frequent sense from WordNet, or null if none could be determined
+ */
+ protected String getMostFrequentSenseKey(WTDLesk wtd) {
+
+ String word = wtd.getRawWord().toLowerCase();
+ POS pos = Constants.getPOS(wtd.getPosTag());
+ String senseKey = null;
+
+ if (pos != null) {
+
+ WordPOS wordPOS = new WordPOS(word, pos);
+
+ ArrayList<Synset> synsets = wordPOS.getSynsets();
+
+ for (Word wd : synsets.get(0).getWords()) {
+ if (wd.getLemma().equals(wtd.getRawWord().split("\\.")[0])) {
+ try {
+ senseKey = wd.getSenseKey();
+ break;
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ break;
+ }
+ }
+ }
+ return senseKey;
+ }
+
/**
* The basic Lesk method where the entire context is considered for overlaps
*
@@ -980,23 +1011,30 @@
Collections.sort(wsenses);
- List<Word> synsetWords;
- String[] senses = new String[wsenses.size()];
- String senseKey = "?";
- for (int i = 0; i < wsenses.size(); i++) {
- synsetWords = wsenses.get(i).getNode().synset.getWords();
- for (Word synWord : synsetWords) {
- if (synWord.getLemma().equals(wtd.getWord())) {
- try {
- senseKey = synWord.getSenseKey();
- } catch (JWNLException e) {
- e.printStackTrace();
+ String[] senses;
+ if (wsenses.get(0).getScore() > 0) { // if at least one overlap
+ List<Word> synsetWords;
+ senses = new String[wsenses.size()];
+ String senseKey = "?";
+ for (int i = 0; i < wsenses.size(); i++) {
+ synsetWords = wsenses.get(i).getNode().synset.getWords();
+ for (Word synWord : synsetWords) {
+ if (synWord.getLemma().equals(wtd.getWord())) {
+ try {
+ senseKey = synWord.getSenseKey();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ break;
}
- break;
}
- }
- senses[i] = "WordNet" + " " + senseKey + " " + wsenses.get(i).getScore();
+ senses[i] = "WordNet" + " " + senseKey + " "
+ + wsenses.get(i).getScore();
+ }
+ } else { // get the MFS if no overlaps
+ senses = new String[1];
+ senses[0] = "WordNet" + " " + this.getMostFrequentSenseKey(wtd) + " -1";
}
return senses;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
index d0aa8f5..ce833f6 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
@@ -40,10 +40,10 @@
// DEFAULTS
protected static final LESK_TYPE DFLT_LESK_TYPE = LESK_TYPE.LESK_EXT_EXP_CTXT_WIN;
- protected static final int DFLT_WIN_SIZE = 4;
- protected static final int DFLT_DEPTH = 3;
- protected static final double DFLT_IEXP = 0.3;
- protected static final double DFLT_DEXP = 0.3;
+ protected static final int DFLT_WIN_SIZE = 5;
+ protected static final int DFLT_DEPTH = 2;
+ protected static final double DFLT_IEXP = 0.4;
+ protected static final double DFLT_DEXP = 0.4;
protected LESK_TYPE leskType;
protected int win_f_size;
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
index a650ba9..8d048bb 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
@@ -22,7 +22,7 @@
import java.io.File;
import java.util.ArrayList;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.IMS;
import opennlp.tools.disambiguator.ims.IMSParameters;
import opennlp.tools.disambiguator.ims.WTDIMS;
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
index 8c371da..8b1625e 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
@@ -19,10 +19,9 @@
package opennlp.tools.disambiguator;
-import java.io.File;
import java.util.ArrayList;
-import java.util.HashMap;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.WTDIMS;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
@@ -31,80 +30,58 @@
public class LeskEvaluatorTest {
- static DataExtractor dExtractor = new DataExtractor();
+ static SensevalReader seReader = new SensevalReader();
@Test
public static void main(String[] args) {
Constants.print("Evaluation Started");
- String testDataLoc = "src\\test\\resources\\data\\";
- String helpersLoc = "src\\test\\resources\\helpers\\";
-
- File[] listOfFiles;
- File testFolder = new File(testDataLoc);
-
- // these are needed for mapping the sense IDs from the current data
- String dict = helpersLoc + "EnglishLS.dictionary.xml";
- String map = helpersLoc + "EnglishLS.sensemap";
-
Lesk lesk = new Lesk();
LeskParameters leskParams = new LeskParameters();
leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
lesk.setParams(leskParams);
- if (testFolder.isDirectory()) {
- listOfFiles = testFolder.listFiles();
- for (File file : listOfFiles) {
- WSDEvaluator evaluator = new WSDEvaluator(lesk);
- if (file.isFile()) {
- // don't take verbs because they are not from WordNet
- if (!file.getName().split("\\.")[1].equals("v")) {
- HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
- .extractWordSenses(dict, map, file.getName());
- ArrayList<WTDIMS> instances = getTestData(file.getAbsolutePath(),
- senses);
+ ArrayList<String> words = seReader.getSensevalWords();
- if (instances != null) {
- Constants.print("------------------" + file.getName()
- + "------------------");
- for (WordToDisambiguate instance : instances) {
- // Constants.print("sense IDs : " + instance.senseIDs);
- evaluator.evaluateSample(instance);
- }
- Constants.print(evaluator.toString());
- } else {
- Constants.print("null instances");
+ for (String word : words) {
+ WSDEvaluator evaluator = new WSDEvaluator(lesk);
+
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+
+ ArrayList<WTDIMS> instances = getTestData(word);
+
+ if (instances != null) {
+ Constants.print("------------------" + word + "------------------");
+ for (WordToDisambiguate instance : instances) {
+
+ if (instance.getSenseIDs() != null
+ && !instance.getSenseIDs().get(0).equals("null")) {
+ evaluator.evaluateSample(instance);
}
}
+ Constants.print(evaluator.toString());
+ } else {
+ Constants.print("null instances");
}
}
+
}
}
- protected static ArrayList<WTDIMS> getTestData(String testFile,
- HashMap<String, ArrayList<DictionaryInstance>> senses) {
- /**
- * word tag has to be in the format "word.POS" (e.g., "activate.v",
- * "smart.a", etc.)
- */
- ArrayList<WTDIMS> trainingData = dExtractor.extractWSDInstances(testFile);
+ protected static ArrayList<WTDIMS> getTestData(String wordTag) {
- // HashMap<Integer, WTDIMS> trainingData =
- // dExtractor.extractWSDInstances(wordTrainingxmlFile);
- for (WTDIMS data : trainingData) {
- for (String senseId : data.getSenseIDs()) {
- for (String dictKey : senses.keySet()) {
- for (DictionaryInstance instance : senses.get(dictKey)) {
- if (senseId.equals(instance.getId())) {
- data.setSense(Integer.parseInt(dictKey.split("_")[1]));
- break;
- }
- }
+ ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+ for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+ WTDIMS wtdims = new WTDIMS(wtd);
+ if (wtdims != null) {
+ if (wtdims.getSenseIDs().get(0) != null
+ && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {
+ instances.add(wtdims);
}
}
}
-
- return trainingData;
+ return instances;
}
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
index 3e6f94d..9d24484 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
@@ -21,7 +21,7 @@
import java.util.ArrayList;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.WTDIMS;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.disambiguator.mfs.MFSParameters;