OPENNLP-801 Also includes some more cleanups. Thanks to Anthony Beylerian for providing a patch!
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
index 8af6c2c..bc61e9f 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
@@ -31,10 +31,12 @@
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.PerformanceMonitor;
import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDSampleStream;
import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -85,7 +87,7 @@
WSDSample sample = WSDSample.parse(line);
- Constants.printResults(disambiguator,
+ WSDHelper.printResults(disambiguator,
disambiguator.disambiguate(sample));
perfMon.incrementCounter();
@@ -105,7 +107,9 @@
if (params.getType().equalsIgnoreCase("mfs")) {
wsd = new MFS();
} else if (params.getType().equalsIgnoreCase("lesk")) {
+ wsd = new Lesk();
} else if (params.getType().equalsIgnoreCase("ims")) {
+ wsd = new IMS();
}
return wsd;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
index fcb9d6b..3dfd00d 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
@@ -55,7 +55,7 @@
private String[] extractPosOfSurroundingWords(String[] sentence,
int wordIndex, int windowSize) {
- String[] taggedSentence = Loader.getTagger().tag(sentence);
+ String[] taggedSentence = WSDHelper.getTagger().tag(sentence);
String[] tags = new String[2 * windowSize + 1];
@@ -75,20 +75,20 @@
private String[] extractSurroundingWords(String[] sentence, int wordIndex) {
- String[] posTags = Loader.getTagger().tag(sentence);
+ String[] posTags = WSDHelper.getTagger().tag(sentence);
ArrayList<String> contextWords = new ArrayList<String>();
for (int i = 0; i < sentence.length; i++) {
- if (!Constants.stopWords.contains(sentence[i].toLowerCase())
+ if (!WSDHelper.stopWords.contains(sentence[i].toLowerCase())
&& (wordIndex != i)) {
String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();
// if (!word.equals("") /*&& Constants.isRelevant(posTags[i])*/) {
- if (Loader.getEnglishWords().containsKey(word)) {
- String lemma = Loader.getLemmatizer().lemmatize(word, posTags[i]);
+ if (WSDHelper.getEnglishWords().containsKey(word)) {
+ String lemma = WSDHelper.getLemmatizer().lemmatize(word, posTags[i]);
contextWords.add(lemma);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
index aadb6f3..e84b72e 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
@@ -42,6 +42,11 @@
public ArrayList<Synset> hyponyms = new ArrayList<Synset>();
public ArrayList<Synset> meronyms = new ArrayList<Synset>();
public ArrayList<Synset> holonyms = new ArrayList<Synset>();
+ public ArrayList<Synset> entailments = new ArrayList<Synset>();
+ public ArrayList<Synset> coordinateTerms = new ArrayList<Synset>();
+ public ArrayList<Synset> causes = new ArrayList<Synset>();
+ public ArrayList<Synset> attributes = new ArrayList<Synset>();
+ public ArrayList<Synset> pertainyms = new ArrayList<Synset>();
public ArrayList<WordPOS> synonyms = new ArrayList<WordPOS>();
@@ -139,6 +144,101 @@
}
}
+ public void setEntailments() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList pentailments = new PointerTargetNodeList();
+ try {
+ pentailments = PointerUtils.getEntailments(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the hypernyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pentailments.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pentailments.get(i);
+ this.entailments.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setCoordinateTerms() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList pcoordinateTerms = new PointerTargetNodeList();
+ try {
+ pcoordinateTerms = PointerUtils.getCoordinateTerms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the coordinate terms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pcoordinateTerms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pcoordinateTerms.get(i);
+ this.coordinateTerms.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setCauses() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList pcauses = new PointerTargetNodeList();
+ try {
+ pcauses = PointerUtils.getCauses(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the cause terms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pcauses.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pcauses.get(i);
+ this.causes.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setAttributes() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList pattributes = new PointerTargetNodeList();
+ try {
+ pattributes = PointerUtils.getAttributes(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the attributes");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < pattributes.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) pattributes.get(i);
+ this.attributes.add(ptn.getSynset());
+ }
+
+ }
+
+ public void setPertainyms() {
+ // PointerUtils pointerUtils = PointerUtils.get();
+ PointerTargetNodeList ppertainyms = new PointerTargetNodeList();
+ try {
+ ppertainyms = PointerUtils.getPertainyms(this.synset);
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ } catch (NullPointerException e) {
+ System.err.println("Error finding the pertainyms");
+ e.printStackTrace();
+ }
+
+ for (int i = 0; i < ppertainyms.size(); i++) {
+ PointerTargetNode ptn = (PointerTargetNode) ppertainyms.get(i);
+ this.pertainyms.add(ptn.getSynset());
+ }
+
+ }
+
public void setSynonyms() {
for (Word word : synset.getWords())
synonyms.add(new WordPOS(word.toString(), word.getPOS()));
@@ -160,18 +260,38 @@
return holonyms;
}
+ public ArrayList<Synset> getEntailments() {
+ return entailments;
+ }
+
+ public ArrayList<Synset> getCoordinateTerms() {
+ return coordinateTerms;
+ }
+
+ public ArrayList<Synset> getCauses() {
+ return causes;
+ }
+
+ public ArrayList<Synset> getAttributes() {
+ return attributes;
+ }
+
+ public ArrayList<Synset> getPertainyms() {
+ return pertainyms;
+ }
+
public ArrayList<WordPOS> getSynonyms() {
return synonyms;
}
-
+
public String getGloss() {
return this.synset.getGloss().toString();
}
-
+
public long getSynsetID() {
return this.synset.getOffset();
}
-
+
/**
* Gets the senses of the nodes
*
@@ -182,8 +302,9 @@
ArrayList<WordSense> scoredSenses = new ArrayList<WordSense>();
for (int i = 0; i < nodes.size(); i++) {
- ArrayList<WordPOS> sensesComponents = PreProcessor
- .getAllRelevantWords(PreProcessor.tokenize(nodes.get(i).getGloss()));
+ ArrayList<WordPOS> sensesComponents = WSDHelper
+ .getAllRelevantWords(WSDHelper.getTokenizer().tokenize(
+ nodes.get(i).getGloss()));
WordSense wordSense = new WordSense();
nodes.get(i).setSenseRelevantWords(sensesComponents);
wordSense.setNode(nodes.get(i));
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
new file mode 100644
index 0000000..ae8c893
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
@@ -0,0 +1,664 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.dictionary.Dictionary;
+import net.sf.extjwnl.dictionary.MorphologicalProcessor;
+import opennlp.tools.cmdline.postag.POSModelLoader;
+import opennlp.tools.disambiguator.lesk.Lesk;
+import opennlp.tools.lemmatizer.SimpleLemmatizer;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+public class WSDHelper {
+
+ protected static TokenizerME tokenizer;
+ protected static POSTaggerME tagger;
+ protected static SimpleLemmatizer lemmatizer;
+ protected static Dictionary dictionary;
+ protected static MorphologicalProcessor morph;
+
+ protected static String tokenizerModelPath;
+ protected static String taggerModelPath;
+ protected static String lemmatizerDictionaryPath;
+
+ // local caches for faster lookup
+ private static HashMap<String, Object> stemCache;
+ private static HashMap<String, Object> stopCache;
+ private static HashMap<String, Object> relvCache;
+
+ private static HashMap<String, Object> englishWords;
+
+ // List of all the PoS tags
+ public static String[] allPOS = { "CC", "CD", "DT", "EX", "FW", "IN", "JJ",
+ "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
+ "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD",
+ "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB" };
+
+ // List of the PoS tags of which the senses are to be extracted
+ public static String[] relevantPOS = { "JJ", "JJR", "JJS", "NN", "NNS", "RB",
+ "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" };
+
+ // List of Negation Words
+ public static ArrayList<String> negationWords = new ArrayList<String>(
+ Arrays.asList("not", "no", "never", "none", "nor", "non"));
+
+ // List of Stop Words
+ public static ArrayList<String> stopWords = new ArrayList<String>(
+ Arrays.asList("a", "able", "about", "above", "according", "accordingly",
+ "across", "actually", "after", "afterwards", "again", "against",
+ "ain't", "all", "allow", "allows", "almost", "alone", "along",
+ "already", "also", "although", "always", "am", "among", "amongst",
+ "an", "and", "another", "any", "anybody", "anyhow", "anyone",
+ "anything", "anyway", "anyways", "anywhere", "apart", "appear",
+ "appreciate", "appropriate", "are", "aren't", "around", "as",
+ "aside", "ask", "asking", "associated", "at", "available", "away",
+ "awfully", "be", "became", "because", "become", "becomes",
+ "becoming", "been", "before", "beforehand", "behind", "being",
+ "believe", "below", "beside", "besides", "best", "better", "between",
+ "beyond", "both", "brief", "but", "by", "came", "can", "cannot",
+ "cant", "can't", "cause", "causes", "certain", "certainly",
+ "changes", "clearly", "c'mon", "co", "com", "come", "comes",
+ "concerning", "consequently", "consider", "considering", "contain",
+ "containing", "contains", "corresponding", "could", "couldn't",
+ "course", "c's", "currently", "definitely", "described", "despite",
+ "did", "didn't", "different", "do", "does", "doesn't", "doing",
+ "done", "don't", "down", "downwards", "during", "each", "edu", "eg",
+ "eight", "either", "else", "elsewhere", "enough", "entirely",
+ "especially", "et", "etc", "even", "ever", "every", "everybody",
+ "everyone", "everything", "everywhere", "ex", "exactly", "example",
+ "except", "far", "few", "fifth", "first", "five", "followed",
+ "following", "follows", "for", "former", "formerly", "forth", "four",
+ "from", "further", "furthermore", "get", "gets", "getting", "given",
+ "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings",
+ "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
+ "haven't", "having", "he", "hello", "help", "hence", "her", "here",
+ "hereafter", "hereby", "herein", "here's", "hereupon", "hers",
+ "herself", "he's", "hi", "him", "himself", "his", "hither",
+ "hopefully", "how", "howbeit", "however", "i", "i'd", "ie", "if",
+ "ignored", "i'll", "i'm", "immediate", "in", "inasmuch", "inc",
+ "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
+ "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll",
+ "its", "it's", "itself", "i've", "just", "keep", "keeps", "kept",
+ "know", "known", "knows", "last", "lately", "later", "latter",
+ "latterly", "least", "less", "lest", "let", "let's", "like", "liked",
+ "likely", "little", "look", "looking", "looks", "ltd", "mainly",
+ "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might",
+ "more", "moreover", "most", "mostly", "much", "must", "my", "myself",
+ "name", "namely", "nd", "near", "nearly", "necessary", "need",
+ "needs", "neither", "never", "nevertheless", "new", "next", "nine",
+ "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
+ "nothing", "novel", "now", "nowhere", "obviously", "of", "off",
+ "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones",
+ "only", "onto", "or", "other", "others", "otherwise", "ought", "our",
+ "ours", "ourselves", "out", "outside", "over", "overall", "own",
+ "particular", "particularly", "per", "perhaps", "placed", "please",
+ "plus", "possible", "presumably", "probably", "provides", "que",
+ "quite", "qv", "rather", "rd", "re", "really", "reasonably",
+ "regarding", "regardless", "regards", "relatively", "respectively",
+ "right", "said", "same", "saw", "say", "saying", "says", "second",
+ "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
+ "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
+ "seven", "several", "shall", "she", "should", "shouldn't", "since",
+ "six", "so", "some", "somebody", "somehow", "someone", "something",
+ "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
+ "specified", "specify", "specifying", "still", "sub", "such", "sup",
+ "sure", "take", "taken", "tell", "tends", "th", "than", "thank",
+ "thanks", "thanx", "that", "thats", "that's", "the", "their",
+ "theirs", "them", "themselves", "then", "thence", "there",
+ "thereafter", "thereby", "therefore", "therein", "theres", "there's",
+ "thereupon", "these", "they", "they'd", "they'll", "they're",
+ "they've", "think", "third", "this", "thorough", "thoroughly",
+ "those", "though", "three", "through", "throughout", "thru", "thus",
+ "to", "together", "too", "took", "toward", "towards", "tried",
+ "tries", "truly", "try", "trying", "t's", "twice", "two", "un",
+ "under", "unfortunately", "unless", "unlikely", "until", "unto",
+ "up", "upon", "us", "use", "used", "useful", "uses", "using",
+ "usually", "value", "various", "very", "via", "viz", "vs", "want",
+ "wants", "was", "wasn't", "way", "we", "we'd", "welcome", "well",
+ "we'll", "went", "were", "we're", "weren't", "we've", "what",
+ "whatever", "what's", "when", "whence", "whenever", "where",
+ "whereafter", "whereas", "whereby", "wherein", "where's",
+ "whereupon", "wherever", "whether", "which", "while", "whither",
+ "who", "whoever", "whole", "whom", "who's", "whose", "why", "will",
+ "willing", "wish", "with", "within", "without", "wonder", "won't",
+ "would", "wouldn't", "yes", "yet", "you", "you'd", "you'll", "your",
+ "you're", "yours", "yourself", "yourselves", "you've", "zero"));
+
+ public static HashMap<String, Object> getRelvCache() {
+ if (relvCache == null || relvCache.keySet().isEmpty()) {
+ relvCache = new HashMap<String, Object>();
+ for (String t : relevantPOS) {
+ relvCache.put(t, null);
+ }
+ }
+ return relvCache;
+ }
+
+ public static HashMap<String, Object> getStopCache() {
+ if (stopCache == null || stopCache.keySet().isEmpty()) {
+ stopCache = new HashMap<String, Object>();
+ for (String s : stopWords) {
+ stopCache.put(s, null);
+ }
+ }
+ return stopCache;
+ }
+
+ public static HashMap<String, Object> getStemCache() {
+ if (stemCache == null || stemCache.keySet().isEmpty()) {
+ stemCache = new HashMap<String, Object>();
+ for (Object pos : POS.getAllPOS()) {
+ stemCache.put(((POS) pos).getKey(), new HashMap());
+ }
+ }
+ return stemCache;
+ }
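+
+ // Note: the stem cache is keyed by the WordNet POS key; each value is a
+ // HashMap from a word to its list of base forms, or to null once the word
+ // is known to have no stem (see Stem(WordPOS) below, which fills it lazily).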
+
+ public static HashMap<String, Object> getEnglishWords() {
+ if (englishWords == null || englishWords.keySet().isEmpty()) {
+ englishWords = getEnglishWords(lemmatizerDictionaryPath);
+ }
+ return englishWords;
+ }
+
+ public static MorphologicalProcessor getMorph() {
+ if (morph == null) {
+ getDictionary();
+ morph = dictionary.getMorphologicalProcessor();
+ }
+ return morph;
+ }
+
+ public static Dictionary getDictionary() {
+ if (dictionary == null) {
+ try {
+ dictionary = Dictionary.getDefaultResourceInstance();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ return dictionary;
+ }
+
+ public static SimpleLemmatizer getLemmatizer() {
+ if (lemmatizer == null) {
+ try {
+ lemmatizer = new SimpleLemmatizer(new FileInputStream(
+ lemmatizerDictionaryPath));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ return lemmatizer;
+ }
+
+ public static POSTaggerME getTagger() {
+ if (tagger == null) {
+ tagger = new POSTaggerME(new POSModelLoader().load(new File(
+ taggerModelPath)));
+ }
+ return tagger;
+ }
+
+ public static TokenizerME getTokenizer() {
+ if (tokenizer == null) {
+ try {
+ tokenizer = new TokenizerME(new TokenizerModel(new FileInputStream(
+ tokenizerModelPath)));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+ return tokenizer;
+ }
+
+ public static TokenizerME loadTokenizer(String path) {
+ tokenizerModelPath = path;
+ return getTokenizer();
+ }
+
+ public static POSTaggerME loadTagger(String path) {
+ taggerModelPath = path;
+ return getTagger();
+ }
+
+ public static SimpleLemmatizer loadLemmatizer(String path) {
+ lemmatizerDictionaryPath = path;
+ return getLemmatizer();
+ }
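+
+ // Illustrative setup (the model file names below are assumptions, not part
+ // of this patch): load the components once before disambiguating, e.g.
+ //   WSDHelper.loadTokenizer("en-token.bin");
+ //   WSDHelper.loadTagger("en-pos-maxent.bin");
+ //   WSDHelper.loadLemmatizer("en-lemmatizer.dict");
+ // The corresponding getters then lazily build the tokenizer, tagger and
+ // lemmatizer from these paths on first use.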
+
+ /*
+ * checks whether the word contains at least one digit
+ */
+ public static boolean containsNumbers(String word) {
+ return word.matches(".*[0-9].*");
+ }
+
+ // Prints the disambiguation results to the console
+ public static void printResults(WSDisambiguator disambiguator,
+ String[] results) {
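+ // Each result string is expected to be space-separated, with the sense key
+ // in the second field and, for Lesk, a numeric score in the third field;
+ // the sense key is then resolved to its WordNet gloss for display.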
+
+ if (results != null) {
+
+ String[] parts;
+ String sensekey;
+ if (disambiguator instanceof Lesk) {
+
+ Double score;
+
+ for (int i = 0; i < results.length; i++) {
+ parts = results[i].split(" ");
+ sensekey = parts[1];
+ score = Double.parseDouble(parts[2]);
+ try {
+ print("score : "
+ + score
+ + " for sense "
+ + i
+ + " : "
+ + sensekey
+ + " : "
+ + getDictionary().getWordBySenseKey(sensekey).getSynset()
+ .getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ } else {
+ for (int i = 0; i < results.length; i++) {
+ parts = results[i].split(" ");
+ sensekey = parts[1];
+ try {
+ print("sense "
+ + i
+ + " : "
+ + sensekey
+ + " : "
+ + getDictionary().getWordBySenseKey(sensekey).getSynset()
+ .getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ }
+
+ public static void print(Object in) {
+ if (in == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.println(in);
+ }
+ }
+
+ public static void print(Object[] array) {
+ if (array == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.println(Arrays.asList(array));
+ }
+ }
+
+ public static void print(Object[][] array) {
+ if (array == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.print("[");
+ for (int i = 0; i < array.length; i++) {
+ print(array[i]);
+ if (i != array.length - 1) {
+ System.out.print("\n");
+ }
+ print("]");
+ }
+ }
+ }
+
+ /**
+ * Extracts the list of all English words from the lemmatizer dictionary.
+ *
+ * @param dict
+ * the path to the dictionary file used by the simple Lemmatizer
+ * (i.e., "en-lemmatizer.dict")
+ *
+ * @return a map containing all the English words as keys, or null if the
+ * dictionary file does not exist
+ */
+ public static HashMap<String, Object> getEnglishWords(String dict) {
+
+ HashMap<String, Object> words = new HashMap<String, Object>();
+
+ BufferedReader br = null;
+
+ File file = new File(dict);
+
+ if (file.exists()) {
+
+ try {
+ br = new BufferedReader(new FileReader(file));
+ String line;
+ while ((line = br.readLine()) != null) {
+ String word = line.split("\\t")[0];
+ words.put(word, null);
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ return words;
+ } else {
+ return null;
+ }
+
+ }
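+
+ // Note: the dictionary file is read as a tab-separated list whose first
+ // column is the surface word; only that column is kept, and the map values
+ // are left null (the map is effectively used as a set).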
+
+ /**
+ * return the PoS (Class POS) out of the PoS-tag
+ *
+ * @param posTag
+ * PoS tag (e.g., "JJS", "NNP", etc.)
+ * @return the Part of Speech (type {@link POS})
+ */
+ public static POS getPOS(String posTag) {
+
+ ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ",
+ "JJR", "JJS"));
+ ArrayList<String> adverb = new ArrayList<String>(Arrays.asList("RB", "RBR",
+ "RBS"));
+ ArrayList<String> noun = new ArrayList<String>(Arrays.asList("NN", "NNS",
+ "NNP", "NNPS"));
+ ArrayList<String> verb = new ArrayList<String>(Arrays.asList("VB", "VBD",
+ "VBG", "VBN", "VBP", "VBZ"));
+
+ if (adjective.contains(posTag))
+ return POS.ADJECTIVE;
+ else if (adverb.contains(posTag))
+ return POS.ADVERB;
+ else if (noun.contains(posTag))
+ return POS.NOUN;
+ else if (verb.contains(posTag))
+ return POS.VERB;
+ else
+ return null;
+
+ }
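+
+ // For example: getPOS("NNS") returns POS.NOUN, getPOS("VBD") returns
+ // POS.VERB, and getPOS("CC") returns null since conjunctions have no
+ // WordNet part of speech.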
+
+ /**
+ * Check whether a PoS Tag is relevant or not. A PoS Tag is considered
+ * relevant when it corresponds to:
+ * <ul>
+ * <li>VERB</li>
+ * <li>ADJECTIVE</li>
+ * <li>ADVERB</li>
+ * <li>NOUN</li>
+ * </ul>
+ *
+ * @param posTag
+ * the PoS Tag to verify the relevance.
+ * @return whether a PoS Tag corresponds to a relevant Part of Speech (type
+ * {@link POS}) or not ({@code true} if it is, {@code false} otherwise)
+ */
+ public static boolean isRelevant(String posTag) {
+ return getPOS(posTag) != null;
+ }
+
+ /**
+ * Check whether a PoS Tag is relevant or not. A PoS Tag is considered
+ * relevant when it is:
+ * <ul>
+ * <li>VERB</li>
+ * <li>ADJECTIVE</li>
+ * <li>ADVERB</li>
+ * <li>NOUN</li>
+ * </ul>
+ *
+ * @param pos
+ * The Part of Speech of Type {@link POS}
+ * @return whether a Part of Speech is relevant (true) or not (false)
+ */
+ public static boolean isRelevant(POS pos) {
+ return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
+ || pos.equals(POS.NOUN) || pos.equals(POS.VERB);
+ }
+
+ public static String getPOSabbreviation(String posTag) {
+
+ if (posTag == null) {
+ return null;
+ }
+ if (posTag.startsWith("JJ")) {
+ return "a";
+ } else if (posTag.startsWith("RB")) {
+ return "r";
+ } else if (posTag.startsWith("VB") || posTag.equals("MD")) {
+ return "v";
+ } else if (posTag.startsWith("NN")) {
+ return "n";
+ }
+
+ return null;
+
+ }
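+
+ // For example: getPOSabbreviation("JJR") returns "a", getPOSabbreviation("MD")
+ // returns "v", and tags with no WordNet counterpart return null.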
+
+ /**
+ * Check whether a list of arrays contains an array
+ *
+ * @param array
+ * The array to check
+ * @param fullList
+ * The full list of Arrays
+ * @return whether the {@link ArrayList} of arrays contains the array (true)
+ * or not (false)
+ */
+ public static boolean belongsTo(String[] array, ArrayList<String[]> fullList) {
+ for (String[] refArray : fullList) {
+ if (areStringArraysEqual(array, refArray))
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Check whether two arrays of strings are equal
+ *
+ * @param array1
+ * first array
+ * @param array2
+ * second array
+ * @return whether the two arrays are identical (true) or not (false)
+ */
+ public static boolean areStringArraysEqual(String[] array1, String[] array2) {
+
+ if (array1 == null || array2 == null)
+ return false;
+
+ if (array1.length != array2.length) {
+ return false;
+ }
+ for (int i = 0; i < array1.length; i++) {
+ if (!array1[i].equals(array2[i])) {
+ return false;
+ }
+ }
+
+ return true;
+
+ }
+
+ public static ArrayList<WordPOS> getAllRelevantWords(String[] sentence) {
+
+ ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+
+ String[] tags = WSDHelper.getTagger().tag(sentence);
+
+ for (int i = 0; i < sentence.length; i++) {
+ if (!WSDHelper.getStopCache().containsKey(sentence[i])) {
+ if (WSDHelper.getRelvCache().containsKey(tags[i])) {
+ relevantWords.add(new WordPOS(sentence[i], tags[i]));
+ }
+
+ }
+ }
+ return relevantWords;
+ }
+
+ public static ArrayList<WordPOS> getAllRelevantWords(WordToDisambiguate word) {
+ ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+
+ String[] tags = WSDHelper.getTagger().tag(word.getSentence());
+
+ for (int i = 0; i < word.getSentence().length; i++) {
+ if (!WSDHelper.getStopCache().containsKey(word.getSentence()[i])) {
+ if (WSDHelper.getRelvCache().containsKey(tags[i])) {
+ WordPOS wordpos = new WordPOS(word.getSentence()[i], tags[i]);
+ if (i == word.getWordIndex()) {
+ wordpos.isTarget = true;
+ }
+ relevantWords.add(wordpos);
+ }
+
+ }
+ }
+ return relevantWords;
+ }
+
+ public static ArrayList<WordPOS> getRelevantWords(WordToDisambiguate word,
+ int winBackward, int winForward) {
+
+ ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+
+ String[] sentence = word.getSentence();
+ String[] tags = WSDHelper.getTagger().tag(sentence);
+
+ int index = word.getWordIndex();
+
+ for (int i = index - winBackward; i <= index + winForward; i++) {
+
+ if (i >= 0 && i < sentence.length && i != index) {
+ if (!WSDHelper.getStopCache().containsKey(sentence[i])) {
+
+ if (WSDHelper.getRelvCache().containsKey(tags[i])) {
+ relevantWords.add(new WordPOS(sentence[i], tags[i]));
+ }
+
+ }
+ }
+ }
+ return relevantWords;
+ }
+
+ /**
+ * Stem a single word with the WordNet dictionary
+ *
+ * @param wordToStem
+ * word to be stemmed
+ * @return stemmed list of words
+ */
+ public static ArrayList<String> StemWordWithWordNet(WordPOS wordToStem) {
+ if (wordToStem == null)
+ return null;
+ ArrayList<String> stems = new ArrayList<String>();
+ try {
+ for (Object pos : POS.getAllPOS()) {
+ stems.addAll(WSDHelper.getMorph().lookupAllBaseForms((POS) pos,
+ wordToStem.getWord()));
+ }
+
+ if (stems.size() > 0)
+ return stems;
+ else {
+ return null;
+ }
+
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
+ /**
+ * Stem a single word. First tries to look up the word in the stemCache map;
+ * if the word is not found there, it is stemmed with WordNet and added to
+ * the cache.
+ *
+ * @param wordToStem
+ * word to be stemmed
+ * @return the stemmed word list, or null if the word is not recognized
+ */
+ public static ArrayList<String> Stem(WordPOS wordToStem) {
+
+ // check if we already cached the stem map
+ HashMap posMap = (HashMap) WSDHelper.getStemCache().get(
+ wordToStem.getPOS().getKey());
+
+ // don't check words with digits in them
+ if (WSDHelper.containsNumbers(wordToStem.getWord())) {
+ return null;
+ }
+
+ ArrayList<String> stemList = (ArrayList<String>) posMap.get(wordToStem
+ .getWord());
+ if (stemList != null) { // return it if we already cached it
+ return stemList;
+
+ } else { // unCached list try to stem it
+ stemList = StemWordWithWordNet(wordToStem);
+ if (stemList != null) {
+ // word was recognized and stemmed with wordnet:
+ // add it to cache and return the stemmed list
+ posMap.put(wordToStem.getWord(), stemList);
+ WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
+ return stemList;
+ } else { // could not be stemmed add it anyway (as incorrect with null
+ // list)
+ posMap.put(wordToStem.getWord(), null);
+ WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
+ return null;
+ }
+ }
+ }
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index f42f8c0..385b17e 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
@@ -24,10 +24,10 @@
*
*/
public abstract class WSDParameters {
-
+
protected boolean isCoarseSense;
public static boolean isStemCompare;
-
+
public static enum Source {
WORDNET
}
@@ -42,14 +42,14 @@
public void setCoarseSense(boolean isCoarseSense) {
this.isCoarseSense = isCoarseSense;
}
-
- public WSDParameters(){
+
+ public WSDParameters() {
this.isCoarseSense = true;
}
-
+
/**
* @return checks if the parameters are valid or not
*/
public abstract boolean isValid();
-
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
index 8c0fbe0..443686c 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
@@ -230,7 +230,7 @@
try {
return Dictionary
.getDefaultResourceInstance()
- .lookupIndexWord(Constants.getPOS(this.getTargetTag()),
+ .lookupIndexWord(WSDHelper.getPOS(this.getTargetTag()),
this.getTargetWord()).getSenses();
} catch (JWNLException e) {
e.printStackTrace();
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
index cc77002..af81c97 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
@@ -37,7 +37,7 @@
private POS pos;
private String posTag;
private int wordIndex;
- public boolean isTarget=false;
+ public boolean isTarget = false;
public WordPOS(String word, String tag) throws IllegalArgumentException {
if (word == null || tag == null) {
@@ -45,9 +45,9 @@
}
this.word = word;
this.posTag = tag;
- this.pos = Constants.getPOS(tag);
+ this.pos = WSDHelper.getPOS(tag);
}
-
+
public WordPOS(String word, POS pos) throws IllegalArgumentException {
if (word == null || pos == null) {
throw new IllegalArgumentException("Args are null");
@@ -70,7 +70,7 @@
public List getStems() {
if (stems == null) {
- return PreProcessor.Stem(this);
+ return WSDHelper.Stem(this);
} else {
return stems;
}
@@ -81,9 +81,9 @@
IndexWord indexWord;
try {
- indexWord = Loader.getDictionary().lookupIndexWord(pos, word);
+ indexWord = WSDHelper.getDictionary().lookupIndexWord(pos, word);
if (indexWord == null) {
- Constants
+ WSDHelper
.print("NULL synset probably a POS tagger mistake ! :: [POS] : "
+ pos.getLabel() + " [word] : " + word);
return null;
@@ -101,20 +101,17 @@
// check if there is intersection in the stems;
List originalList = this.getStems();
List listToCompare = wordToCompare.getStems();
-
-
- if (originalList == null || listToCompare == null) {
+
+ if (originalList == null || listToCompare == null) {
return false;
} else {
ListIterator<String> iterator = originalList.listIterator();
- while (iterator.hasNext())
- {
- iterator.set(iterator.next().toLowerCase());
+ while (iterator.hasNext()) {
+ iterator.set(iterator.next().toLowerCase());
}
iterator = listToCompare.listIterator();
- while (iterator.hasNext())
- {
- iterator.set(iterator.next().toLowerCase());
+ while (iterator.hasNext()) {
+ iterator.set(iterator.next().toLowerCase());
}
return !Collections.disjoint(originalList, listToCompare);
}
@@ -127,10 +124,10 @@
ArrayList<String> lemmas_word = new ArrayList();
ArrayList<String> lemmas_wordToCompare = new ArrayList();
- for (String pos : Constants.allPOS) {
- Loader.getLemmatizer().lemmatize(wordToCompare.getWord(), pos);
+ for (String pos : WSDHelper.allPOS) {
+ WSDHelper.getLemmatizer().lemmatize(wordToCompare.getWord(), pos);
}
return false;
}
-
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
index 1861f52..8fb2045 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
@@ -19,7 +19,8 @@
package opennlp.tools.disambiguator;
-import opennlp.tools.disambiguator.WSDSample;;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.SynNode;
public class WordSense implements Comparable {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
index 42f251e..4ea9276 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
@@ -37,7 +37,7 @@
protected int sense;
protected ArrayList<String> senseIDs;
-
+
public WordToDisambiguate(String[] sentence, int wordIndex)
throws IllegalArgumentException {
super();
@@ -47,7 +47,7 @@
}
this.sentence = sentence;
- this.posTags = PreProcessor.tag(sentence);
+ this.posTags = WSDHelper.getTagger().tag(sentence);
this.wordIndex = wordIndex;
@@ -63,7 +63,7 @@
}
this.sentence = sentence;
- this.posTags = PreProcessor.tag(sentence);
+ this.posTags = WSDHelper.getTagger().tag(sentence);
this.wordIndex = wordIndex;
@@ -79,14 +79,14 @@
}
this.sentence = sentence;
- this.posTags = PreProcessor.tag(sentence);
+ this.posTags = WSDHelper.getTagger().tag(sentence);
this.wordIndex = wordIndex;
this.senseIDs = senseIDs;
}
- public WordToDisambiguate(String[] sentence, String[] tokenTags, int wordIndex) {
+ public WordToDisambiguate(String[] sentence, String[] tokenTags, int wordIndex) {
this(sentence, wordIndex, -1);
}
@@ -125,20 +125,20 @@
public String getRawWord() {
- String wordBaseForm = Loader.getLemmatizer().lemmatize(
+ String wordBaseForm = WSDHelper.getLemmatizer().lemmatize(
this.sentence[wordIndex], this.posTags[wordIndex]);
String ref = "";
- if ((Constants.getPOS(this.posTags[wordIndex]) != null)) {
- if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
+ if ((WSDHelper.getPOS(this.posTags[wordIndex]) != null)) {
+ if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
ref = wordBaseForm + ".v";
- } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
+ } else if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
ref = wordBaseForm + ".n";
- } else if (Constants.getPOS(this.posTags[wordIndex])
+ } else if (WSDHelper.getPOS(this.posTags[wordIndex])
.equals(POS.ADJECTIVE)) {
ref = wordBaseForm + ".a";
- } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
+ } else if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
ref = wordBaseForm + ".r";
}
@@ -182,11 +182,10 @@
public String toString() {
return (wordIndex + "\t" + getWord() + "\n" + sentence);
}
-
+
public void print() {
- Constants.print("Sentence: " + Arrays.asList(sentence) + "\n" +
- "Index: " + wordIndex + "\n" +
- "Word: "+ getWord() + "\n" +
- "Sense ID: " + senseIDs.get(0));
+ WSDHelper.print("Sentence: " + Arrays.asList(sentence) + "\n" + "Index: "
+ + wordIndex + "\n" + "Word: " + getWord() + "\n" + "Sense ID: "
+ + senseIDs.get(0));
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
index 13a93c6..2d04d8d 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
@@ -19,7 +19,7 @@
package opennlp.tools.disambiguator.datareader;
-import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.WSDHelper;
public class Word {
@@ -245,14 +245,14 @@
if (this.lemma != null && iword.getLemma() != null) {
if (iword.getLemma().equals(this.getLemma())
- && Constants.getPOS(iword.getPos()).equals(
- Constants.getPOS(this.getPos()))) {
+ && WSDHelper.getPOS(iword.getPos()).equals(
+ WSDHelper.getPOS(this.getPos()))) {
return true;
}
} else {
if (this.word.equals(iword.getWord())
- && Constants.getPOSabbreviation(this.getPos()).equals(
- Constants.getPOSabbreviation(iword.getPos()))) {
+ && WSDHelper.getPOSabbreviation(this.getPos()).equals(
+ WSDHelper.getPOSabbreviation(iword.getPos()))) {
return true;
}
}
@@ -261,7 +261,7 @@
public boolean isInstanceOf(String wordTag) {
- String tag = Constants.getPOSabbreviation(this.getPos());
+ String tag = WSDHelper.getPOSabbreviation(this.getPos());
String oword = wordTag.split("\\.")[0];
String otag = wordTag.split("\\.")[1];
@@ -286,8 +286,8 @@
Word iword = (Word) oword;
if (iword.getLemma().equals(this.getLemma())
- && Constants.getPOS(iword.getPos()).equals(
- Constants.getPOS(this.getPos()))
+ && WSDHelper.getPOS(iword.getPos()).equals(
+ WSDHelper.getPOS(this.getPos()))
&& iword.getLexsn().equals(this.getLexsn())) {
return true;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
index e2580be..6a5fcad 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
@@ -22,8 +22,7 @@
import java.util.ArrayList;
import net.sf.extjwnl.data.POS;
-import opennlp.tools.disambiguator.Constants;
-import opennlp.tools.disambiguator.PreProcessor;
+import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WordToDisambiguate;
public class WTDIMS extends WordToDisambiguate {
@@ -49,8 +48,8 @@
// this.word = xmlWord;
- this.sentence = PreProcessor.tokenize(xmlSentence);
- this.posTags = PreProcessor.tag(this.sentence);
+ this.sentence = WSDHelper.getTokenizer().tokenize(xmlSentence);
+ this.posTags = WSDHelper.getTagger().tag(this.sentence);
for (int i = 0; i < sentence.length; i++) {
if (xmlrawWord.equals(sentence[i])) {
@@ -67,7 +66,7 @@
super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
this.senseIDs = wtd.getSenseIDs();
}
-
+
public WTDIMS(String[] sentence, int wordIndex, ArrayList<String> senseIDs) {
super(sentence, wordIndex);
this.senseIDs = senseIDs;
@@ -107,19 +106,19 @@
public String getWordTag() {
- String wordBaseForm = PreProcessor.lemmatize(this.getWord(),
+ String wordBaseForm = WSDHelper.getLemmatizer().lemmatize(this.getWord(),
this.getPosTag());
String ref = "";
- if ((Constants.getPOS(this.getPosTag()) != null)) {
- if (Constants.getPOS(this.getPosTag()).equals(POS.VERB)) {
+ if ((WSDHelper.getPOS(this.getPosTag()) != null)) {
+ if (WSDHelper.getPOS(this.getPosTag()).equals(POS.VERB)) {
ref = wordBaseForm + ".v";
- } else if (Constants.getPOS(this.getPosTag()).equals(POS.NOUN)) {
+ } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.NOUN)) {
ref = wordBaseForm + ".n";
- } else if (Constants.getPOS(this.getPosTag()).equals(POS.ADJECTIVE)) {
+ } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.ADJECTIVE)) {
ref = wordBaseForm + ".a";
- } else if (Constants.getPOS(this.getPosTag()).equals(POS.ADVERB)) {
+ } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.ADVERB)) {
ref = wordBaseForm + ".r";
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
index 960800c..7ede37f 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
@@ -23,10 +23,8 @@
import java.util.Collections;
import java.util.List;
-import opennlp.tools.disambiguator.Constants;
-import opennlp.tools.disambiguator.Loader;
+import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.SynNode;
-import opennlp.tools.disambiguator.PreProcessor;
import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDisambiguator;
@@ -62,7 +60,7 @@
}
/**
- * Initializes the loader object and sets the input parameters
+ * Initializes the WSDParameters object and sets the input parameters
*
* @param Input
* Parameters
@@ -103,8 +101,8 @@
/**
* The basic Lesk method where the entire context is considered for overlaps
*
- * @param The
- * word to disambiguate
+ * @param sample
+ * the word sample to disambiguate
* @return The array of WordSenses with their scores
*/
public ArrayList<WordSense> basic(WSDSample sample) {
@@ -144,37 +142,11 @@
/**
* The basic Lesk method but applied to a default context windows
*
- * @param The
- * word to disambiguate
+ * @param sample
+ * the word sample to disambiguate
* @return The array of WordSenses with their scores
*/
public ArrayList<WordSense> basicContextual(WSDSample sample) {
- return this.basicContextual(sample, LeskParameters.DFLT_WIN_SIZE);
- }
-
- /**
- * The basic Lesk method but applied to a custom context windows
- *
- * @param The
- * word to disambiguate
- * @param windowSize
- * @return The array of WordSenses with their scores
- */
- public ArrayList<WordSense> basicContextual(WSDSample sample, int windowSize) {
- return this.basicContextual(sample, windowSize, windowSize);
- }
-
- /**
- * The basic Lesk method but applied to a context windows set by custom
- * backward and forward window lengths
- *
- * @param wtd
- * the word to disambiguate
- * @param windowBackward
- * @return the array of WordSenses with their scores
- */
- public ArrayList<WordSense> basicContextual(WSDSample sample,
- int windowBackward, int windowForward) {
WordPOS word = new WordPOS(sample.getTargetWord(), sample.getTargetTag());
@@ -183,7 +155,8 @@
int index = sample.getTargetPosition();
- for (int i = index - windowBackward; i <= index + windowForward; i++) {
+ for (int i = index - getParams().win_b_size; i <= index
+ + getParams().win_f_size; i++) {
if (i >= 0 && i < sample.getSentence().length && i != index) {
contextWords.add(new WordPOS(sample.getSentence()[i],
sample.getTags()[i]));
@@ -222,27 +195,14 @@
* semantically related feature overlaps across the entire context The scoring
* function uses linear weights.
*
- * @param wtd
- * the word to disambiguate
- * @param depth
- * how deep to go into each feature tree
- * @param depthScoreWeight
- * the weighing per depth level
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
+ * @param sample
+ * the word sample to disambiguate
* @return the array of WordSenses with their scores
*/
- public ArrayList<WordSense> extended(WSDSample sample, int depth,
- double depthScoreWeight, boolean includeSynonyms,
- boolean includeHypernyms, boolean includeHyponyms,
- boolean includeMeronyms, boolean includeHolonyms) {
-
- return extendedContextual(sample, 0, depth, depthScoreWeight,
- includeSynonyms, includeHypernyms, includeHyponyms, includeMeronyms,
- includeHolonyms);
+ public ArrayList<WordSense> extended(WSDSample sample) {
+ params.setWin_b_size(0);
+ params.setWin_f_size(0);
+ return extendedContextual(sample);
}
@@ -251,117 +211,69 @@
* semantically related feature overlaps in a default context window The
* scoring function uses linear weights.
*
- * @param wtd
- * the word to disambiguate
- * @param depth
- * how deep to go into each feature tree
- * @param depthScoreWeight
- * the weighing per depth level
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
+ * @param sample
+ * the word sample to disambiguate
* @return the array of WordSenses with their scores
*/
- public ArrayList<WordSense> extendedContextual(WSDSample sample, int depth,
- double depthScoreWeight, boolean includeSynonyms,
- boolean includeHypernyms, boolean includeHyponyms,
- boolean includeMeronyms, boolean includeHolonyms) {
-
- return extendedContextual(sample, LeskParameters.DFLT_WIN_SIZE, depth,
- depthScoreWeight, includeSynonyms, includeHypernyms, includeHyponyms,
- includeMeronyms, includeHolonyms);
-
- }
-
- /**
- * An extended version of the Lesk approach that takes into consideration
- * semantically related feature overlaps in a custom context window The
- * scoring function uses linear weights.
- *
- * @param wtd
- * the word to disambiguate
- * @param windowSize
- * the custom context window size
- * @param depth
- * how deep to go into each feature tree
- * @param depthScoreWeight
- * the weighing per depth level
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
- * @return the array of WordSenses with their scores
- */
- public ArrayList<WordSense> extendedContextual(WSDSample sample,
- int windowSize, int depth, double depthScoreWeight,
- boolean includeSynonyms, boolean includeHypernyms,
- boolean includeHyponyms, boolean includeMeronyms, boolean includeHolonyms) {
-
- return extendedContextual(sample, windowSize, windowSize, depth,
- depthScoreWeight, includeSynonyms, includeHypernyms, includeHyponyms,
- includeMeronyms, includeHolonyms);
- }
-
- /**
- * An extended version of the Lesk approach that takes into consideration
- * semantically related feature overlaps in a custom context window The
- * scoring function uses linear weights.
- *
- * @param wtd
- * the word to disambiguate
- * @param windowBackward
- * the custom context backward window size
- * @param windowForward
- * the custom context forward window size
- * @param depth
- * how deep to go into each feature tree
- * @param depthScoreWeight
- * the weighing per depth level
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
- * @return the array of WordSenses with their scores
- */
- public ArrayList<WordSense> extendedContextual(WSDSample sample,
- int windowBackward, int windowForward, int depth,
- double depthScoreWeight, boolean includeSynonyms,
- boolean includeHypernyms, boolean includeHyponyms,
- boolean includeMeronyms, boolean includeHolonyms) {
-
- ArrayList<WordSense> scoredSenses = basicContextual(sample, windowBackward,
- windowForward);
-
+ public ArrayList<WordSense> extendedContextual(WSDSample sample) {
+ ArrayList<WordSense> scoredSenses;
+ if (params.getWin_b_size() == 0 && params.getWin_f_size() == 0) {
+ scoredSenses = basic(sample);
+ } else {
+ scoredSenses = basicContextual(sample);
+ }
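+ // The boolean feature flags map, by index, to: 0 synonyms, 1 hypernyms,
+ // 2 hyponyms, 3 meronyms, 4 holonyms, 5 entailments, 6 coordinate terms,
+ // 7 causes, 8 attributes, 9 pertainyms (see the checks below).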
for (WordSense wordSense : scoredSenses) {
- if (includeSynonyms) {
- wordSense.setScore(wordSense.getScore() + depthScoreWeight
+ if (getParams().getFeatures()[0]) {
+ wordSense.setScore(wordSense.getScore() + getParams().depth_weight
* assessSynonyms(wordSense.getNode().getSynonyms(), contextWords));
}
- if (includeHypernyms) {
+ if (getParams().getFeatures()[1]) {
fathomHypernyms(wordSense, wordSense.getNode().synset, contextWords,
- depth, depth, depthScoreWeight);
+ params.depth, params.depth, params.depth_weight);
}
- if (includeHyponyms) {
+ if (getParams().getFeatures()[2]) {
fathomHyponyms(wordSense, wordSense.getNode().synset, contextWords,
- depth, depth, depthScoreWeight);
+ params.depth, params.depth, params.depth_weight);
}
- if (includeMeronyms) {
+ if (getParams().getFeatures()[3]) {
fathomMeronyms(wordSense, wordSense.getNode().synset, contextWords,
- depth, depth, depthScoreWeight);
+ params.depth, params.depth, params.depth_weight);
}
- if (includeHolonyms) {
+ if (getParams().getFeatures()[4]) {
fathomHolonyms(wordSense, wordSense.getNode().synset, contextWords,
- depth, depth, depthScoreWeight);
+ params.depth, params.depth, params.depth_weight);
+
+ }
+
+ if (getParams().getFeatures()[5]) {
+ fathomEntailments(wordSense, wordSense.getNode().synset, contextWords,
+ params.depth, params.depth, params.depth_weight);
+
+ }
+ if (getParams().getFeatures()[6]) {
+ fathomCoordinateTerms(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.depth_weight);
+
+ }
+ if (getParams().getFeatures()[7]) {
+ fathomCauses(wordSense, wordSense.getNode().synset, contextWords,
+ params.depth, params.depth, params.depth_weight);
+
+ }
+ if (getParams().getFeatures()[8]) {
+ fathomAttributes(wordSense, wordSense.getNode().synset, contextWords,
+ params.depth, params.depth, params.depth_weight);
+
+ }
+ if (getParams().getFeatures()[9]) {
+ fathomPertainyms(wordSense, wordSense.getNode().synset, contextWords,
+ params.depth, params.depth, params.depth_weight);
}
@@ -371,61 +283,20 @@
}
- /**
+ /*
* An extended version of the Lesk approach that takes into consideration
* semantically related feature overlaps in all the context. The scoring
* function uses exponential weights.
*
- * @param wtd
- * the word to disambiguate
- * @param depth
- * how deep to go into each feature tree
- * @param intersectionExponent
- * @param depthExponent
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
- * @return the array of WordSenses with their scores
- */
- public ArrayList<WordSense> extendedExponential(WSDSample sample, int depth,
- double intersectionExponent, double depthExponent,
- boolean includeSynonyms, boolean includeHypernyms,
- boolean includeHyponyms, boolean includeMeronyms, boolean includeHolonyms) {
-
- return extendedExponentialContextual(sample, 0, depth,
- intersectionExponent, depthExponent, includeSynonyms, includeHypernyms,
- includeHyponyms, includeMeronyms, includeHolonyms);
-
- }
-
- /**
- * An extended version of the Lesk approach that takes into consideration
- * semantically related feature overlaps in a default window in the context.
- * The scoring function uses exponential weights.
+ * @param sample the word sample to disambiguate
*
- * @param wtd
- * the word to disambiguate
- * @param depth
- * how deep to go into each feature tree
- * @param intersectionExponent
- * @param depthExponent
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
* @return the array of WordSenses with their scores
*/
- public ArrayList<WordSense> extendedExponentialContextual(WSDSample sample,
- int depth, double intersectionExponent, double depthExponent,
- boolean includeSynonyms, boolean includeHypernyms,
- boolean includeHyponyms, boolean includeMeronyms, boolean includeHolonyms) {
+ public ArrayList<WordSense> extendedExponential(WSDSample sample) {
+ params.setWin_b_size(0);
+ params.setWin_f_size(0);
+ return extendedExponentialContextual(sample);
- return extendedExponentialContextual(sample, LeskParameters.DFLT_WIN_SIZE,
- depth, intersectionExponent, depthExponent, includeSynonyms,
- includeHypernyms, includeHyponyms, includeMeronyms, includeHolonyms);
}
/**
@@ -433,87 +304,73 @@
* semantically related feature overlaps in a custom window in the context.
* The scoring function uses exponential weights.
*
- * @param wtd
- * the word to disambiguate
- * @param windowSize
- * @param depth
- * how deep to go into each feature tree
- * @param intersectionExponent
- * @param depthExponent
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
+ * @param sample
+ * the word sample to disambiguate
* @return the array of WordSenses with their scores
*/
- public ArrayList<WordSense> extendedExponentialContextual(WSDSample sample,
- int windowSize, int depth, double intersectionExponent,
- double depthExponent, boolean includeSynonyms, boolean includeHypernyms,
- boolean includeHyponyms, boolean includeMeronyms, boolean includeHolonyms) {
-
- return extendedExponentialContextual(sample, windowSize, windowSize, depth,
- intersectionExponent, depthExponent, includeSynonyms, includeHypernyms,
- includeHyponyms, includeMeronyms, includeHolonyms);
- }
-
- /**
- * An extended version of the Lesk approach that takes into consideration
- * semantically related feature overlaps in a custom window in the context.
- * The scoring function uses exponential weights.
- *
- * @param wtd
- * the word to disambiguate
- * @param windowBackward
- * @param windowForward
- * @param depth
- * @param intersectionExponent
- * @param depthExponent
- * @param includeSynonyms
- * @param includeHypernyms
- * @param includeHyponyms
- * @param includeMeronyms
- * @param includeHolonyms
- * @return the array of WordSenses with their scores
- */
- public ArrayList<WordSense> extendedExponentialContextual(WSDSample sample,
- int windowBackward, int windowForward, int depth,
- double intersectionExponent, double depthExponent,
- boolean includeSynonyms, boolean includeHypernyms,
- boolean includeHyponyms, boolean includeMeronyms, boolean includeHolonyms) {
-
- ArrayList<WordSense> scoredSenses = basicContextual(sample, windowForward,
- windowBackward);
+ public ArrayList<WordSense> extendedExponentialContextual(WSDSample sample) {
+ ArrayList<WordSense> scoredSenses;
+ if (params.getWin_b_size() == 0 && params.getWin_f_size() == 0) {
+ scoredSenses = basic(sample);
+ } else {
+ scoredSenses = basicContextual(sample);
+ }
for (WordSense wordSense : scoredSenses) {
- if (includeSynonyms) {
+ if (params.features[0]) {
wordSense.setScore(wordSense.getScore()
+ Math
.pow(
assessSynonyms(wordSense.getNode().getSynonyms(),
- contextWords), intersectionExponent));
+ contextWords), params.iexp));
}
- if (includeHypernyms) {
+ if (params.features[1]) {
fathomHypernymsExponential(wordSense, wordSense.getNode().synset,
- contextWords, depth, depth, intersectionExponent, depthExponent);
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
}
- if (includeHyponyms) {
+ if (params.features[2]) {
fathomHyponymsExponential(wordSense, wordSense.getNode().synset,
- contextWords, depth, depth, intersectionExponent, depthExponent);
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
}
- if (includeMeronyms) {
+ if (params.features[3]) {
fathomMeronymsExponential(wordSense, wordSense.getNode().synset,
- contextWords, depth, depth, intersectionExponent, depthExponent);
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
}
- if (includeHolonyms) {
+ if (params.features[4]) {
fathomHolonymsExponential(wordSense, wordSense.getNode().synset,
- contextWords, depth, depth, intersectionExponent, depthExponent);
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
+
+ }
+
+ if (params.features[5]) {
+ fathomEntailmentsExponential(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
+ }
+
+ if (params.features[6]) {
+ fathomCoordinateTermsExponential(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
+
+ }
+ if (params.features[7]) {
+ fathomCausesExponential(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
+
+ }
+ if (params.features[8]) {
+ fathomAttributesExponential(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
+
+ }
+ if (params.features[9]) {
+ fathomPertainymsExponential(wordSense, wordSense.getNode().synset,
+ contextWords, params.depth, params.depth, params.iexp, params.dexp);
}
@@ -539,9 +396,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -573,9 +430,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -606,9 +463,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -641,9 +498,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -675,9 +532,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -710,9 +567,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -744,9 +601,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -779,9 +636,9 @@
if (depth == 0)
return;
- String[] tokenizedGloss = Loader.getTokenizer().tokenize(
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
child.getGloss().toString());
- ArrayList<WordPOS> relvGlossWords = PreProcessor
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
.getAllRelevantWords(tokenizedGloss);
SynNode childNode = new SynNode(child, relvGlossWords);
@@ -797,6 +654,246 @@
}
}
+ private void fathomEntailments(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setEntailements();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getEntailments(), relvWords));
+ for (Synset entailment : childNode.getEntailments()) {
+ fathomEntailments(wordSense, entailment, relvGlossWords, depth - 1,
+ maxDepth, depthScoreWeight);
+ }
+
+ }
+
+ private void fathomEntailmentsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setEntailements();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(assessFeature(childNode.getEntailments(), relvWords),
+ intersectionExponent) / Math.pow(depth, depthScoreExponent));
+ for (Synset entailment : childNode.getEntailments()) {
+ fathomEntailmentsExponential(wordSense, entailment, relvGlossWords,
+ depth - 1, maxDepth, intersectionExponent, depthScoreExponent);
+ }
+
+ }
+
+ private void fathomCoordinateTerms(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setCoordinateTerms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getCoordinateTerms(), relvWords));
+ for (Synset coordinate : childNode.getCoordinateTerms()) {
+ fathomCoordinateTerms(wordSense, coordinate, relvGlossWords, depth - 1,
+ maxDepth, depthScoreWeight);
+ }
+
+ }
+
+ private void fathomCoordinateTermsExponential(WordSense wordSense,
+ Synset child, ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setCoordinateTerms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(assessFeature(childNode.getCoordinateTerms(), relvWords),
+ intersectionExponent) / Math.pow(depth, depthScoreExponent));
+ for (Synset coordinate : childNode.getCoordinateTerms()) {
+ fathomCoordinateTermsExponential(wordSense, coordinate, relvGlossWords,
+ depth - 1, maxDepth, intersectionExponent, depthScoreExponent);
+ }
+
+ }
+
+ private void fathomCauses(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setCauses();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getCauses(), relvWords));
+ for (Synset cause : childNode.getCauses()) {
+ fathomCauses(wordSense, cause, relvGlossWords, depth - 1, maxDepth,
+ depthScoreWeight);
+ }
+
+ }
+
+ private void fathomCausesExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setCauses();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(assessFeature(childNode.getCauses(), relvWords),
+ intersectionExponent) / Math.pow(depth, depthScoreExponent));
+ for (Synset cause : childNode.getCauses()) {
+ fathomCausesExponential(wordSense, cause, relvGlossWords, depth - 1,
+ maxDepth, intersectionExponent, depthScoreExponent);
+ }
+
+ }
+
+ private void fathomAttributes(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setAttributes();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getAttributes(), relvWords));
+ for (Synset attribute : childNode.getAttributes()) {
+ fathomAttributes(wordSense, attribute, relvGlossWords, depth - 1,
+ maxDepth, depthScoreWeight);
+ }
+
+ }
+
+ private void fathomAttributesExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setAttributes();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(assessFeature(childNode.getAttributes(), relvWords),
+ intersectionExponent) / Math.pow(depth, depthScoreExponent));
+ for (Synset attribute : childNode.getAttributes()) {
+ fathomAttributesExponential(wordSense, attribute, relvGlossWords,
+ depth - 1, maxDepth, intersectionExponent, depthScoreExponent);
+ }
+
+ }
+
+ private void fathomPertainyms(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double depthScoreWeight) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setPertainyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(depthScoreWeight, maxDepth - depth + 1)
+ * assessFeature(childNode.getPertainyms(), relvWords));
+ for (Synset pertainym : childNode.getPertainyms()) {
+ fathomPertainyms(wordSense, pertainym, relvGlossWords, depth - 1,
+ maxDepth, depthScoreWeight);
+ }
+
+ }
+
+ private void fathomPertainymsExponential(WordSense wordSense, Synset child,
+ ArrayList<WordPOS> relvWords, int depth, int maxDepth,
+ double intersectionExponent, double depthScoreExponent) {
+ if (depth == 0)
+ return;
+
+ String[] tokenizedGloss = WSDHelper.getTokenizer().tokenize(
+ child.getGloss().toString());
+ ArrayList<WordPOS> relvGlossWords = WSDHelper
+ .getAllRelevantWords(tokenizedGloss);
+
+ SynNode childNode = new SynNode(child, relvGlossWords);
+
+ childNode.setPertainyms();
+ wordSense.setScore(wordSense.getScore()
+ + Math.pow(assessFeature(childNode.getPertainyms(), relvWords),
+ intersectionExponent) / Math.pow(depth, depthScoreExponent));
+ for (Synset pertainym : childNode.getPertainyms()) {
+ fathomPertainymsExponential(wordSense, pertainym, relvGlossWords,
+ depth - 1, maxDepth, intersectionExponent, depthScoreExponent);
+ }
+
+ }
+
/**
* Checks if the feature should be counted in the score
*
@@ -810,9 +907,9 @@
for (Synset synset : featureSynsets) {
SynNode subNode = new SynNode(synset, relevantWords);
- String[] tokenizedSense = Loader.getTokenizer().tokenize(
+ String[] tokenizedSense = WSDHelper.getTokenizer().tokenize(
subNode.getGloss());
- ArrayList<WordPOS> relvSenseWords = PreProcessor
+ ArrayList<WordPOS> relvSenseWords = WSDHelper
.getAllRelevantWords(tokenizedSense);
for (WordPOS senseWord : relvSenseWords) {
@@ -883,7 +980,7 @@
@Override
public String[] disambiguate(WSDSample sample) {
// if the word is not relevant return null
- if (!Constants.isRelevant(sample.getTargetTag())) {
+ if (!WSDHelper.isRelevant(sample.getTargetTag())) {
return null;
}
@@ -896,70 +993,20 @@
case LESK_BASIC_CTXT:
wsenses = basicContextual(sample);
break;
- case LESK_BASIC_CTXT_WIN:
- wsenses = basicContextual(sample, this.params.win_b_size);
- break;
- case LESK_BASIC_CTXT_WIN_BF:
- wsenses = basicContextual(sample, this.params.win_b_size,
- this.params.win_f_size);
- break;
case LESK_EXT:
- wsenses = extended(sample, this.params.depth, this.params.depth_weight,
- this.params.fathom_synonyms, this.params.fathom_hypernyms,
- this.params.fathom_hyponyms, this.params.fathom_meronyms,
- this.params.fathom_holonyms);
+ wsenses = extended(sample);
break;
case LESK_EXT_CTXT:
- wsenses = extendedContextual(sample, this.params.depth,
- this.params.depth_weight, this.params.fathom_synonyms,
- this.params.fathom_hypernyms, this.params.fathom_hyponyms,
- this.params.fathom_meronyms, this.params.fathom_holonyms);
- break;
- case LESK_EXT_CTXT_WIN:
- wsenses = extendedContextual(sample, this.params.win_b_size,
- this.params.depth, this.params.depth_weight,
- this.params.fathom_synonyms, this.params.fathom_hypernyms,
- this.params.fathom_hyponyms, this.params.fathom_meronyms,
- this.params.fathom_holonyms);
- break;
- case LESK_EXT_CTXT_WIN_BF:
- wsenses = extendedContextual(sample, this.params.win_b_size,
- this.params.win_f_size, this.params.depth, this.params.depth_weight,
- this.params.fathom_synonyms, this.params.fathom_hypernyms,
- this.params.fathom_hyponyms, this.params.fathom_meronyms,
- this.params.fathom_holonyms);
+ wsenses = extendedContextual(sample);
break;
case LESK_EXT_EXP:
- wsenses = extendedExponential(sample, this.params.depth,
- this.params.iexp, this.params.dexp, this.params.fathom_synonyms,
- this.params.fathom_hypernyms, this.params.fathom_hyponyms,
- this.params.fathom_meronyms, this.params.fathom_holonyms);
+ wsenses = extendedExponential(sample);
break;
case LESK_EXT_EXP_CTXT:
- wsenses = extendedExponentialContextual(sample, this.params.depth,
- this.params.iexp, this.params.dexp, this.params.fathom_synonyms,
- this.params.fathom_hypernyms, this.params.fathom_hyponyms,
- this.params.fathom_meronyms, this.params.fathom_holonyms);
- break;
- case LESK_EXT_EXP_CTXT_WIN:
- wsenses = extendedExponentialContextual(sample, this.params.win_b_size,
- this.params.depth, this.params.iexp, this.params.dexp,
- this.params.fathom_synonyms, this.params.fathom_hypernyms,
- this.params.fathom_hyponyms, this.params.fathom_meronyms,
- this.params.fathom_holonyms);
- break;
- case LESK_EXT_EXP_CTXT_WIN_BF:
- wsenses = extendedExponentialContextual(sample, this.params.win_b_size,
- this.params.win_f_size, this.params.depth, this.params.iexp,
- this.params.dexp, this.params.fathom_synonyms,
- this.params.fathom_hypernyms, this.params.fathom_hyponyms,
- this.params.fathom_meronyms, this.params.fathom_holonyms);
+ wsenses = extendedExponentialContextual(sample);
break;
default:
- wsenses = extendedExponentialContextual(sample,
- LeskParameters.DFLT_WIN_SIZE, LeskParameters.DFLT_DEPTH,
- LeskParameters.DFLT_IEXP, LeskParameters.DFLT_DEXP, true, true, true,
- true, true);
+ wsenses = extendedExponentialContextual(sample);
break;
}
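
A minimal sketch of driving the reworked Lesk API above: tuning now lives entirely in LeskParameters, and each disambiguate case takes only the sample. Package names, model paths, and the WordPOS accessors are assumptions taken from the files touched in this patch, not additions of the patch itself.

import java.util.List;

import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
import opennlp.tools.disambiguator.lesk.LeskParameters.LESK_TYPE;

public class LeskUsageSketch {

  public static void main(String[] args) {
    // Load the supporting OpenNLP models once, before any disambiguation.
    String modelsDir = "src/test/resources/models/";
    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

    // All tuning goes through LeskParameters instead of long argument lists.
    Lesk lesk = new Lesk();
    LeskParameters params = new LeskParameters();
    params.setLeskType(LESK_TYPE.LESK_EXT_EXP_CTXT);
    // One flag per relation: synonyms, hypernyms, hyponyms, meronyms,
    // holonyms, entailments, coordinate terms, causes, attributes, pertainyms.
    params.setFeatures(new boolean[] { true, true, true, true, true,
        false, false, false, false, false });
    lesk.setParams(params);

    // Build token/tag arrays the same way the testers below do.
    String text = "it was a strong argument that his hypothesis was true";
    String[] sentence = WSDHelper.getTokenizer().tokenize(text);
    List<WordPOS> words = WSDHelper.getAllRelevantWords(sentence);
    int targetWordIndex = 1;
    String[] tags = new String[words.size()];
    String[] tokens = new String[words.size()];
    for (int i = 0; i < words.size(); i++) {
      tags[i] = words.get(i).getPosTag();
      tokens[i] = words.get(i).getWord();
    }
    String targetLemma = WSDHelper.getLemmatizer().lemmatize(
        tokens[targetWordIndex], tags[targetWordIndex]);

    WSDHelper.printResults(lesk,
        lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma));
  }
}
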
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
index d97128e..2efeba3 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
@@ -32,16 +32,17 @@
*
*/
public static enum LESK_TYPE {
- LESK_BASIC, LESK_BASIC_CTXT, LESK_BASIC_CTXT_WIN, LESK_BASIC_CTXT_WIN_BF, LESK_EXT, LESK_EXT_CTXT, LESK_EXT_CTXT_WIN, LESK_EXT_CTXT_WIN_BF, LESK_EXT_EXP, LESK_EXT_EXP_CTXT, LESK_EXT_EXP_CTXT_WIN, LESK_EXT_EXP_CTXT_WIN_BF,
+ LESK_BASIC, LESK_BASIC_CTXT, LESK_EXT, LESK_EXT_CTXT, LESK_EXT_EXP, LESK_EXT_EXP_CTXT
}
-
+
// DEFAULTS
- protected static final LESK_TYPE DFLT_LESK_TYPE = LESK_TYPE.LESK_EXT_EXP_CTXT_WIN;
+ protected static final LESK_TYPE DFLT_LESK_TYPE = LESK_TYPE.LESK_EXT_EXP_CTXT;
protected static final Source DFLT_SOURCE = Source.WORDNET;
- protected static final int DFLT_WIN_SIZE = 3;
- protected static final int DFLT_DEPTH = 2;
- protected static final double DFLT_IEXP = 0.4;
- protected static final double DFLT_DEXP = 0.4;
+ protected static final int DFLT_WIN_SIZE = 10;
+ protected static final int DFLT_DEPTH = 1;
+ protected static final double DFLT_DEPTH_WEIGHT = 0.8;
+ protected static final double DFLT_IEXP = 0.3;
+ protected static final double DFLT_DEXP = 0.3;
protected LESK_TYPE leskType;
@@ -49,17 +50,17 @@
protected int win_f_size;
protected int win_b_size;
protected int depth;
-
- protected boolean fathom_synonyms;
- protected boolean fathom_hypernyms;
- protected boolean fathom_hyponyms;
- protected boolean fathom_meronyms;
- protected boolean fathom_holonyms;
-
protected double depth_weight;
protected double iexp;
protected double dexp;
+ /*
+ * 10 possible features for lesk:
+ * 0 : Synonyms        5 : Entailments
+ * 1 : Hypernyms       6 : Coordinate Terms
+ * 2 : Hyponyms        7 : Causes
+ * 3 : Meronyms        8 : Attributes
+ * 4 : Holonyms        9 : Pertainyms
+ */
+ protected boolean features[];
+
public LESK_TYPE getLeskType() {
return leskType;
}
@@ -92,46 +93,6 @@
this.depth = depth;
}
- public boolean isFathom_synonyms() {
- return fathom_synonyms;
- }
-
- public void setFathom_synonyms(boolean fathom_synonyms) {
- this.fathom_synonyms = fathom_synonyms;
- }
-
- public boolean isFathom_hypernyms() {
- return fathom_hypernyms;
- }
-
- public void setFathom_hypernyms(boolean fathom_hypernyms) {
- this.fathom_hypernyms = fathom_hypernyms;
- }
-
- public boolean isFathom_hyponyms() {
- return fathom_hyponyms;
- }
-
- public void setFathom_hyponyms(boolean fathom_hyponyms) {
- this.fathom_hyponyms = fathom_hyponyms;
- }
-
- public boolean isFathom_meronyms() {
- return fathom_meronyms;
- }
-
- public void setFathom_meronyms(boolean fathom_meronyms) {
- this.fathom_meronyms = fathom_meronyms;
- }
-
- public boolean isFathom_holonyms() {
- return fathom_holonyms;
- }
-
- public void setFathom_holonyms(boolean fathom_holonyms) {
- this.fathom_holonyms = fathom_holonyms;
- }
-
public double getDepth_weight() {
return depth_weight;
}
@@ -156,6 +117,14 @@
this.dexp = dexp;
}
+ public boolean[] getFeatures() {
+ return features;
+ }
+
+ public void setFeatures(boolean[] features) {
+ this.features = features;
+ }
+
public LeskParameters() {
this.setDefaults();
}
@@ -169,13 +138,11 @@
this.win_f_size = LeskParameters.DFLT_WIN_SIZE;
this.win_b_size = LeskParameters.DFLT_WIN_SIZE;
this.depth = LeskParameters.DFLT_DEPTH;
+ this.depth_weight = LeskParameters.DFLT_DEPTH_WEIGHT;
this.iexp = LeskParameters.DFLT_IEXP;
this.dexp = LeskParameters.DFLT_DEXP;
- this.fathom_holonyms = true;
- this.fathom_hypernyms = true;
- this.fathom_hyponyms = true;
- this.fathom_meronyms = true;
- this.fathom_synonyms = true;
+ boolean[] a = { true, true, true, true, true, true, true, true, true, true };
+ this.features = a;
}
/*
@@ -188,23 +155,13 @@
switch (this.leskType) {
case LESK_BASIC:
case LESK_BASIC_CTXT:
- return true;
- case LESK_BASIC_CTXT_WIN:
return (this.win_b_size == this.win_f_size) && this.win_b_size >= 0;
- case LESK_BASIC_CTXT_WIN_BF:
- return (this.win_b_size >= 0) && (this.win_f_size >= 0);
case LESK_EXT:
case LESK_EXT_CTXT:
- return (this.depth >= 0) && (this.depth_weight >= 0);
- case LESK_EXT_CTXT_WIN:
- case LESK_EXT_CTXT_WIN_BF:
return (this.depth >= 0) && (this.depth_weight >= 0)
&& (this.win_b_size >= 0) && (this.win_f_size >= 0);
case LESK_EXT_EXP:
case LESK_EXT_EXP_CTXT:
- return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0);
- case LESK_EXT_EXP_CTXT_WIN:
- case LESK_EXT_EXP_CTXT_WIN_BF:
return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0)
&& (this.win_b_size >= 0) && (this.win_f_size >= 0);
default:
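
The removed fathom_* accessors map one-to-one onto the new features array; a hedged sketch of migrating a caller, with the index order taken from the comment added above (the old setter names appear only as comments, quoted from the removed lines):

import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;

public class LeskFeatureMigration {

  public static void main(String[] args) {
    LeskParameters params = new LeskParameters();

    // Before this patch the same selection needed one call per relation, e.g.
    //   params.setFathom_synonyms(true);
    //   params.setFathom_hypernyms(true);
    //   params.setFathom_hyponyms(false); ... and so on.

    // Now it is one flag per relation, indexed as documented in
    // LeskParameters: 0 synonyms, 1 hypernyms, 2 hyponyms, 3 meronyms,
    // 4 holonyms, 5 entailments, 6 coordinate terms, 7 causes,
    // 8 attributes, 9 pertainyms.
    boolean[] features = { true, true, false, false, false,
        false, false, false, false, false };
    params.setFeatures(features);

    Lesk lesk = new Lesk();
    lesk.setParams(params);
  }
}
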
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
index cd11b1e..9bc044d 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
@@ -27,7 +27,7 @@
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.data.Word;
-import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDisambiguator;
@@ -53,10 +53,11 @@
}
@Deprecated
- public static String[] getMostFrequentSense(WordToDisambiguate wordToDisambiguate) {
+ public static String[] getMostFrequentSense(
+ WordToDisambiguate wordToDisambiguate) {
String word = wordToDisambiguate.getRawWord().toLowerCase();
- POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
+ POS pos = WSDHelper.getPOS(wordToDisambiguate.getPosTag());
if (pos != null) {
@@ -91,7 +92,7 @@
}
}
-
+
/*
* @return the most frequent senses from wordnet
*/
@@ -102,7 +103,7 @@
if (WSDParameters.isStemCompare) {
WordPOS wdPOS = new WordPOS(wd.getLemma(), wd.getPOS());
WordPOS samplePOS = new WordPOS(sample.getTargetLemma(),
- Constants.getPOS(sample.getTargetTag()));
+ WSDHelper.getPOS(sample.getTargetTag()));
if (wdPOS.isStemEquivalent(samplePOS)) {
try {
return WSDParameters.Source.WORDNET.name() + " " + wd.getSenseKey();
@@ -134,7 +135,7 @@
if (WSDParameters.isStemCompare) {
WordPOS wdPOS = new WordPOS(wd.getLemma(), wd.getPOS());
WordPOS samplePOS = new WordPOS(sample.getTargetLemma(),
- Constants.getPOS(sample.getTargetTag()));
+ WSDHelper.getPOS(sample.getTargetTag()));
if (wdPOS.isStemEquivalent(samplePOS)) {
try {
senseKeys[i] = WSDParameters.Source.WORDNET.name() + " "
@@ -145,7 +146,7 @@
}
break;
}
- }else{
+ } else {
if (wd.getLemma().equalsIgnoreCase((sample.getTargetLemma()))) {
try {
senseKeys[i] = WSDParameters.Source.WORDNET.name() + " "
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
index a2703d3..4dc3637 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
@@ -36,8 +36,14 @@
@Test
public static void main(String[] args) {
- Constants.print("Evaluation Started");
+ WSDHelper.print("Evaluation Started");
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
+
+
IMS ims = new IMS();
IMSParameters imsParams = new IMSParameters();
ims.setParams(imsParams);
@@ -52,16 +58,16 @@
ArrayList<WSDSample> instances = getTestData(word);
if (instances != null) {
- Constants.print("------------------" + word + "------------------");
+ WSDHelper.print("------------------" + word + "------------------");
for (WSDSample instance : instances) {
if (instance.getSenseIDs() != null
&& !instance.getSenseIDs().get(0).equals("null")) {
evaluator.evaluateSample(instance);
}
}
- Constants.print(evaluator.toString());
+ WSDHelper.print(evaluator.toString());
} else {
- Constants.print("null instances");
+ WSDHelper.print("null instances");
}
}
@@ -95,7 +101,7 @@
ArrayList<WSDSample> instances = new ArrayList<WSDSample>();
for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
- List<WordPOS> words = PreProcessor.getAllRelevantWords(wtd);
+ List<WordPOS> words = WSDHelper.getAllRelevantWords(wtd);
int targetWordIndex=0;
for (int i=0; i<words.size();i++){
if(words.get(i).isTarget){
@@ -108,7 +114,7 @@
tags[i] = words.get(i).getPosTag();
tokens[i] = words.get(i).getWord();
}
- String targetLemma = Loader.getLemmatizer().lemmatize(
+ String targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
WSDSample sample = new WSDSample(tokens,tags,targetWordIndex,targetLemma);
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
index fff8bdd..03e2e7d 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
@@ -35,19 +35,24 @@
public static void main(String[] args) {
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
+
IMS ims = new IMS();
String test1 = "Please write to me soon.";
- String[] sentence1 = Loader.getTokenizer().tokenize(test1);
- Constants.print(ims.disambiguate(sentence1, 1));
+ String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+ WSDHelper.print(ims.disambiguate(sentence1, 1));
String test2 = "it was a strong argument that his hypothesis was true";
- String[] sentence2 = Loader.getTokenizer().tokenize(test2);
- Constants.print(ims.disambiguate(sentence2, 3));
+ String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+ WSDHelper.print(ims.disambiguate(sentence2, 3));
String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
- String[] sentence3 = Loader.getTokenizer().tokenize(test3);
- Constants.print(ims.disambiguate(sentence3, 12));
+ String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+ WSDHelper.print(ims.disambiguate(sentence3, 12));
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
index bfb78a0..4c2fba3 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
@@ -35,11 +35,17 @@
@Test
public static void main(String[] args) {
- Constants.print("Evaluation Started");
-
+ WSDHelper.print("Evaluation Started");
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
Lesk lesk = new Lesk();
LeskParameters leskParams = new LeskParameters();
- leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
+ boolean a[] = { true, true, true, true, true, false, false, false, false,
+ false };
+ leskParams.setFeatures(a);
+ leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_CTXT);
lesk.setParams(leskParams);
ArrayList<String> words = seReader.getSensevalWords();
@@ -52,16 +58,16 @@
ArrayList<WSDSample> instances = getTestData(word);
if (instances != null) {
- Constants.print("------------------" + word + "------------------");
+ WSDHelper.print("------------------" + word + "------------------");
for (WSDSample instance : instances) {
if (instance.getSenseIDs() != null
&& !instance.getSenseIDs().get(0).equals("null")) {
evaluator.evaluateSample(instance);
}
}
- Constants.print(evaluator.toString());
+ WSDHelper.print(evaluator.toString());
} else {
- Constants.print("null instances");
+ WSDHelper.print("null instances");
}
}
}
@@ -71,23 +77,24 @@
ArrayList<WSDSample> instances = new ArrayList<WSDSample>();
for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
- List<WordPOS> words = PreProcessor.getAllRelevantWords(wtd);
- int targetWordIndex=0;
- for (int i=0; i<words.size();i++){
- if(words.get(i).isTarget){
+ List<WordPOS> words = WSDHelper.getAllRelevantWords(wtd);
+ int targetWordIndex = 0;
+ for (int i = 0; i < words.size(); i++) {
+ if (words.get(i).isTarget) {
targetWordIndex = i;
- }
+ }
}
String[] tags = new String[words.size()];
String[] tokens = new String[words.size()];
- for (int i=0;i<words.size();i++){
+ for (int i = 0; i < words.size(); i++) {
tags[i] = words.get(i).getPosTag();
tokens[i] = words.get(i).getWord();
}
- String targetLemma = Loader.getLemmatizer().lemmatize(
+ String targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
-
- WSDSample sample = new WSDSample(tokens,tags,targetWordIndex,targetLemma);
+
+ WSDSample sample = new WSDSample(tokens, tags, targetWordIndex,
+ targetLemma);
sample.setSenseIDs(wtd.getSenseIDs());
if (sample != null) {
if (sample.getSenseIDs().get(0) != null
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 02100c0..9f6f477 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -25,44 +25,49 @@
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
import opennlp.tools.disambiguator.lesk.LeskParameters.LESK_TYPE;
-import opennlp.tools.disambiguator.mfs.MFS;
import org.junit.Test;
public class LeskTester {
-
@Test
public static void main(String[] args) {
+
Lesk lesk = new Lesk();
LeskParameters params = new LeskParameters();
- params.setLeskType(LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
+ params.setLeskType(LESK_TYPE.LESK_EXT);
+ boolean a[] = { true, true, true, true, true, true, true, true, true, true };
+ params.setFeatures(a);
lesk.setParams(params);
-
- String test1 = "I went fishing for some sea bass.";
- String[] sentence = Loader.getTokenizer().tokenize(test1);
- List<WordPOS> words = PreProcessor.getAllRelevantWords(sentence);
- int targetWordIndex = 2;
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
+
+ String test1 = "I went to the bank to deposit money.";
+ String[] sentence = WSDHelper.getTokenizer().tokenize(test1);
+ List<WordPOS> words = WSDHelper.getAllRelevantWords(sentence);
+ int targetWordIndex = 0;
String[] tags = new String[words.size()];
String[] tokens = new String[words.size()];
for (int i=0;i<words.size();i++){
tags[i] = words.get(i).getPosTag();
tokens[i] = words.get(i).getWord();
- // Constants.print("token : "+ tokens[i] + "_" + tags[i]);
+ WSDHelper.print("token : "+ tokens[i] + "_" + tags[i]);
}
- String targetLemma = Loader.getLemmatizer().lemmatize(
+ String targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
// Constants.print("lemma : "+ targetLemma);
- Constants.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(lesk,
+ WSDHelper.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(lesk,
lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
String test2 = "it was a strong argument that his hypothesis was true";
- sentence = Loader.getTokenizer().tokenize(test2);
- words = PreProcessor.getAllRelevantWords(sentence);
+ sentence = WSDHelper.getTokenizer().tokenize(test2);
+ words = WSDHelper.getAllRelevantWords(sentence);
targetWordIndex = 1;
tags = new String[words.size()];
tokens = new String[words.size()];
@@ -72,19 +77,19 @@
//Constants.print("token : "+ tokens[i] + "_" + tags[i]);
}
- targetLemma = Loader.getLemmatizer().lemmatize(
+ targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
//Constants.print("lemma : "+ targetLemma);
- Constants.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(lesk,
+ WSDHelper.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(lesk,
lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
- sentence = Loader.getTokenizer().tokenize(test3);
- words = PreProcessor.getAllRelevantWords(sentence);
+ sentence = WSDHelper.getTokenizer().tokenize(test3);
+ words = WSDHelper.getAllRelevantWords(sentence);
targetWordIndex = 4;
tags = new String[words.size()];
tokens = new String[words.size()];
@@ -94,14 +99,14 @@
//Constants.print("token : "+ tokens[i] + "_" + tags[i]);
}
- targetLemma = Loader.getLemmatizer().lemmatize(
+ targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
//Constants.print("lemma : "+ targetLemma);
- Constants.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(lesk,
+ WSDHelper.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(lesk,
lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
}
}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
index 4dee747..369791d 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
@@ -34,8 +34,11 @@
@Test
public static void main(String[] args) {
- Constants.print("Evaluation Started");
-
+ WSDHelper.print("Evaluation Started");
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
MFS mfs = new MFS();
WSDParameters.isStemCompare = true;
@@ -50,16 +53,16 @@
ArrayList<WSDSample> instances = getTestData(word);
if (instances != null) {
- Constants.print("------------------" + word + "------------------");
+ WSDHelper.print("------------------" + word + "------------------");
for (WSDSample instance : instances) {
if (instance.getSenseIDs() != null
&& !instance.getSenseIDs().get(0).equals("null")) {
evaluator.evaluateSample(instance);
}
}
- Constants.print(evaluator.toString());
+ WSDHelper.print(evaluator.toString());
} else {
- Constants.print("null instances");
+ WSDHelper.print("null instances");
}
}
@@ -82,7 +85,7 @@
ArrayList<WSDSample> instances = new ArrayList<WSDSample>();
for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
- String targetLemma = Loader.getLemmatizer().lemmatize(wtd.getWord(),
+ String targetLemma = WSDHelper.getLemmatizer().lemmatize(wtd.getWord(),
wtd.getPosTag());
WSDSample sample = new WSDSample(wtd.getSentence(), wtd.getPosTags(),
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index e792d37..e42c655 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -30,11 +30,18 @@
public class MFSTester {
public static void main(String[] args) {
+
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
+
+
MFS mfs = new MFS();
String test1 = "I went fishing for some sea bass.";
- String[] sentence = Loader.getTokenizer().tokenize(test1);
- List<WordPOS> words = PreProcessor.getAllRelevantWords(sentence);
+ String[] sentence = WSDHelper.getTokenizer().tokenize(test1);
+ List<WordPOS> words = WSDHelper.getAllRelevantWords(sentence);
int targetWordIndex = 2;
String[] tags = new String[words.size()];
String[] tokens = new String[words.size()];
@@ -44,18 +51,18 @@
// Constants.print("token : "+ tokens[i] + "_" + tags[i]);
}
- String targetLemma = Loader.getLemmatizer().lemmatize(
+ String targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
// Constants.print("lemma : "+ targetLemma);
- Constants.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(mfs,
+ WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(mfs,
mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
String test2 = "it was a strong argument that his hypothesis was true";
- sentence = Loader.getTokenizer().tokenize(test2);
- words = PreProcessor.getAllRelevantWords(sentence);
+ sentence = WSDHelper.getTokenizer().tokenize(test2);
+ words = WSDHelper.getAllRelevantWords(sentence);
targetWordIndex = 1;
tags = new String[words.size()];
tokens = new String[words.size()];
@@ -65,19 +72,19 @@
//Constants.print("token : "+ tokens[i] + "_" + tags[i]);
}
- targetLemma = Loader.getLemmatizer().lemmatize(
+ targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
//Constants.print("lemma : "+ targetLemma);
- Constants.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(mfs,
+ WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(mfs,
mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
- sentence = Loader.getTokenizer().tokenize(test3);
- words = PreProcessor.getAllRelevantWords(sentence);
+ sentence = WSDHelper.getTokenizer().tokenize(test3);
+ words = WSDHelper.getAllRelevantWords(sentence);
targetWordIndex = 4;
tags = new String[words.size()];
tokens = new String[words.size()];
@@ -87,14 +94,14 @@
//Constants.print("token : "+ tokens[i] + "_" + tags[i]);
}
- targetLemma = Loader.getLemmatizer().lemmatize(
+ targetLemma = WSDHelper.getLemmatizer().lemmatize(
tokens[targetWordIndex], tags[targetWordIndex]);
//Constants.print("lemma : "+ targetLemma);
- Constants.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
- Constants.printResults(mfs,
+ WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma));
+ WSDHelper.printResults(mfs,
mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma));
- Constants.print("----------------------------------------");
+ WSDHelper.print("----------------------------------------");
}
}
\ No newline at end of file
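
The setup block repeated in each tester hard-codes Windows-style backslash paths; a small, hedged variant of the same calls with portable separators (the JDK resolves forward slashes on Windows as well, and the model file names are the ones used throughout this patch):

import opennlp.tools.disambiguator.WSDHelper;

public class ModelSetup {

  // Mirrors the per-tester setup above, but with separators that also
  // resolve on Unix-like systems.
  public static void loadModels(String modelsDir) {
    if (!modelsDir.endsWith("/")) {
      modelsDir = modelsDir + "/";
    }
    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
  }

  public static void main(String[] args) {
    loadModels("src/test/resources/models");
  }
}
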