OPENNLP-757 Applying bulk patch. Thanks to Mondher Bouazizi for providing a patch!
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
index ae98d59..3cd2780 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
@@ -21,9 +21,9 @@
import java.io.BufferedReader;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
-
import java.util.ArrayList;
import java.util.HashMap;
@@ -36,25 +36,28 @@
import org.w3c.dom.NodeList;
import opennlp.tools.disambiguator.DictionaryInstance;
-import opennlp.tools.disambiguator.DistributionInstance;
import opennlp.tools.disambiguator.ims.WTDIMS;
+/**
+ * This class handles the extraction of data from the different files (training
+ * data, dictionary instances, etc.)
+ */
+
public class DataExtractor {
+ private static String englishDict = "src\\test\\resources\\models\\en-lemmatizer.dict";
+
+ /**
+ * Constructor
+ */
public DataExtractor() {
super();
}
- /**
- * Extract the dictionary from the dictionary XML file and map the senses
- */
private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {
ArrayList<DictionaryInstance> dictionary = new ArrayList<DictionaryInstance>();
- // HashMap<Integer, DictionaryInstance> dictionary = new HashMap<Integer,
- // DictionaryInstance>();
-
try {
File xmlFile = new File(xmlLocation);
@@ -149,7 +152,7 @@
}
- private HashMap<String, ArrayList<DictionaryInstance>> extractOptimalDictionary(
+ private HashMap<String, ArrayList<DictionaryInstance>> extractCoarseGrainedDictionary(
String xmlLocation, String sensemapFile) {
HashMap<String, ArrayList<DictionaryInstance>> optimizedDictionary = new HashMap<String, ArrayList<DictionaryInstance>>();
@@ -182,6 +185,20 @@
return optimizedDictionary;
}
+ /**
+ * Extract the different senses (those which are equivalent are put together)
+ * of a word
+ *
+ * @param xmlLocation
+ * : location of the file containing the dictionary instances
+ * @param sensemapFile
+ * : location of the file containing the equivalent senses in the
+ * case of Coarse-grained disambiguation
+ * @param wordTag
+ * : the word to disambiguate. It should be written in the format
+   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
+ * @return a {@link HashMap} of {@link DictionaryInstance} with their IDs
+ */
public HashMap<String, ArrayList<DictionaryInstance>> extractWordSenses(
String xmlLocation, String sensemapFile, String wordTag) {
@@ -192,7 +209,7 @@
HashMap<String, ArrayList<DictionaryInstance>> wordSenses = new HashMap<String, ArrayList<DictionaryInstance>>();
- HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary = extractOptimalDictionary(
+ HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary = extractCoarseGrainedDictionary(
xmlLocation, sensemapFile);
int i = 0;
@@ -207,6 +224,20 @@
return wordSenses;
}
+ /**
+   * Extract the different senses. This method returns only the ID of the sense
+   * and the gloss. The synsets and other information are omitted.
+ *
+ * @param xmlLocation
+ * : location of the file containing the dictionary instances
+ * @param sensemapFile
+ * : location of the file containing the equivalent senses in the
+ * case of Coarse-grained disambiguation
+ * @param wordTag
+ * the word to disambiguate. It should be written in the format
+   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
+ * @return a {@link HashMap} of word senses with their IDs
+ */
public HashMap<String, String> getDictionaryInstance(String xmlLocation,
String sensemapFile, String wordTag) {
@@ -225,59 +256,12 @@
}
/**
- * Extract the Dictionary Map [USELESS UNLESS USED FOR STATISTICS LATER !!!]
- */
-
- public HashMap<Integer, DistributionInstance> extractWords(String listOfWords) {
-
- HashMap<Integer, DistributionInstance> instances = new HashMap<Integer, DistributionInstance>();
-
- try (BufferedReader wordsList = new BufferedReader(new FileReader(
- listOfWords))) {
-
- String line;
-
- int index = 0;
-
- // Read the file
- while ((line = wordsList.readLine()) != null) {
-
- String[] temp = line.split("\\t");
-
- String[] wordPos = temp[0].split("\\.");
-
- String tag;
-
- if (wordPos[1].equals("n")) {
- tag = "noun";
- } else if (wordPos[1].equals("v")) {
- tag = "verb";
- } else if (wordPos[1].equals("a")) {
- tag = "adjective";
- } else {
- tag = "adverb";
- }
-
- DistributionInstance word = new DistributionInstance(wordPos[0], tag,
- Integer.parseInt(temp[1]), Integer.parseInt(temp[2]));
-
- instances.put(index, word);
-
- index++;
-
- }
-
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- return instances;
- }
-
- /**
* Extract the training instances from the training/test set File
+ *
+ * @param xmlDataSet
+ * : the file from which the data are to be extracted
+ * @return {@link ArrayList} of Word To Disambiguate (WTDIMS) instances
*/
-
public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {
ArrayList<WTDIMS> setInstances = new ArrayList<WTDIMS>();
@@ -356,7 +340,6 @@
rawWord = nChild.getChildNodes().item(1).getTextContent();
// textAfter =
// nChild.getChildNodes().item(2).getTextContent();
- // System.out.println(rawWord);
}
}
@@ -365,11 +348,12 @@
WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
rawWord);
setInstances.add(wordToDisambiguate);
- // System.out.print(index + "\t");
- // System.out.println(wordToDisambiguate.toString());
}
+
}
+
}
+
}
} catch (Exception e) {
@@ -379,4 +363,52 @@
return setInstances;
}
-}
+
+ /**
+ * Extract the list of ALL English words
+ *
+ * @param dict
+ * : this file is the same that is used in the simple lemmatizer
+ * (i.e.,"en-lemmatizer.dict")
+ *
+   * @return a list of all the English words
+ */
+ public HashMap<String, Object> getEnglishWords(String dict) {
+
+ HashMap<String, Object> words = new HashMap<String, Object>();
+
+ BufferedReader br = null;
+
+ File file = new File(englishDict);
+
+ if (file.exists()) {
+
+ try {
+ br = new BufferedReader(new FileReader(file));
+ String line = br.readLine();
+ while (line != null) {
+ line = br.readLine();
+ if (line != null) {
+ String word = line.split("\\t")[0];
+ words.put(word, null);
+ }
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ return words;
+ }
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
index 8128e4e..a30c887 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
@@ -19,8 +19,20 @@
package opennlp.tools.disambiguator;
+/**
+ * An instance of the dictionary. A dictionary instance has:
+ * <ul>
+ * <li>index: an index for the current instance of the dictionary</li>
+ * <li>word: the word to disambiguate</li>
+ * <li>id: its id in the source (e.g., in WordNet, Wordsmyth, etc.)</li>
+ * <li>source: the source of the instance (e.g., WordNet, Wordsmyth, etc.)</li>
+ * <li>synset: the list of synonyms (i.e., the words that share the same current
+ * meaning)</li>
+ * <li>gloss: the sense of the word</li>
+ * </ul>
+ */
public class DictionaryInstance {
-
+
protected int index;
protected String word;
@@ -30,6 +42,9 @@
protected String[] synset;
protected String gloss;
+ /**
+ * Constructor
+ */
public DictionaryInstance(int index, String word, String id, String source,
String[] synset, String gloss) {
super();
@@ -41,10 +56,6 @@
this.gloss = gloss;
}
- /**
- * Getters and Setters
- */
-
public int getIndex() {
return index;
}
@@ -92,4 +103,5 @@
public void setGloss(String gloss) {
this.gloss = gloss;
}
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
index fdd0c1f..d2c64a0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
@@ -26,6 +26,9 @@
protected int trainingSetInstances;
protected int testSetInstances;
+ /**
+ * Constructor
+ */
public DistributionInstance(String word, String pos,
int trainingSetInstances, int testSetInstances) {
super();
@@ -66,4 +69,5 @@
public void setTestSetInstances(int testSetInstances) {
this.testSetInstances = testSetInstances;
}
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
index 599aa8e..6d26480 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
@@ -19,29 +19,43 @@
package opennlp.tools.disambiguator;
-
import java.util.ArrayList;
+import java.util.Arrays;
import opennlp.tools.disambiguator.ims.WTDIMS;
+/**
+ * Class for the extraction of features for the different Supervised
+ * Disambiguation apporaches.<br>
+ * Each set of methods refer to one approach
+ * <ul>
+ * <li>IMS (It Makes Sense): see <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">
+ * the IMS paper</a> for details about this approach</li>
+ * <li>SST (SuperSense Tagging): see
+ * <a href="http://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf">the SST paper</a>
+ * for details about this approach</li>
+ * </ul>
+ *
+ * The first methods serve to extract the features for the algorithm IMS. Three
+ * families of features are to be extracted: - PoS of Surrounding Words: it
+ * requires one parameter: "Window size" - Surrounding Words: no parameters are
+ * required - Local Collocations: it requires one parameter: "the n-gram"
+ *
+ * See <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a> for details
+ * about this approach
+ */
+
public class FeaturesExtractor {
+ /**
+ * Constructor
+ */
public FeaturesExtractor() {
super();
}
- /**
- * @Algorithm: IMS (It Makes Sense)
- *
- * The following methods serve to extract the features for the
- * algorithm IMS.
- *
- * Three families of features are to be extracted: - PoS of
- * Surrounding Words: it requires one parameter: "Window size" -
- * Surrounding Words: no parameters are required - Local
- * Collocations: it requires one parameter: "the n-gram"
- *
- */
+ // IMS approach
+
private String[] extractPosOfSurroundingWords(String[] sentence,
int wordIndex, int windowSize) {
@@ -76,9 +90,9 @@
String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();
- if (!word.equals("")) {
- String lemma = Loader.getLemmatizer().lemmatize(sentence[i],
- posTags[i]);
+ // if (!word.equals("") /*&& Constants.isRelevant(posTags[i])*/) {
+ if (Loader.getEnglishWords().containsKey(word)) {
+ String lemma = Loader.getLemmatizer().lemmatize(word, posTags[i]);
contextWords.add(lemma);
}
@@ -120,7 +134,30 @@
return res;
}
- // public method
+ /**
+   * This method generates the full list of surrounding words from the
+   * training data. These data will later be used for the generation of the
+   * features qualified as "Surrounding words".
+ *
+ * @param trainingData
+   *          list of the training samples (of type {@link WTDIMS})
+ * @return the list of all the surrounding words from all the training data
+ */
+ public ArrayList<String> extractTrainingSurroundingWords(
+ ArrayList<WTDIMS> trainingData) {
+
+ ArrayList<String> list = new ArrayList<String>();
+
+ for (WTDIMS word : trainingData) {
+ for (String sWord : word.getSurroundingWords()) {
+ list.add(sWord);
+ }
+ }
+
+ return list;
+
+ }
+
/**
* This method generates the different set of features related to the IMS
* approach and store them in the corresponding attributes of the WTDIMS
@@ -151,16 +188,20 @@
* doesn't require any parameters.
*
* @param word
+ * the word to disambiguate
+ * @param listSurrWords
+ * the full list of surrounding words of the training data
* @return the Context of the wordToDisambiguate
*/
- public String[] serializeIMSFeatures(WTDIMS word) {
+ public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {
String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
- String[] surroundingWords = word.getSurroundingWords();
+ ArrayList<String> surroundingWords = new ArrayList<String>(
+ Arrays.asList((word.getSurroundingWords())));
String[] localCollocations = word.getLocalCollocations();
String[] serializedFeatures = new String[posOfSurroundingWords.length
- + surroundingWords.length + localCollocations.length];
+ + localCollocations.length + listSurrWords.size()];
int i = 0;
@@ -169,17 +210,24 @@
i++;
}
- for (String feature : surroundingWords) {
- serializedFeatures[i] = "F" + i + "=" + feature;
- i++;
- }
-
for (String feature : localCollocations) {
serializedFeatures[i] = "F" + i + "=" + feature;
i++;
}
- return serializedFeatures;
+ for (String feature : listSurrWords) {
+ serializedFeatures[i] = "F" + i + "=0";
+ if (surroundingWords.contains(feature)) {
+ serializedFeatures[i] = "F" + i + "=1";
+ }
+ i++;
+
+ }
+
+ word.setFeatures(serializedFeatures);
}
-}
+
+ // SST approach
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
index 127e6ff..dc2b7a5 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
@@ -44,6 +44,8 @@
public class Loader {
+ private static DataExtractor dExtractor = new DataExtractor();
+
private static String modelsDir = "src\\test\\resources\\models\\";
private static SentenceDetectorME sdetector;
@@ -54,13 +56,14 @@
private static Dictionary dictionary;
private static MorphologicalProcessor morph;
- private static boolean IsInitialized = false;
// local caches for faster lookup
private static HashMap<String, Object> stemCache;
private static HashMap<String, Object> stopCache;
private static HashMap<String, Object> relvCache;
+ private static HashMap<String, Object> englishWords;
+
// Constructor
public Loader() {
super();
@@ -97,6 +100,14 @@
return stemCache;
}
+ public static HashMap<String, Object> getEnglishWords() {
+ if (englishWords == null || englishWords.keySet().isEmpty()) {
+ englishWords = dExtractor.getEnglishWords(modelsDir
+ + "en-lemmatizer.dict");
+ }
+ return englishWords;
+ }
+
public static MorphologicalProcessor getMorph() {
if (morph == null) {
morph = dictionary.getMorphologicalProcessor();
@@ -217,11 +228,14 @@
for (String s : Constants.stopWords) {
stopCache.put(s, null);
}
+
relvCache = new HashMap<String, Object>();
for (String t : Constants.relevantPOS) {
relvCache.put(t, null);
}
+ englishWords = new HashMap<String, Object>();
+
if (isInitialized()) {
Constants.print("loading was succesfull");
} else {
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
index cb1eccc..dbd1bee 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
@@ -68,7 +68,7 @@
public String getSense() {
return this.synset.getGloss().toString();
}
-
+
public long getSenseID() {
return this.synset.getOffset();
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index 8fc8e72..78c8966 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
@@ -22,46 +22,48 @@
import java.security.InvalidParameterException;
import opennlp.tools.util.Span;
-
/**
- * A word sense disambiguator that determines which sense of a word is meant in a particular context.
- * It is a classification task, where the classes are the different senses of the ambiguous word.
- * Disambiguation can be achieved in either supervised or un-supervised approaches.
- * For the moment this component relies on WordNet to retrieve sense definitions.
- * It returns an array of WordNet sense IDs ordered by their disambiguation score.
- * The sense with highest score is the most likely sense of the word.
+ * A word sense disambiguator that determines which sense of a word is meant in
+ * a particular context. It is a classification task, where the classes are the
+ * different senses of the ambiguous word. Disambiguation can be achieved in
+ * either supervised or un-supervised approaches. For the moment this component
+ * relies on WordNet to retrieve sense definitions. It returns an array of
+ * WordNet sense IDs ordered by their disambiguation score. The sense with
+ * highest score is the most likely sense of the word.
*
- * Please see {@link Lesk} for an un-supervised approach.
- * Please see {@link IMS} for a supervised approach.
+ * Please see {@link Lesk} for an un-supervised approach. Please see {@link IMS}
+ * for a supervised approach.
*
* @see Lesk
* @see IMS
*/
public interface WSDisambiguator {
-
/**
* @return the parameters of the disambiguation algorithm
*/
public WSDParameters getParams();
-
+
/**
- * @param the disambiguation implementation specific parameters.
+   * @param params
+   *          the disambiguation implementation specific parameters.
* @throws InvalidParameterException
*/
public void setParams(WSDParameters params) throws InvalidParameterException;
-
+
/**
* @param tokenizedContext
* @param ambiguousTokenIndex
* @return result as an array of WordNet IDs
*/
- public String[] disambiguate(String[] tokenizedContext, int ambiguousTokenIndex);
+ public String[] disambiguate(String[] tokenizedContext,
+ int ambiguousTokenIndex);
/**
* @param tokenizedContext
* @param ambiguousTokenIndexSpans
* @return result as an array of WordNet IDs
*/
- public String[][] disambiguate(String[] tokenizedContext, Span[] ambiguousTokenIndexSpans);
-}
+ public String[][] disambiguate(String[] tokenizedContext,
+ Span[] ambiguousTokenIndexSpans);
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
index 5f27b38..60aac79 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
@@ -23,8 +23,6 @@
import net.sf.extjwnl.data.POS;
-
-
public class WordToDisambiguate {
// TODO Check if it is necessary to add an attribute [word] since the word in
@@ -102,10 +100,6 @@
public String getRawWord() {
- /**
- * For example, from the word "running" it returns "run.v"
- */
-
String wordBaseForm = Loader.getLemmatizer().lemmatize(
this.sentence[wordIndex], this.posTags[wordIndex]);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
index 9458cc3..399b461 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
@@ -23,31 +23,33 @@
import java.util.Arrays;
import java.util.List;
-
import opennlp.tools.disambiguator.FeaturesExtractor;
import opennlp.tools.disambiguator.ims.WTDIMS;
+/**
+ * The default Context Generator of IMS
+ */
public class DefaultIMSContextGenerator implements IMSContextGenerator {
FeaturesExtractor fExtractor = new FeaturesExtractor();
- /**
- * Default context generator for IMS.
- */
-
public DefaultIMSContextGenerator() {
}
/**
* Get Context of a word To disambiguate
+ *
+ * @param word
+ * : the word to disambiguate in the format {@link WTDIMS}
+ * @return The IMS context of the word to disambiguate
*/
@Override
public String[] getContext(WTDIMS word) {
- return fExtractor.serializeIMSFeatures(word);
+ return word.getFeatures();
}
/**
- * Returns an {@link ArrayList} of features for the object of type WTDIMS
+   * This method gives the list of features for the object of type WTDIMS.
* Extensions of this class can override this method to create a customized
* {@link IMSContextGenerator}
*
@@ -63,8 +65,8 @@
*
* @return an {@link ArrayList} of features
*/
-
protected List<String> createContext(WTDIMS word) {
return Arrays.asList(getContext(word));
}
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java
index e171fd0..e69de29 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/FeaturesExtractor.java
@@ -1,98 +0,0 @@
-package opennlp.tools.disambiguator.ims;
-
-import java.util.ArrayList;
-
-import opennlp.tools.disambiguator.Constants;
-import opennlp.tools.disambiguator.Loader;
-
-public class FeaturesExtractor {
-
- public FeaturesExtractor() {
- super();
- }
-
- /**
- * @Algorithm: IMS (It Makes Sense)
- *
- * The following methods serve to extract the features for the
- * algorithm IMS.
- */
-
- public String[] extractPosOfSurroundingWords(String[] sentence,
- int wordIndex, int numberOfWords) {
-
- String[] taggedSentence = Loader.getTagger().tag(sentence);
-
- String[] tags = new String[2 * numberOfWords + 1];
-
- int j = 0;
-
- for (int i = wordIndex - numberOfWords; i < wordIndex + numberOfWords; i++) {
- if (i < 0 || i >= sentence.length) {
- tags[j] = "null";
- } else {
- tags[j] = taggedSentence[i];
- }
- j++;
- }
-
- return tags;
- }
-
- public String[] extractSurroundingWords(String[] sentence, int wordIndex) {
-
- String[] posTags = Loader.getTagger().tag(sentence);
-
- Constants.print(posTags);
-
- ArrayList<String> contextWords = new ArrayList<String>();
-
- for (int i = 0; i < sentence.length; i++) {
-
- if (!Constants.stopWords.contains(sentence[i].toLowerCase())
- && (wordIndex != i)) {
-
- String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();
-
- if (!word.equals("")) {
- String lemma = Loader.getLemmatizer().lemmatize(sentence[i],
- posTags[i]);
- contextWords.add(lemma);
- }
-
- }
- }
-
- return contextWords.toArray(new String[contextWords.size()]);
- }
-
- public ArrayList<String[]> extractLocalCollocations(String[] sentence,
- int wordIndex, int range) {
- /**
- * Here the author used only 11 features of this type. the range was set to
- * 3 (bigrams extracted in a way that they are at max separated by 1 word).
- */
-
- ArrayList<String[]> localCollocations = new ArrayList<String[]>();
-
- for (int i = wordIndex - range; i <= wordIndex + range; i++) {
-
- if (!(i < 0 || i > sentence.length - 2)) {
- if ((i != wordIndex) && (i + 1 != wordIndex)
- && (i + 1 < wordIndex + range)) {
- String[] lc = { sentence[i], sentence[i + 1] };
- localCollocations.add(lc);
- }
- if ((i != wordIndex) && (i + 2 != wordIndex)
- && (i + 2 < wordIndex + range)) {
- String[] lc = { sentence[i], sentence[i + 2] };
- localCollocations.add(lc);
- }
- }
-
- }
-
- return localCollocations;
- }
-}
-
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
index a453ecb..813e21c 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
@@ -30,9 +30,13 @@
import opennlp.tools.ml.model.OnePassDataIndexer;
import opennlp.tools.ml.model.PlainTextFileDataReader;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidParameterException;
@@ -56,84 +60,43 @@
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WSDisambiguator;
+/**
+ * Implementation of the <b>It Makes Sense</b> approach originally proposed in
+ * Senseval-3. The approach relies on the extraction of textual and
+ * PoS-tag-based features from the sentences surrounding the word to
+ * disambiguate. 3 main families of features are extracted:
+ * <ul>
+ * <li>PoS-tags of the surrounding words</li>
+ * <li>Local collocations</li>
+ * <li>Surrounding words</li>
+ * </ul>
+ * See <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a> for details
+ * about this approach
+ */
public class IMS implements WSDisambiguator {
- // private MaxentModel model;
- private IMSFactory factory;
+ public IMSParameters parameters;
private final IMSContextGenerator cg;
private FeaturesExtractor fExtractor = new FeaturesExtractor();
private DataExtractor dExtractor = new DataExtractor();
-
- private int windowSize;
- private int word;
- private int ngram;
-
public IMS() {
super();
- windowSize = 3;
- ngram = 2;
-
- IMSFactory factory = new IMSFactory();
- this.factory = factory;
- this.cg = factory.createContextGenerator();
+ this.parameters = new IMSParameters();
+ ;
+ this.cg = parameters.createContextGenerator();
}
- public IMS(int windowSize, int ngram) {
+ public IMS(IMSParameters parameters) {
super();
- this.windowSize = windowSize;
- this.ngram = ngram;
-
- IMSFactory factory = new IMSFactory();
- this.factory = factory;
- this.cg = factory.createContextGenerator();
+ this.parameters = parameters;
+ this.cg = this.parameters.createContextGenerator();
}
- protected ArrayList<WTDIMS> extractTrainingData(
- String wordTrainingxmlFile,
- HashMap<String, ArrayList<DictionaryInstance>> senses) {
-
- /**
- * word tag has to be in the format "word.t" (e.g., "activate.v", "smart.a",
- * etc.)
- */
-
- ArrayList<WTDIMS> trainingData = dExtractor
- .extractWSDInstances(wordTrainingxmlFile);
-
- // HashMap<Integer, WTDIMS> trainingData =
- // dExtractor.extractWSDInstances(wordTrainingxmlFile);
-
- for (WTDIMS data : trainingData) {
- for (String senseId : data.getSenseIDs()) {
- for (String dictKey : senses.keySet()) {
- for (DictionaryInstance instance : senses.get(dictKey)) {
- if (senseId.equals(instance.getId())) {
- data.setSense(
- Integer.parseInt(dictKey.split("_")[1]));
- break;
- }
- }
- }
- }
- }
-
- return trainingData;
- }
-
- protected void extractFeature(ArrayList<WTDIMS> words) {
-
- for (WTDIMS word : words) {
-
- fExtractor.extractIMSFeatures(word, windowSize, ngram);
-
- }
-
- }
-
- protected String getTrainingFile(WTDIMS wtd) {
+ // Internal Methods
+ private String getTrainingFileName(WTDIMS wtd) {
String wordBaseForm = PreProcessor
.lemmatize(wtd.getWord(), wtd.getPosTag());
@@ -155,17 +118,124 @@
return ref;
}
- protected HashMap<String, String> getWordDictionaryInstance(WTDIMS wtd) {
+ private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingData,
+ String wordTag) {
- String dict = factory.getDict();
- String map = factory.getMap();
+ ArrayList<String> surrWords = fExtractor
+ .extractTrainingSurroundingWords(trainingData);
- return dExtractor.getDictionaryInstance(dict, map,
- this.getTrainingFile(wtd));
+ File file = new File(parameters.getTrainingDataDirectory() + wordTag
+ + ".sw");
+ if (!file.exists()) {
+ try {
+
+ file.createNewFile();
+
+ FileWriter fw = new FileWriter(file.getAbsoluteFile());
+ BufferedWriter bw = new BufferedWriter(fw);
+
+ for (String surrWord : surrWords) {
+ bw.write(surrWord);
+ bw.newLine();
+ }
+
+ bw.close();
+
+ System.out.println("Done");
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
}
- protected String[] getMostFrequentSense(WTDIMS wordToDisambiguate) {
+ private ArrayList<String> getAllSurroundingWords(String wordTag) {
+
+ ArrayList<String> surrWords = new ArrayList<String>();
+
+ BufferedReader br = null;
+
+ File file = new File(parameters.getTrainingDataDirectory() + wordTag
+ + ".sw");
+
+ if (file.exists()) {
+
+ try {
+ br = new BufferedReader(new FileReader(file));
+
+ String line = br.readLine();
+ while (line != null) {
+ line = br.readLine();
+ if (!surrWords.contains(line)) {
+ surrWords.add(line);
+ }
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ if (br != null) {
+ try {
+ br.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ return surrWords;
+
+ }
+
+ private ArrayList<WTDIMS> extractTrainingData(String wordTrainingXmlFile,
+ HashMap<String, ArrayList<DictionaryInstance>> senses) {
+
+ /**
+ * word tag has to be in the format "word.t" (e.g., "activate.v", "smart.a",
+ * etc.)
+ */
+
+ ArrayList<WTDIMS> trainingData = dExtractor
+ .extractWSDInstances(wordTrainingXmlFile);
+
+ for (WTDIMS word : trainingData) {
+ for (String senseId : word.getSenseIDs()) {
+ for (String dictKey : senses.keySet()) {
+ for (DictionaryInstance instance : senses.get(dictKey)) {
+ if (senseId.equals(instance.getId())) {
+ word.setSense(Integer.parseInt(dictKey.split("_")[1]));
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ return trainingData;
+ }
+
+ private void extractFeature(WTDIMS word) {
+
+ fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
+ this.parameters.getNgram());
+
+ }
+
+ private HashMap<String, String> getWordDictionaryInstance(WTDIMS wtd) {
+
+ String dict = parameters.getDict();
+ String map = parameters.getMap();
+
+ return dExtractor.getDictionaryInstance(dict, map,
+ this.getTrainingFileName(wtd));
+
+ }
+
+ private String[] getMostFrequentSense(WTDIMS wordToDisambiguate) {
String word = wordToDisambiguate.getRawWord();
POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
@@ -187,47 +257,64 @@
}
/**
- * PUBLIC METHODS
+ * Method for training a model
+ *
+ * @param wordTag
+ * : the word to disambiguate. It should be written in the format
+   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
+ * @param trainParams
+ * : the parameters used for training
*/
-
public void train(String wordTag, TrainingParameters trainParams) {
- String rawDataDirectory = factory.getRawDataDirectory();
- String trainingDataDirectory = factory.getTrainingDataDirectory();
- String dict = factory.getDict();
- String map = factory.getMap();
+ String dict = parameters.getDict();
+ String map = parameters.getMap();
- String wordTrainingxmlFile = rawDataDirectory + wordTag + ".xml";
- String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
+ String wordTrainingxmlFile = parameters.getRawDataDirectory() + wordTag
+ + ".xml";
+ String wordTrainingbinFile = parameters.getTrainingDataDirectory()
+ + wordTag + ".gz";
File bf = new File(wordTrainingxmlFile);
- ObjectStream IMSes = null;
+ ObjectStream<Event> IMSes = null;
if (bf.exists() && !bf.isDirectory()) {
HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
.extractWordSenses(dict, map, wordTag);
- ArrayList<WTDIMS> instances = extractTrainingData(
- wordTrainingxmlFile, senses);
+ ArrayList<WTDIMS> instances = extractTrainingData(wordTrainingxmlFile,
+ senses);
- extractFeature(instances);
+ for (WTDIMS wtd : instances) {
+ extractFeature(wtd);
+ }
+
+ saveAllSurroundingWords(instances, wordTag);
+
+ for (WTDIMS wtd : instances) {
+ extractFeature(wtd);
+ }
+
+ ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
+
+ for (WTDIMS wtd : instances) {
+ fExtractor.serializeIMSFeatures(wtd, surrWords);
+ }
ArrayList<Event> events = new ArrayList<Event>();
- for (WTDIMS instance : instances) {
+ for (WTDIMS wtd : instances) {
- int sense = instance.getSense();
+ int sense = wtd.getSense();
- String[] context = cg.getContext(instance);
+ String[] context = cg.getContext(wtd);
Event ev = new Event(sense + "", context);
events.add(ev);
- // Collection collEvents = events;
-
IMSes = ObjectStreamUtils.createObjectStream(events);
}
@@ -235,7 +322,7 @@
DataIndexer indexer;
try {
indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
- MaxentModel trainedMaxentModel = GIS.trainModel(100, indexer);
+ MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
File outFile = new File(wordTrainingbinFile);
AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
(AbstractModel) trainedMaxentModel, outFile);
@@ -249,6 +336,13 @@
}
+ /**
+ * Load an existing model
+ *
+ * @param binFile
+ * : Location of the already trained model
+ * @return the trained model
+ */
public MaxentModel load(String binFile) {
MaxentModel loadedMaxentModel = null;
@@ -268,16 +362,25 @@
return loadedMaxentModel;
}
+ /**
+ * The disambiguation method for a single word
+ *
+ * @param inputText
+ * : the text containing the word to disambiguate
+ * @param inputWordIndex
+ * : the index of the word to disambiguate
+ */
@Override
public String[] disambiguate(String[] inputText, int inputWordIndex) {
- String rawDataDirectory = factory.getRawDataDirectory();
- String trainingDataDirectory = factory.getTrainingDataDirectory();
+ String rawDataDirectory = this.parameters.getRawDataDirectory();
+ String trainingDataDirectory = this.parameters.getTrainingDataDirectory();
WTDIMS word = new WTDIMS(inputText, inputWordIndex);
- fExtractor.extractIMSFeatures(word, windowSize, ngram);
+ fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
+ this.parameters.getNgram());
- String wordTag = getTrainingFile(word);
+ String wordTag = getTrainingFileName(word);
String wordTrainingxmlFile = rawDataDirectory + wordTag + ".xml";
String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
@@ -289,6 +392,9 @@
if (bf.exists() && !bf.isDirectory()) {
// if the model file exists already
// System.out.println("the model file was found !");
+ ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
+ fExtractor.serializeIMSFeatures(word, surrWords);
+
loadedMaxentModel = load(wordTrainingbinFile);
String[] context = cg.getContext(word);
@@ -301,6 +407,10 @@
// if the xml file exists already
// System.out.println("the xml file was found !");
train(wordTag, null);
+ ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
+
+ fExtractor.serializeIMSFeatures(word, surrWords);
+
bf = new File(wordTrainingbinFile);
loadedMaxentModel = load(wordTrainingbinFile);
String[] context = cg.getContext(word);
@@ -329,17 +439,32 @@
}
+ /**
+ * The disambiguation method for a span of words
+ *
+ * @param inputText
+ * : the text containing the word to disambiguate
+ * @param inputWordSpans
+ * : the span of words to disambiguate
+ */
@Override
- public String[][] disambiguate(String[] inputText, Span[] inputWordSpans) {
+ public String[][] disambiguate(String[] tokenizedContext,
+ Span[] ambiguousTokenIndexSpans) {
+ // TODO Auto-generated method stub
return null;
}
+ // TODO fix the conflicts in parameters with Anthony's code
@Override
public WSDParameters getParams() {
+ // TODO Auto-generated method stub
return null;
}
@Override
public void setParams(WSDParameters params) throws InvalidParameterException {
+ // TODO Auto-generated method stub
+
}
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
index c6d5d2d..c451dc6 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
@@ -25,4 +25,5 @@
public interface IMSContextGenerator {
public String[] getContext(WTDIMS word);
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
index fd444ce..6dfd617 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
@@ -19,7 +19,6 @@
package opennlp.tools.disambiguator.ims;
-
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
index e4a3ab6..e69de29 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import opennlp.tools.util.BaseToolFactory;
-import opennlp.tools.util.InvalidFormatException;
-
-public class IMSFactory extends BaseToolFactory {
-
- protected String languageCode;
-
- protected String resourcesFolder = "src\\test\\resources\\supervised\\";
-
- protected String rawDataDirectory = resourcesFolder + "training\\";
- protected String trainingDataDirectory = resourcesFolder + "models\\";
- protected String dictionaryDirectory = resourcesFolder + "dictionary\\";
-
- protected String dict = dictionaryDirectory + "EnglishLS.dictionary.xml";
- protected String map = dictionaryDirectory + "EnglishLS.sensemap";
-
- public IMSFactory() {
- super();
- }
-
- public String getLanguageCode() {
- return languageCode;
- }
-
- public void setLanguageCode(String languageCode) {
- this.languageCode = languageCode;
- }
-
- public String getRawDataDirectory() {
- return rawDataDirectory;
- }
-
- public void setRawDataDirectory(String rawDataDirectory) {
- this.rawDataDirectory = rawDataDirectory;
- }
-
- public String getTrainingDataDirectory() {
- return trainingDataDirectory;
- }
-
- public void setTrainingDataDirectory(String trainingDataDirectory) {
- this.trainingDataDirectory = trainingDataDirectory;
- }
-
- public String getDictionaryDirectory() {
- return dictionaryDirectory;
- }
-
- public void setDictionaryDirectory(String dictionaryDirectory) {
- this.dictionaryDirectory = dictionaryDirectory;
- }
-
- public String getDict() {
- return dict;
- }
-
- public void setDict(String dict) {
- this.dict = dict;
- }
-
- public String getMap() {
- return map;
- }
-
- public void setMap(String map) {
- this.map = map;
- }
-
- void init() {
- }
-
- public IMSContextGenerator createContextGenerator() {
-
- return new DefaultIMSContextGenerator();
- }
-
- @Override
- public void validateArtifactMap() throws InvalidFormatException {
- }
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
new file mode 100644
index 0000000..5866479
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.ims;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+/**
+ * This class contains the parameters for the IMS approach as well as the
+ * directories containing the files used
+ */
+public class IMSParameters extends WSDParameters {
+
+ protected String languageCode;
+ protected int windowSize;
+ protected int ngram;
+
+ protected String resourcesFolder = "src\\test\\resources\\supervised\\";
+
+ protected String rawDataDirectory = resourcesFolder + "raw\\";
+ protected String trainingDataDirectory = resourcesFolder + "models\\";
+ protected String dictionaryDirectory = resourcesFolder + "dictionary\\";
+
+ protected String dict = dictionaryDirectory + "EnglishLS.dictionary.xml";
+ protected String map = dictionaryDirectory + "EnglishLS.sensemap";
+
+ public IMSParameters() {
+ super();
+ this.languageCode = "En";
+ this.windowSize = 3;
+ this.ngram = 2;
+ }
+
+ /**
+ *
+ * @param windowSize
+ * : the size of the window used for the extraction of the features
+ * qualified of Surrounding Words
+ * @param ngram
+ * : the number words used for the extraction of features qualified
+ * of Local Collocations
+ */
+ public IMSParameters(int windowSize, int ngram) {
+ super();
+ this.languageCode = "En";
+ this.windowSize = windowSize;
+ this.ngram = ngram;
+ }
+
+ public String getLanguageCode() {
+ return languageCode;
+ }
+
+ public void setLanguageCode(String languageCode) {
+ this.languageCode = languageCode;
+ }
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
+
+ public int getNgram() {
+ return ngram;
+ }
+
+ public void setNgram(int ngram) {
+ this.ngram = ngram;
+ }
+
+ public String getRawDataDirectory() {
+ return rawDataDirectory;
+ }
+
+ public void setRawDataDirectory(String rawDataDirectory) {
+ this.rawDataDirectory = rawDataDirectory;
+ }
+
+ public String getTrainingDataDirectory() {
+ return trainingDataDirectory;
+ }
+
+ public void setTrainingDataDirectory(String trainingDataDirectory) {
+ this.trainingDataDirectory = trainingDataDirectory;
+ }
+
+ public String getDictionaryDirectory() {
+ return dictionaryDirectory;
+ }
+
+ public void setDictionaryDirectory(String dictionaryDirectory) {
+ this.dictionaryDirectory = dictionaryDirectory;
+ }
+
+ public String getDict() {
+ return dict;
+ }
+
+ public void setDict(String dict) {
+ this.dict = dict;
+ }
+
+ public String getMap() {
+ return map;
+ }
+
+ public void setMap(String map) {
+ this.map = map;
+ }
+
+ public String getResourcesFolder() {
+ return resourcesFolder;
+ }
+
+ public void setResourcesFolder(String resourcesFolder) {
+ this.resourcesFolder = resourcesFolder;
+ }
+
+ void init() {
+ }
+
+ /**
+ * Creates the context generator of IMS
+ */
+ public IMSContextGenerator createContextGenerator() {
+
+ return new DefaultIMSContextGenerator();
+ }
+
+ @Override
+ public boolean isValid() {
+ // TODO Auto-generated method stub
+ return false;
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
index e40c75a..832ebc0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
@@ -30,8 +30,11 @@
protected String[] surroundingWords;
protected String[] localCollocations;
+ protected String[] features;
+
public WTDIMS(String[] sentence, int word, int sense) {
super(sentence, word, sense);
+
}
public WTDIMS(String[] sentence, int word) {
@@ -55,6 +58,7 @@
}
this.senseIDs = xmlAnswers;
+
}
public String[] getPosOfSurroundingWords() {
@@ -80,4 +84,13 @@
public void setLocalCollocations(String[] localCollocations) {
this.localCollocations = localCollocations;
}
+
+ public String[] getFeatures() {
+ return this.features;
+ }
+
+ public void setFeatures(String[] features) {
+ this.features = features;
+ }
+
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
index 52442fe..fff8bdd 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
@@ -21,27 +21,33 @@
import opennlp.tools.disambiguator.ims.IMS;
+/**
+ * This is a typical example of how to call the disambiguation function in the
+ * IMS class.
+ * <ul>
+ * <li>In the first 2 examples, the training data exist; therefore the IMS
+ * approach is used.</li>
+ * <li>In the 3rd example, the training data for the word to disambiguate are
+ * absent; therefore the Most Frequent Sense (MFS) is returned</li>
+ * </ul>
+ */
public class IMSTester {
public static void main(String[] args) {
IMS ims = new IMS();
- String test = "You have to write an essay without using a dictionary!";
- String[] sentence = Loader.getTokenizer().tokenize(test);
- Constants.print(ims.disambiguate(sentence, 3));
+ String test1 = "Please write to me soon.";
+ String[] sentence1 = Loader.getTokenizer().tokenize(test1);
+ Constants.print(ims.disambiguate(sentence1, 1));
- String test2 = "Please write to me soon.";
+ String test2 = "it was a strong argument that his hypothesis was true";
String[] sentence2 = Loader.getTokenizer().tokenize(test2);
- Constants.print(ims.disambiguate(sentence2, 1));
+ Constants.print(ims.disambiguate(sentence2, 3));
- String test3 = "the argument over foreign aid goes on and on";
+ String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water";
String[] sentence3 = Loader.getTokenizer().tokenize(test3);
- Constants.print(ims.disambiguate(sentence3, 1));
-
- String test4 = "it was a strong argument that his hypothesis was true";
- String[] sentence4 = Loader.getTokenizer().tokenize(test4);
- Constants.print(ims.disambiguate(sentence4, 3));
+ Constants.print(ims.disambiguate(sentence3, 12));
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
new file mode 100644
index 0000000..e277909
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import opennlp.tools.disambiguator.lesk.Lesk;
+import opennlp.tools.disambiguator.lesk.LeskParameters;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+import org.junit.Test;
+
+public class LeskTester {
+
+ @Test
+ public static void main(String[] args) {
+
+ String sentence = "I went fishing for some sea bass.";
+ TokenizerModel TokenizerModel;
+
+ try {
+ TokenizerModel = new TokenizerModel(new FileInputStream(
+ "src\\test\\resources\\models\\en-token.bin"));
+ Tokenizer tokenizer = new TokenizerME(TokenizerModel);
+
+ String[] words = tokenizer.tokenize(sentence);
+//
+// POSModel posTaggerModel = new POSModelLoader()
+// .load(new File(
+// "src\\test\\resources\\models\\en-pos-maxent.bin"));
+//// POSTagger tagger = new POSTaggerME(posTaggerModel);
+//
+// Constants.print("\ntokens :");
+ Constants.print(words);
+
+ int wordIndex= 6;
+// Constants.print(tagger.tag(words));
+
+ Constants.print("\ntesting default lesk :");
+ Lesk lesk = new Lesk();
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
+
+ Constants.print("\ntesting with null params :");
+ lesk.setParams(null);
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
+ Constants.print("\ntesting with default params");
+ lesk.setParams(new LeskParameters());
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
+ Constants.print("\ntesting with custom params :");
+ LeskParameters leskParams = new LeskParameters();
+ leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_BASIC_CTXT_WIN_BF);
+ leskParams.setWin_b_size(4);
+ leskParams.setDepth(3);
+ lesk.setParams(leskParams);
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
+ /*
+ * Constants.print("\ntesting with wrong params should throw exception :");
+ * LeskParameters leskWrongParams = new LeskParameters();
+ * leskWrongParams.depth = -1; lesk.setParams(leskWrongParams);
+ * Constants.print(lesk.disambiguate(words, 6));
+ */
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
index adae7ab..e69de29 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.disambiguator.lesk.Lesk;
-import opennlp.tools.disambiguator.lesk.LeskParameters;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSTagger;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
-
-import org.junit.Test;
-
-public class Tester {
-
- @Test
- public static void main(String[] args) {
-
- String sentence = "I went fishing for some sea bass.";
- TokenizerModel TokenizerModel;
-
- try {
- TokenizerModel = new TokenizerModel(new FileInputStream(
- "src\\test\\resources\\models\\en-token.bin"));
- Tokenizer tokenizer = new TokenizerME(TokenizerModel);
-
- String[] words = tokenizer.tokenize(sentence);
-//
-// POSModel posTaggerModel = new POSModelLoader()
-// .load(new File(
-// "src\\test\\resources\\models\\en-pos-maxent.bin"));
-//// POSTagger tagger = new POSTaggerME(posTaggerModel);
-//
-// Constants.print("\ntokens :");
- Constants.print(words);
-
- int wordIndex= 6;
-// Constants.print(tagger.tag(words));
-
- Constants.print("\ntesting default lesk :");
- Lesk lesk = new Lesk();
- Constants.print(lesk.disambiguate(words, wordIndex));
- Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
-
-
- Constants.print("\ntesting with null params :");
- lesk.setParams(null);
- Constants.print(lesk.disambiguate(words, wordIndex));
- Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
-
- Constants.print("\ntesting with default params");
- lesk.setParams(new LeskParameters());
- Constants.print(lesk.disambiguate(words, wordIndex));
- Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
-
- Constants.print("\ntesting with custom params :");
- LeskParameters leskParams = new LeskParameters();
- leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_BASIC_CTXT_WIN_BF);
- leskParams.setWin_b_size(4);
- leskParams.setDepth(3);
- lesk.setParams(leskParams);
- Constants.print(lesk.disambiguate(words, wordIndex));
- Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
-
- /*
- * Constants.print("\ntesting with wrong params should throw exception :");
- * LeskParameters leskWrongParams = new LeskParameters();
- * leskWrongParams.depth = -1; lesk.setParams(leskWrongParams);
- * Constants.print(lesk.disambiguate(words, 6));
- */
-
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- }
-
-}