updates sandbox component 'opennlp-wsd' to be compatible with latest opennlp-tools release (#59)
- adjusts opennlp-tools to 2.1.0
- adjusts parent project (org.apache.apache) to version 18
- adjusts Java language level to 11
- adds missing test resources in a gzip-compressed form to check whether the existing tests work; some don't
- changes some interfaces to use List instead of ArrayList in method signatures
- ignores tests that aren't functional even with corresponding test resources; see OPENNLP-1446
diff --git a/opennlp-wsd/pom.xml b/opennlp-wsd/pom.xml
index 47de8ec..9110b75 100644
--- a/opennlp-wsd/pom.xml
+++ b/opennlp-wsd/pom.xml
@@ -25,12 +25,13 @@
<parent>
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
- <version>13</version>
+ <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. -->
+ <version>18</version>
<relativePath />
</parent>
<artifactId>opennlp-wsd</artifactId>
- <version>1.6.0-SNAPSHOT</version>
+ <version>2.1.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>Apache OpenNLP WSD</name>
@@ -38,7 +39,7 @@
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
- <version>1.6.0</version>
+ <version>2.1.0</version>
</dependency>
<dependency>
@@ -62,7 +63,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.8.1</version>
+ <version>4.13.1</version>
<scope>test</scope>
</dependency>
</dependencies>
@@ -80,8 +81,8 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
- <source>1.7</source>
- <target>1.7</target>
+ <source>11</source>
+ <target>11</target>
<compilerArgument>-Xlint</compilerArgument>
</configuration>
</plugin>
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorEvaluatorTool.java b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorEvaluatorTool.java
index e440130..1efa729 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorEvaluatorTool.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorEvaluatorTool.java
@@ -33,14 +33,17 @@
public final class DisambiguatorEvaluatorTool extends CmdLineTool {
+ @Override
public String getName() {
return "DisambiguatorEvaluator";
}
+ @Override
public String getShortDescription() {
return "Disambiguator Evaluation Tool";
}
+ @Override
public String getHelp() {
return "Usage: " + CLI.CMD + " " + getName() + " "
+ ArgumentParser.createUsage(DisambiguatorEvaluatorParams.class);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
index 89d55a5..9de0c7f 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
@@ -20,16 +20,17 @@
package opennlp.tools.cmdline.disambiguator;
import java.io.File;
-import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.disambiguator.Lesk;
import opennlp.tools.disambiguator.WSDHelper;
@@ -37,7 +38,9 @@
import opennlp.tools.disambiguator.WSDSampleStream;
import opennlp.tools.disambiguator.WSDisambiguator;
import opennlp.tools.disambiguator.MFS;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;
/*
@@ -47,14 +50,17 @@
public class DisambiguatorTool extends CmdLineTool {
// TODO CmdLineTool should be an interface not abstract class
+ @Override
public String getName() {
return "Disambiguator";
}
+ @Override
public String getShortDescription() {
return "Word Sense Disambiguator";
}
+ @Override
public String getHelp() {
return "Usage: " + CLI.CMD + " " + getName() + " "
+ ArgumentParser.createUsage(DisambiguatorToolParams.class)
@@ -75,12 +81,10 @@
PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
- ObjectStream<String> lineStream = new PlainTextByLineStream(
- new InputStreamReader(System.in));
-
perfMon.start();
- try {
+ try (ObjectStream<String> lineStream = new PlainTextByLineStream(
+ new SystemInputStreamFactory(), StandardCharsets.UTF_8)) {
String line;
while ((line = lineStream.read()) != null) {
@@ -115,13 +119,19 @@
static ObjectStream<WSDSample> openSampleData(String sampleDataName,
File sampleDataFile, Charset encoding) {
+
CmdLineUtil.checkInputFile(sampleDataName + " Data", sampleDataFile);
+ final MarkableFileInputStreamFactory factory;
+ try {
+ factory = new MarkableFileInputStreamFactory(sampleDataFile);
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException("Error finding specified input file!", e);
+ }
- FileInputStream sampleDataIn = CmdLineUtil.openInFile(sampleDataFile);
-
- ObjectStream<String> lineStream = new PlainTextByLineStream(
- sampleDataIn.getChannel(), encoding);
-
- return new WSDSampleStream(lineStream);
+ try (ObjectStream<String> lineStream = new ParagraphStream(new PlainTextByLineStream(factory, encoding))) {
+ return new WSDSampleStream(lineStream);
+ } catch (IOException e) {
+ throw new RuntimeException("Error loading WSD samples from input data!", e);
+ }
}
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
index c48d950..7873111 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java
@@ -20,6 +20,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
+import java.util.List;
public class IMSWSDContextGenerator implements WSDContextGenerator {
@@ -42,11 +43,10 @@
return windowTags;
}
- public String[] extractSurroundingContext(int index, String[] toks,
- String[] lemmas, int windowSize) {
+ public String[] extractSurroundingContext(int index, String[] toks, String[] lemmas, int windowSize) {
// TODO consider the windowSize
- ArrayList<String> contextWords = new ArrayList<String>();
+ List<String> contextWords = new ArrayList<>();
for (int i = 0; i < toks.length; i++) {
if (lemmas != null) {
@@ -67,14 +67,13 @@
return contextWords.toArray(new String[contextWords.size()]);
}
- private String[] extractLocalCollocations(int index, String[] sentence,
- int ngram) {
- /**
+ private String[] extractLocalCollocations(int index, String[] sentence, int ngram) {
+ /*
* Here the author used only 11 features of this type. the range was set to
* 3 (bigrams extracted in a way that they are at max separated by 1 word).
*/
- ArrayList<String> localCollocations = new ArrayList<String>();
+ ArrayList<String> localCollocations = new ArrayList<>();
for (int i = index - ngram; i <= index + ngram; i++) {
@@ -108,9 +107,9 @@
* @param model The list of unigrams
* @return The IMS context of the word to disambiguate
*/
- @Override public String[] getContext(int index, String[] tokens,
- String[] tags, String[] lemmas, int ngram, int windowSize,
- ArrayList<String> model) {
+ @Override
+ public String[] getContext(int index, String[] tokens,
+ String[] tags, String[] lemmas, int ngram, int windowSize, List<String> model) {
String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, tokens,
windowSize);
@@ -155,8 +154,9 @@
* @param model The list of unigrams
* @return The IMS context of the word to disambiguate
*/
- @Override public String[] getContext(WSDSample sample, int ngram,
- int windowSize, ArrayList<String> model) {
+ @Override
+ public String[] getContext(WSDSample sample, int ngram,
+ int windowSize, List<String> model) {
return getContext(sample.getTargetPosition(), sample.getSentence(),
sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
index 719fad8..6aff7fc 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/OSCCWSDContextGenerator.java
@@ -22,6 +22,8 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
import net.sf.extjwnl.data.Synset;
@@ -73,10 +75,11 @@
*
* @return The OSCC context of the word to disambiguate
*/
- @Override public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int ngram, int windowSize, ArrayList<String> model) {
+ @Override
+ public String[] getContext(int index, String[] toks, String[] tags,
+ String[] lemmas, int ngram, int windowSize, List<String> model) {
- HashSet<String> surroundingContextClusters = new HashSet<>();
+ Set<String> surroundingContextClusters = new HashSet<>();
surroundingContextClusters.addAll(Arrays.asList(
extractSurroundingContext(index, toks, tags, lemmas,
windowSize)));
@@ -96,8 +99,8 @@
return serializedFeatures;
}
- public String[] getContext(WSDSample sample, int ngram, int windowSize,
- ArrayList<String> model) {
+ @Override
+ public String[] getContext(WSDSample sample, int ngram, int windowSize, List<String> model) {
return getContext(sample.getTargetPosition(), sample.getSentence(),
sample.getTags(), sample.getLemmas(), 0, windowSize, model);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
index e84b72e..0c685c0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/SynNode.java
@@ -83,7 +83,7 @@
}
for (int i = 0; i < phypernyms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) phypernyms.get(i);
+ PointerTargetNode ptn = phypernyms.get(i);
this.hypernyms.add(ptn.getSynset());
}
@@ -102,7 +102,7 @@
}
for (int i = 0; i < pmeronyms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pmeronyms.get(i);
+ PointerTargetNode ptn = pmeronyms.get(i);
this.meronyms.add(ptn.getSynset());
}
}
@@ -120,7 +120,7 @@
}
for (int i = 0; i < pholonyms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pholonyms.get(i);
+ PointerTargetNode ptn = pholonyms.get(i);
this.holonyms.add(ptn.getSynset());
}
@@ -139,7 +139,7 @@
}
for (int i = 0; i < phyponyms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) phyponyms.get(i);
+ PointerTargetNode ptn = phyponyms.get(i);
this.hyponyms.add(ptn.getSynset());
}
}
@@ -157,7 +157,7 @@
}
for (int i = 0; i < pentailments.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pentailments.get(i);
+ PointerTargetNode ptn = pentailments.get(i);
this.entailments.add(ptn.getSynset());
}
@@ -176,7 +176,7 @@
}
for (int i = 0; i < pcoordinateTerms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pcoordinateTerms.get(i);
+ PointerTargetNode ptn = pcoordinateTerms.get(i);
this.coordinateTerms.add(ptn.getSynset());
}
@@ -195,7 +195,7 @@
}
for (int i = 0; i < pcauses.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pcauses.get(i);
+ PointerTargetNode ptn = pcauses.get(i);
this.causes.add(ptn.getSynset());
}
@@ -214,7 +214,7 @@
}
for (int i = 0; i < pattributes.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) pattributes.get(i);
+ PointerTargetNode ptn = pattributes.get(i);
this.attributes.add(ptn.getSynset());
}
@@ -233,7 +233,7 @@
}
for (int i = 0; i < ppertainyms.size(); i++) {
- PointerTargetNode ptn = (PointerTargetNode) ppertainyms.get(i);
+ PointerTargetNode ptn = ppertainyms.get(i);
this.pertainyms.add(ptn.getSynset());
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDContextGenerator.java
index 31e1dd3..3d717cc 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDContextGenerator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDContextGenerator.java
@@ -19,17 +19,16 @@
package opennlp.tools.disambiguator;
-import java.util.ArrayList;
+import java.util.List;
/**
* Interface for {@link WSDisambiguator} context generators.
*/
public interface WSDContextGenerator {
- public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int ngram, int windowSize, ArrayList<String> model);
+ String[] getContext(int index, String[] toks, String[] tags,
+ String[] lemmas, int ngram, int windowSize, List<String> model);
- public String[] getContext(WSDSample sample, int ngram, int windowSize,
- ArrayList<String> model);
+ String[] getContext(WSDSample sample, int ngram, int windowSize, List<String> model);
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluationMonitor.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluationMonitor.java
index 36369c6..7881b37 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluationMonitor.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluationMonitor.java
@@ -19,8 +19,7 @@
import opennlp.tools.util.eval.EvaluationMonitor;
-public interface WSDEvaluationMonitor extends
- EvaluationMonitor<WSDSample> {
+public interface WSDEvaluationMonitor extends EvaluationMonitor<WSDSample> {
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
index 03a0af3..2880be0 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
@@ -17,22 +17,26 @@
package opennlp.tools.disambiguator;
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;
import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.lemmatizer.SimpleLemmatizer;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
@@ -41,7 +45,7 @@
protected static TokenizerME tokenizer;
protected static POSTaggerME tagger;
- protected static SimpleLemmatizer lemmatizer;
+ protected static DictionaryLemmatizer lemmatizer;
protected static Dictionary dictionary;
protected static MorphologicalProcessor morph;
@@ -50,12 +54,12 @@
protected static String lemmatizerDictionaryPath;
// local caches for faster lookup
- private static HashMap<String, Object> stemCache;
- private static HashMap<String, Object> stopCache;
- private static HashMap<String, Object> relvCache;
+ private static Map<String, Object> stemCache;
+ private static Map<String, Object> stopCache;
+ private static Map<String, Object> relvCache;
- private static HashMap<String, Object> englishWords;
- private static HashMap<String, Object> nonRelevWordsDef;
+ private static Map<String, Object> englishWords;
+ private static Map<String, Object> nonRelevWordsDef;
// List of all the PoS tags
public static String[] allPOS = { "CC", "CD", "DT", "EX", "FW", "IN", "JJ",
@@ -68,11 +72,11 @@
"RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" };
// List of Negation Words
- public static ArrayList<String> negationWords = new ArrayList<String>(
+ public static List<String> negationWords = new ArrayList<>(
Arrays.asList("not", "no", "never", "none", "nor", "non"));
// List of Stop Words
- public static ArrayList<String> stopWords = new ArrayList<String>(
+ public static List<String> stopWords = new ArrayList<>(
Arrays.asList("a", "able", "about", "above", "according", "accordingly",
"across", "actually", "after", "afterwards", "again", "against",
"ain't", "all", "allow", "allows", "almost", "alone", "along",
@@ -153,9 +157,9 @@
"would", "wouldn't", "yes", "yet", "you", "you'd", "you'll", "your",
"you're", "yours", "yourself", "yourselves", "you've", "zero"));
- public static HashMap<String, Object> getRelvCache() {
+ public static Map<String, Object> getRelvCache() {
if (relvCache == null || relvCache.keySet().isEmpty()) {
- relvCache = new HashMap<String, Object>();
+ relvCache = new HashMap<>();
for (String t : relevantPOS) {
relvCache.put(t, null);
}
@@ -163,9 +167,9 @@
return relvCache;
}
- public static HashMap<String, Object> getStopCache() {
+ public static Map<String, Object> getStopCache() {
if (stopCache == null || stopCache.keySet().isEmpty()) {
- stopCache = new HashMap<String, Object>();
+ stopCache = new HashMap<>();
for (String s : stopWords) {
stopCache.put(s, null);
}
@@ -173,17 +177,17 @@
return stopCache;
}
- public static HashMap<String, Object> getStemCache() {
+ public static Map<String, Object> getStemCache() {
if (stemCache == null || stemCache.keySet().isEmpty()) {
- stemCache = new HashMap<String, Object>();
+ stemCache = new HashMap<>();
for (Object pos : POS.getAllPOS()) {
- stemCache.put(((POS) pos).getKey(), new HashMap());
+ stemCache.put(((POS) pos).getKey(), new HashMap<String, Object>());
}
}
return stemCache;
}
- public static HashMap<String, Object> getEnglishWords() {
+ public static Map<String, Object> getEnglishWords() {
if (englishWords == null || englishWords.keySet().isEmpty()) {
englishWords = getEnglishWords(lemmatizerDictionaryPath);
}
@@ -191,16 +195,16 @@
}
/**
- * This initializes the Hashmap of non relevant words definitions, and returns
- * the definition of the non relevant word based on its pos-tag
+ * This initializes the Hashmap of irrelevant words definitions, and returns
+ * the definition of the irrelevant word based on its pos-tag
*
* @param posTag
- * the pos-tag of the non relevant word
+ * the pos-tag of the irrelevant word
* @return the definition of the word
*/
public static String getNonRelevWordsDef(String posTag) {
if (nonRelevWordsDef == null || nonRelevWordsDef.keySet().isEmpty()) {
- nonRelevWordsDef = new HashMap<String, Object>();
+ nonRelevWordsDef = new HashMap<>();
nonRelevWordsDef.put("CC", "coordinating conjunction");
nonRelevWordsDef.put("CD", "cardinal number");
@@ -262,14 +266,26 @@
return dictionary;
}
- public static SimpleLemmatizer getLemmatizer() {
+ public static DictionaryLemmatizer getLemmatizer() {
+ if (lemmatizerDictionaryPath == null) {
+ throw new IllegalStateException("Loading a Lemmatizer is not possible without setting the " +
+ "corresponding model file!");
+ }
if (lemmatizer == null) {
+ final InputStream resource;
try {
- lemmatizer = new SimpleLemmatizer(new FileInputStream(
- lemmatizerDictionaryPath));
+ if (lemmatizerDictionaryPath.endsWith(".dict.gz")) {
+ resource = new GZIPInputStream(new FileInputStream(lemmatizerDictionaryPath));
+ } else {
+ resource = new FileInputStream(lemmatizerDictionaryPath);
+ }
+ try (InputStream in = new BufferedInputStream(resource)) {
+ lemmatizer = new DictionaryLemmatizer(in);
+ }
} catch (IOException e) {
- e.printStackTrace();
+ throw new RuntimeException("Error opening or loading a Lemmatizer from specified resource file!", e);
}
+
}
return lemmatizer;
@@ -306,7 +322,7 @@
return getTagger();
}
- public static SimpleLemmatizer loadLemmatizer(String path) {
+ public static DictionaryLemmatizer loadLemmatizer(String path) {
lemmatizerDictionaryPath = path;
return getLemmatizer();
}
@@ -319,8 +335,7 @@
}
// Print a text in the console
- public static void printResults(WSDisambiguator disambiguator,
- String result) {
+ public static void printResults(WSDisambiguator disambiguator, String result) {
if (result != null) {
@@ -328,7 +343,7 @@
String sensekey;
if (disambiguator instanceof Lesk) {
- Double score;
+ double score;
parts = result.split(" ");
sensekey = parts[1];
@@ -429,20 +444,17 @@
* this file is the same that is used in the simple Lemmatizer
* (i.e.,"en-lemmatizer.dict")
*
- * @return a list of all the English words
+ * @return a Map of all the English words
*/
- public static HashMap<String, Object> getEnglishWords(String dict) {
+ public static Map<String, Object> getEnglishWords(String dict) {
- HashMap<String, Object> words = new HashMap<String, Object>();
-
- BufferedReader br = null;
+ Map<String, Object> words = new HashMap<>();
File file = new File(lemmatizerDictionaryPath);
if (file.exists()) {
- try {
- br = new BufferedReader(new FileReader(file));
+ try (BufferedReader br = new BufferedReader(new FileReader(file))) {
String line = br.readLine();
while (line != null) {
line = br.readLine();
@@ -451,18 +463,8 @@
words.put(word, null);
}
}
- } catch (FileNotFoundException e) {
- e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
}
return words;
} else {
@@ -480,14 +482,10 @@
*/
public static POS getPOS(String posTag) {
- ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ",
- "JJR", "JJS"));
- ArrayList<String> adverb = new ArrayList<String>(Arrays.asList("RB", "RBR",
- "RBS"));
- ArrayList<String> noun = new ArrayList<String>(Arrays.asList("NN", "NNS",
- "NNP", "NNPS"));
- ArrayList<String> verb = new ArrayList<String>(Arrays.asList("VB", "VBD",
- "VBG", "VBN", "VBP", "VBZ"));
+ List<String> adjective = Arrays.asList("JJ", "JJR", "JJS");
+ List<String> adverb = Arrays.asList("RB", "RBR", "RBS");
+ List<String> noun = Arrays.asList("NN", "NNS", "NNP", "NNPS");
+ List<String> verb = Arrays.asList("VB", "VBD", "VBG", "VBN", "VBP", "VBZ");
if (adjective.contains(posTag))
return POS.ADJECTIVE;
@@ -606,7 +604,7 @@
public static ArrayList<WordPOS> getAllRelevantWords(String[] sentence) {
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
+ ArrayList<WordPOS> relevantWords = new ArrayList<>();
String[] tags = WSDHelper.getTagger().tag(sentence);
@@ -622,7 +620,7 @@
}
/**
- * Stem a single word with WordNet dictionnary
+ * Stem a single word with WordNet dictionary.
*
* @param wordToStem
* word to be stemmed
@@ -631,7 +629,7 @@
public static ArrayList<String> StemWordWithWordNet(WordPOS wordToStem) {
if (wordToStem == null)
return null;
- ArrayList<String> stems = new ArrayList<String>();
+ ArrayList<String> stems = new ArrayList<>();
try {
for (Object pos : POS.getAllPOS()) {
stems.addAll(WSDHelper.getMorph().lookupAllBaseForms((POS) pos,
@@ -671,8 +669,7 @@
return null;
}
- ArrayList<String> stemList = (ArrayList<String>) posMap.get(wordToStem
- .getWord());
+ ArrayList<String> stemList = (ArrayList<String>) posMap.get(wordToStem.getWord());
if (stemList != null) { // return it if we already cached it
return stemList;
@@ -685,7 +682,7 @@
WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
return stemList;
} else { // could not be stemmed add it anyway (as it is)
- stemList = new ArrayList<String>();
+ stemList = new ArrayList<>();
stemList.add(wordToStem.getWord());
posMap.put(wordToStem.getWord(), stemList);
WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java
index 1041fec..c2ce95d 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDModel.java
@@ -23,6 +23,7 @@
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import java.util.Map;
import java.util.Properties;
@@ -45,12 +46,12 @@
private static final String NGRAM = "ngram";
private static final String CONTEXT = "context";
- private ArrayList<String> contextEntries = new ArrayList<String>();
+ private List<String> contextEntries = new ArrayList<>();
private String wordTag;
private int windowSize;
private int ngram;
- public ArrayList<String> getContextEntries() {
+ public List<String> getContextEntries() {
return contextEntries;
}
@@ -145,7 +146,7 @@
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
String surroundings = (String) manifest.get(CONTEXT);
- this.contextEntries = new ArrayList(Arrays.asList(surroundings.split(",")));
+ this.contextEntries = Arrays.asList(surroundings.split(","));
this.wordTag = (String) manifest.get(WORDTAG);
this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
this.ngram = Integer.parseInt((String) manifest.get(NGRAM));
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
index 0120b05..ed06aae 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
@@ -25,25 +25,19 @@
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
public class WSDSampleStream extends FilterObjectStream<String, WSDSample> {
- private static Logger logger = Logger.getLogger(WSDSampleStream.class
- .getName());
+ private static Logger logger = Logger.getLogger(WSDSampleStream.class.getName());
/**
* Initializes the current instance.
*
* @param sentences
- * reader with sentences
+ * An {@link ObjectStream} with sentences
* @throws IOException
* IOException
*/
- public WSDSampleStream(Reader sentences) throws IOException {
- super(new PlainTextByLineStream(sentences));
- }
-
public WSDSampleStream(ObjectStream<String> sentences) {
super(sentences);
}
@@ -54,9 +48,9 @@
* If an error occurs an empty {@link WSDSample} object is returned and an
* warning message is logged. Usually it does not matter if one of many
* sentences is ignored.
- *
- * TODO: An exception in error case should be thrown.
*/
+ // TODO: An exception in error case should be thrown.
+ @Override
public WSDSample read() throws IOException {
String sentence = samples.read();
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
index 096b788..01d4bb3 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguatorME.java
@@ -21,7 +21,6 @@
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.TrainingParameters;
@@ -30,6 +29,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
public class WSDisambiguatorME extends WSDisambiguator {
@@ -64,12 +64,12 @@
ArrayList<String> surroundingContext = buildSurroundingContext(samples,
((WSDDefaultParameters) params).getWindowSize());
- HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
+ HashMap<String, String> manifestInfoEntries = new HashMap<>();
- MaxentModel meModel = null;
+ MaxentModel meModel;
- ArrayList<Event> events = new ArrayList<Event>();
- ObjectStream<Event> es = null;
+ List<Event> events = new ArrayList<>();
+ ObjectStream<Event> es;
WSDSample sample = samples.read();
String wordTag = "";
@@ -86,8 +86,7 @@
}
es = ObjectStreamUtils.createObjectStream(events);
- EventTrainer trainer = TrainerFactory
- .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
+ EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
meModel = trainer.train(es);
@@ -132,9 +131,6 @@
if (file.exists() && !file.isDirectory()) {
try {
setModel(new WSDModel(file));
-
- } catch (InvalidFormatException e) {
- e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
index 8fb2045..d9db0de 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordSense.java
@@ -19,10 +19,7 @@
package opennlp.tools.disambiguator;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.SynNode;
-
-public class WordSense implements Comparable {
+public class WordSense implements Comparable<WordSense> {
protected WSDSample sample;
protected SynNode node;
@@ -71,7 +68,8 @@
this.id = id;
}
- public int compareTo(Object o) {
+ @Override
+ public int compareTo(WordSense o) {
return (this.score - ((WordSense) o).score) < 0 ? 1 : -1;
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
index e0decf2..55af1a9 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
@@ -27,6 +27,8 @@
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.postag.POSTagger;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
@@ -200,7 +202,7 @@
*/
private ArrayList<WSDSample> getSemcorOneFileData(String file, String wordTag) {
- ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>();
+ ArrayList<WSDSample> setInstances = new ArrayList<>();
try {
@@ -238,21 +240,19 @@
+ isentences.get(j + 1).toString();
index = isentences.get(j - 1).getIwords().size() + k;
}
- ArrayList<String> senses = new ArrayList<String>();
+ ArrayList<String> senses = new ArrayList<>();
String sense = iword.getLexsn();
if (sense != null) {
senses.add(sense);
}
if (!senses.isEmpty()) {
- String[] words = sentence.split("\\s");
- String[] tags = WSDHelper.getTagger().tag(words);
- String[] lemmas = new String[words.length];
+ final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
+ final POSTagger tagger = WSDHelper.getTagger();
- for (int i = 0; i < words.length; i++) {
- lemmas[i] = WSDHelper.getLemmatizer().lemmatize(words[i],
- tags[i]);
- }
+ final String[] words = sentence.split("\\s");
+ final String[] tags = tagger.tag(words);
+ String[] lemmas = lemmatizer.lemmatize(words, tags);
WSDSample wtd = new WSDSample(words, tags, lemmas, index, senses.toArray(new String[0]));
setInstances.add(wtd);
@@ -285,7 +285,7 @@
*/
private ArrayList<WSDSample> getSemcorFolderData(String folder, String wordTag) {
- ArrayList<WSDSample> result = new ArrayList<WSDSample>();
+ ArrayList<WSDSample> result = new ArrayList<>();
String directory = semcorDirectory + folder + tagfiles;
File tempFolder = new File(directory);
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
index 9dfbb94..d1f9662 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
@@ -19,18 +19,25 @@
package opennlp.tools.disambiguator.datareader;
+import java.io.BufferedInputStream;
import java.io.BufferedReader;
-import java.io.File;
+import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Collections;
import java.util.Arrays;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.postag.POSTagger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -42,16 +49,17 @@
import opennlp.tools.util.ObjectStreamUtils;
/**
- * This class handles the extraction of Senseval-3 data from the different files
- * (training data, dictionary instances, etc.)
+ * This class handles the extraction of
+ * <a href="https://web.eecs.umich.edu/~mihalcea/senseval/senseval3/data.html">Senseval-3</a>
+ * data from the different files (training data, dictionary instances, etc.)
*/
public class SensevalReader {
- protected String sensevalDirectory = "src/test/resources/senseval3/";
+ private String sensevalDirectory = "src/test/resources/senseval3/";
- protected String data = sensevalDirectory + "EnglishLS.train";
- protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
- protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+ private String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+ private String data = sensevalDirectory + "EnglishLS.train.gz";
+ private String wordList = sensevalDirectory + "EnglishLS.train.key.gz";
public String getSensevalDirectory() {
return sensevalDirectory;
@@ -73,15 +81,12 @@
* This extracts the equivalent senses. This serves in the case of the
* coarse-grained disambiguation
*
- * @param sensemapFile
- * the file containing the equivalent senses, each set of equivalent
- * senses per line
* @return a {@link HashMap} containing the new sense ID ({@link Integer}) and
* an {@link ArrayList} of the equivalent senses original IDs
*/
public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
- HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();
+ HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<>();
try (BufferedReader wordsList = new BufferedReader(new FileReader(
sensemapFile))) {
@@ -94,7 +99,7 @@
String[] temp = line.split("\\s");
- ArrayList<String> tempSenses = new ArrayList<String>();
+ ArrayList<String> tempSenses = new ArrayList<>();
for (String sense : temp) {
if (sense.length() > 1) {
@@ -123,20 +128,26 @@
*/
public ArrayList<String> getSensevalWords() {
- ArrayList<String> wordTags = new ArrayList<String>();
+ ArrayList<String> wordTags = new ArrayList<>();
- try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {
+ final InputStream resource;
+ try {
+ if (wordList.endsWith(".train.key.gz")) {
+ resource = new GZIPInputStream(new FileInputStream(wordList));
+ } else {
+ resource = new FileInputStream(wordList);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Error opening or loading Senseval wordlist from specified resource file!", e);
+ }
+ try (BufferedReader br = new BufferedReader(new InputStreamReader(resource))) {
String line;
-
while ((line = br.readLine()) != null) {
-
String word = line.split("\\s")[0];
-
if (!wordTags.contains(word)) {
wordTags.add(word);
}
-
}
} catch (IOException e) {
@@ -159,14 +170,23 @@
*/
public ArrayList<WSDSample> getSensevalData(String wordTag) {
- ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>();
+ ArrayList<WSDSample> setInstances = new ArrayList<>();
+ final InputStream resource;
try {
+ if (data.endsWith(".train.gz")) {
+ resource = new GZIPInputStream(new FileInputStream(data));
+ } else {
+ resource = new FileInputStream(data);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Error opening or loading Senseval data from specified resource file!", e);
+ }
- File xmlFile = new File(data);
+ try (InputStream xmlFileInputStream = new BufferedInputStream(resource)) {
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
- Document doc = dBuilder.parse(xmlFile);
+ Document doc = dBuilder.parse(xmlFileInputStream);
doc.getDocumentElement().normalize();
@@ -188,7 +208,7 @@
Node nInstance = nInstances.item(j);
if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
- ArrayList<String> senseIDs = new ArrayList<String>();
+ ArrayList<String> senseIDs = new ArrayList<>();
String rawWord = "";
String[] finalText = null;
int index = 0;
@@ -218,29 +238,26 @@
String textAfter = nChild.getChildNodes().item(2)
.getTextContent();
- ArrayList<String> textBeforeTokenzed = new ArrayList<String>(
- Arrays.asList(textBefore.split("\\s")));
- ArrayList<String> textAfterTokenzed = new ArrayList<String>(
- Arrays.asList(textAfter.split("\\s")));
+ List<String> textBeforeTokenized = Arrays.asList(textBefore.split("\\s"));
+ List<String> textAfterTokenized = Arrays.asList(textAfter.split("\\s"));
- textBeforeTokenzed.removeAll(Collections.singleton(null));
- textBeforeTokenzed.removeAll(Collections.singleton(""));
+ textBeforeTokenized.removeAll(Collections.singleton(null));
+ textBeforeTokenized.removeAll(Collections.singleton(""));
+ textAfterTokenized.removeAll(Collections.singleton(null));
+ textAfterTokenized.removeAll(Collections.singleton(""));
- textAfterTokenzed.removeAll(Collections.singleton(null));
- textAfterTokenzed.removeAll(Collections.singleton(""));
-
- finalText = new String[textBeforeTokenzed.size() + 1
- + textAfterTokenzed.size()];
+ finalText = new String[textBeforeTokenized.size() + 1
+ + textAfterTokenized.size()];
int l = 0;
- for (String tempWord : textBeforeTokenzed) {
+ for (String tempWord : textBeforeTokenized) {
finalText[l] = tempWord;
l++;
}
index = l;
finalText[l] = rawWord.toLowerCase();
l++;
- for (String tempWord : textAfterTokenzed) {
+ for (String tempWord : textAfterTokenized) {
finalText[l] = tempWord;
l++;
}
@@ -249,27 +266,20 @@
}
}
+ final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
+ final POSTagger tagger = WSDHelper.getTagger();
- String[] words = finalText;
- String[] tags = WSDHelper.getTagger().tag(words);
- String[] lemmas = new String[words.length];
+ final String[] words = finalText;
+ final String[] tags = tagger.tag(finalText);
+ String[] lemmas = lemmatizer.lemmatize(words, tags);
- for (int k = 0; k < words.length; k++) {
- lemmas[k] = WSDHelper.getLemmatizer().lemmatize(words[k],
- tags[k]);
- }
-
- WSDSample wtd = new WSDSample(words, tags, lemmas, index,
- senseIDs.toArray(new String[0]));
+ WSDSample wtd = new WSDSample(words, tags, lemmas, index, senseIDs.toArray(new String[0]));
setInstances.add(wtd);
}
}
-
}
-
}
-
}
} catch (Exception e) {
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/AbstractEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/AbstractEvaluatorTest.java
new file mode 100644
index 0000000..57840eb
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/AbstractEvaluatorTest.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import org.junit.BeforeClass;
+
+public abstract class AbstractEvaluatorTest {
+
+ private static final String MODELS_DIR = "src/test/resources/models/";
+
+ @BeforeClass
+ public static void initEnv() {
+ WSDHelper.loadTokenizer(MODELS_DIR + "en-token.bin");
+ WSDHelper.loadTagger(MODELS_DIR + "en-pos-maxent.bin");
+ WSDHelper.loadLemmatizer(MODELS_DIR + "en-lemmatizer.dict.gz");
+ }
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
index 9e4310a..114306b 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
@@ -21,25 +21,23 @@
import java.util.ArrayList;
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-
+import org.junit.Ignore;
import org.junit.Test;
-public class LeskEvaluatorTest {
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+
+public class LeskEvaluatorTest extends AbstractEvaluatorTest {
static SensevalReader seReader = new SensevalReader();
@Test
- public static void main(String[] args) {
+ @Ignore // TODO OPENNLP-1446: Investigate why test fails while parsing 'EnglishLS.train'
+ public void testEvaluation() {
WSDHelper.print("Evaluation Started");
- String modelsDir = "src/test/resources/models/";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
Lesk lesk = new Lesk();
LeskParameters leskParams = new LeskParameters();
- boolean a[] = { true, true, true, true, true, false, false, false, false,
- false };
+ boolean a[] = { true, true, true, true, true, false, false, false, false, false };
leskParams.setFeatures(a);
leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_CTXT);
lesk.setParams(leskParams);
@@ -69,5 +67,4 @@
}
}
-
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
index 0ef0091..46cb313 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
@@ -21,15 +21,16 @@
import static org.junit.Assert.assertEquals;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
-import opennlp.tools.disambiguator.LeskParameters.LESK_TYPE;
-import opennlp.tools.util.Span;
-
import org.junit.BeforeClass;
import org.junit.Test;
+import opennlp.tools.disambiguator.LeskParameters.LESK_TYPE;
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.util.Span;
+
/**
* This is the test class for {@link Lesk}.
*
@@ -58,9 +59,9 @@
static String[] tags2;
static String[] tags3;
- static String[] lemmas1;
- static String[] lemmas2;
- static String[] lemmas3;
+ static List<List<String>> lemmas1;
+ static List<List<String>> lemmas2;
+ static List<List<String>> lemmas3;
/*
* Setup the testing variables
@@ -68,9 +69,9 @@
@BeforeClass
public static void setUp() {
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict.gz");
sentence1 = WSDHelper.getTokenizer().tokenize(test1);
sentence2 = WSDHelper.getTokenizer().tokenize(test2);
@@ -80,33 +81,16 @@
tags2 = WSDHelper.getTagger().tag(sentence2);
tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- tempLemmas1
- .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
- }
- lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- tempLemmas2
- .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
- }
- lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
-
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- tempLemmas3
- .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
- }
- lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+ final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
+ lemmas1 = lemmatizer.lemmatize(Arrays.asList(sentence1), Arrays.asList(tags1));
+ lemmas2 = lemmatizer.lemmatize(Arrays.asList(sentence2), Arrays.asList(tags2));
+ lemmas3 = lemmatizer.lemmatize(Arrays.asList(sentence3), Arrays.asList(tags3));
lesk = new Lesk();
LeskParameters params = new LeskParameters();
params.setLeskType(LESK_TYPE.LESK_EXT);
- boolean a[] = { true, true, true, true, true, true, true, true, true,
- true };
+ boolean a[] = { true, true, true, true, true, true, true, true, true, true };
params.setFeatures(a);
lesk.setParams(params);
}
@@ -116,7 +100,7 @@
*/
@Test
public void testOneWordDisambiguation() {
- String sense = lesk.disambiguate(sentence1, tags1, lemmas1, 8);
+ String sense = lesk.disambiguate(sentence1, tags1, lemmas1.get(0).toArray(new String[0]), 8);
assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00:: -1", sense);
}
@@ -128,7 +112,7 @@
@Test
public void testWordSpanDisambiguation() {
Span span = new Span(3, 7);
- List<String> senses = lesk.disambiguate(sentence2, tags2, lemmas2, span);
+ List<String> senses = lesk.disambiguate(sentence2, tags2, lemmas2.get(0).toArray(new String[0]), span);
assertEquals("Check number of returned words", 5, senses.size());
assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01:: 3.8",
@@ -144,7 +128,7 @@
*/
@Test
public void testAllWordsDisambiguation() {
- List<String> senses = lesk.disambiguate(sentence3, tags3, lemmas3);
+ List<String> senses = lesk.disambiguate(sentence3, tags3, lemmas3.get(0).toArray(new String[0]));
assertEquals("Check number of returned words", 15, senses.size());
assertEquals("Check preposition", "WSDHELPER personal pronoun",
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
index 098c096..1039338 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
@@ -21,24 +21,21 @@
import java.util.ArrayList;
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.disambiguator.MFS;
-
+import org.junit.Ignore;
import org.junit.Test;
-public class MFSEvaluatorTest {
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+
+public class MFSEvaluatorTest extends AbstractEvaluatorTest {
static SensevalReader seReader = new SensevalReader();
@Test
- public static void main(String[] args) {
+ @Ignore // TODO OPENNLP-1446: Investigate why test fails while parsing 'EnglishLS.train'
+ public void testEvaluation() {
WSDHelper.print("Evaluation Started");
- String modelsDir = "src/test/resources/models/";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- MFS mfs = new MFS();
+ MFS mfs = new MFS();
ArrayList<String> words = seReader.getSensevalWords();
for (String word : words) {
@@ -62,9 +59,7 @@
WSDHelper.print("null instances");
}
}
-
}
-
}
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
index c6ca4b0..20c418b 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
@@ -21,12 +21,13 @@
import static org.junit.Assert.assertEquals;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
+
import org.junit.BeforeClass;
import org.junit.Test;
-import opennlp.tools.disambiguator.MFS;
+import opennlp.tools.lemmatizer.Lemmatizer;
import opennlp.tools.util.Span;
/**
@@ -58,9 +59,9 @@
static String[] tags2;
static String[] tags3;
- static String[] lemmas1;
- static String[] lemmas2;
- static String[] lemmas3;
+ static List<List<String>> lemmas1;
+ static List<List<String>> lemmas2;
+ static List<List<String>> lemmas3;
/*
* Setup the testing variables and the training files
@@ -68,9 +69,9 @@
@BeforeClass
public static void setUpAndTraining() {
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict.gz");
sentence1 = WSDHelper.getTokenizer().tokenize(test1);
sentence2 = WSDHelper.getTokenizer().tokenize(test2);
@@ -80,26 +81,10 @@
tags2 = WSDHelper.getTagger().tag(sentence2);
tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- tempLemmas1
- .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
- }
- lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- tempLemmas2
- .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
- }
- lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
-
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- tempLemmas3
- .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
- }
- lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+ final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
+ lemmas1 = lemmatizer.lemmatize(Arrays.asList(sentence1), Arrays.asList(tags1));
+ lemmas2 = lemmatizer.lemmatize(Arrays.asList(sentence2), Arrays.asList(tags2));
+ lemmas3 = lemmatizer.lemmatize(Arrays.asList(sentence3), Arrays.asList(tags3));
mfs = new MFS();
@@ -110,7 +95,7 @@
*/
@Test
public void testOneWordDisambiguation() {
- String sense = mfs.disambiguate(sentence1, tags1, lemmas1, 8);
+ String sense = mfs.disambiguate(sentence1, tags1, lemmas1.get(0).toArray(new String[0]), 8);
assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
}
@@ -122,7 +107,7 @@
@Test
public void testWordSpanDisambiguation() {
Span span = new Span(3, 7);
- List<String> senses = mfs.disambiguate(sentence2, tags2, lemmas2, span);
+ List<String> senses = mfs.disambiguate(sentence2, tags2, lemmas2.get(0).toArray(new String[0]), span);
assertEquals("Check number of returned words", 5, senses.size());
assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
@@ -138,7 +123,7 @@
*/
@Test
public void testAllWordsDisambiguation() {
- List<String> senses = mfs.disambiguate(sentence3, tags3, lemmas3);
+ List<String> senses = mfs.disambiguate(sentence3, tags3, lemmas3.get(0).toArray(new String[0]));
assertEquals("Check number of returned words", 15, senses.size());
assertEquals("Check preposition", "WSDHELPER personal pronoun",
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
index 3b43d99..d6f37b3 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDEvaluatorTest.java
@@ -23,6 +23,9 @@
import java.io.IOException;
import java.util.ArrayList;
+import org.junit.Ignore;
+import org.junit.Test;
+
import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.util.ObjectStream;
@@ -32,11 +35,10 @@
import static org.junit.Assert.fail;
// TODO improve the tests improve parameters
-public class WSDEvaluatorTest {
+public class WSDEvaluatorTest extends AbstractEvaluatorTest {
static SensevalReader seReader;
- static String modelsDir = "src/test/resources/models/";
static String trainingDataDirectory = "src/test/resources/supervised/models/";
static WSDDefaultParameters params = new WSDDefaultParameters("");
@@ -44,14 +46,12 @@
static WSDModel model;
static ArrayList<String> testWords;
+
+ @Test
+ @Ignore // TODO OPENNLP-1446: Investigate why test fails while parsing 'EnglishLS.train'
+ public void testTraining() {
- /*
- * Setup the testing variables
- */
- public static void setUpAndTraining() {
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+ WSDHelper.print("Evaluation Started");
seReader = new SensevalReader();
testWords = seReader.getSensevalWords();
@@ -72,10 +72,10 @@
ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(word);
WSDModel writeModel = null;
- /*
- * Tests training the disambiguator We test both writing and reading a model
- * file trained by semcor
- */
+ /*
+ * Tests training the disambiguator. We test both writing and reading a model
+ * file trained by semcor.
+ */
File outFile;
try {
writeModel = WSDisambiguatorME
@@ -97,7 +97,9 @@
}
}
- public static void disambiguationEval() {
+ @Test
+ @Ignore // Make this work once we have migrated to JUnit5 in the sandbox components
+ public void testDisambiguationEval() {
WSDHelper.print("Evaluation Started");
@@ -125,8 +127,4 @@
}
}
- public static void main(String[] args) {
- setUpAndTraining();
- disambiguationEval();
- }
}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
index 8470928..36ff2f3 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/WSDTester.java
@@ -23,14 +23,15 @@
import java.io.File;
import java.io.IOException;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
-import opennlp.tools.util.ObjectStream;
import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
@@ -74,17 +75,17 @@
static String[] tags2;
static String[] tags3;
- static String[] lemmas1;
- static String[] lemmas2;
- static String[] lemmas3;
+ static List<List<String>> lemmas1;
+ static List<List<String>> lemmas2;
+ static List<List<String>> lemmas3;
/*
* Setup the testing variables
*/
@BeforeClass public static void setUpAndTraining() {
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict.gz");
sentence1 = WSDHelper.getTokenizer().tokenize(test1);
sentence2 = WSDHelper.getTokenizer().tokenize(test2);
@@ -94,27 +95,11 @@
tags2 = WSDHelper.getTagger().tag(sentence2);
tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- tempLemmas1
- .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
- }
- lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- tempLemmas2
- .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
- }
- lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
-
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- tempLemmas3
- .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
- }
- lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
+ final Lemmatizer lemmatizer = WSDHelper.getLemmatizer();
+ lemmas1 = lemmatizer.lemmatize(Arrays.asList(sentence1), Arrays.asList(tags1));
+ lemmas2 = lemmatizer.lemmatize(Arrays.asList(sentence2), Arrays.asList(tags2));
+ lemmas3 = lemmatizer.lemmatize(Arrays.asList(sentence3), Arrays.asList(tags3));
+
params = new WSDDefaultParameters("");
params.setTrainingDataDirectory(trainingDataDirectory);
TrainingParameters trainingParams = new TrainingParameters();
@@ -140,7 +125,7 @@
assertNotNull("Checking the disambiguator", wsdME);
} catch (IOException e1) {
e1.printStackTrace();
- fail("Exception in training");
+ fail("Exception in training: "+ e1.getMessage());
}
}
@@ -148,7 +133,7 @@
* Tests disambiguating only one word : The ambiguous word "please"
*/
@Test public void testOneWordDisambiguation() {
- String sense = wsdME.disambiguate(sentence1, tags1, lemmas1, 8);
+ String sense = wsdME.disambiguate(sentence1, tags1, lemmas1.get(0).toArray(new String[0]), 8);
assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
}
@@ -159,7 +144,7 @@
*/
@Test public void testWordSpanDisambiguation() {
Span span = new Span(3, 7);
- List<String> senses = wsdME.disambiguate(sentence2, tags2, lemmas2, span);
+ List<String> senses = wsdME.disambiguate(sentence2, tags2, lemmas2.get(0).toArray(new String[0]), span);
assertEquals("Check number of returned words", 5, senses.size());
assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
@@ -174,7 +159,7 @@
* Tests disambiguating all the words
*/
@Test public void testAllWordsDisambiguation() {
- List<String> senses = wsdME.disambiguate(sentence3, tags3, lemmas3);
+ List<String> senses = wsdME.disambiguate(sentence3, tags3, lemmas3.get(0).toArray(new String[0]));
assertEquals("Check number of returned words", 15, senses.size());
assertEquals("Check preposition", "WSDHELPER personal pronoun",
diff --git a/opennlp-wsd/src/test/resources/models/en-lemmatizer.dict.gz b/opennlp-wsd/src/test/resources/models/en-lemmatizer.dict.gz
new file mode 100644
index 0000000..379db49
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/models/en-lemmatizer.dict.gz
Binary files differ
diff --git a/opennlp-wsd/src/test/resources/models/en-pos-maxent.bin b/opennlp-wsd/src/test/resources/models/en-pos-maxent.bin
new file mode 100644
index 0000000..168f259
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/models/en-pos-maxent.bin
Binary files differ
diff --git a/opennlp-wsd/src/test/resources/models/en-token.bin b/opennlp-wsd/src/test/resources/models/en-token.bin
new file mode 100644
index 0000000..eb7d770
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/models/en-token.bin
Binary files differ
diff --git a/opennlp-wsd/src/test/resources/senseval3/EnglishLS.sensemap b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.sensemap
new file mode 100644
index 0000000..2d9c5a7
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.sensemap
@@ -0,0 +1,303 @@
+38202 2 38204
+38201
+38203
+38205
+42601 2 42606
+42602 2 42604
+42603
+42605
+190901
+190902
+190903
+argument%1:09:00::
+argument%1:10:00:: 2 argument%1:10:03::
+argument%1:10:01::
+argument%1:10:02::
+arm%1:06:00::
+arm%1:06:01::
+arm%1:06:02::
+arm%1:06:03::
+arm%1:08:00::
+arm%1:14:00::
+238101 2 238105
+238102 3 238106
+238103 3 238106
+238104
+atmosphere%1:07:00::
+atmosphere%1:15:00:: 2 atmosphere%1:17:00::
+atmosphere%1:23:00::
+atmosphere%1:26:00::
+atmosphere%1:26:01::
+audience%1:10:00:: 2 audience%1:26:00::
+audience%1:14:00:: 2 audience%1:14:01::
+bank%1:04:00::
+bank%1:06:00:: 2 bank%1:14:00::
+bank%1:06:01:: 2 bank%1:21:01::
+bank%1:14:00::
+bank%1:14:01::
+bank%1:17:00::
+bank%1:17:01::
+bank%1:17:02::
+bank%1:21:00::
+369201 2 369203
+369202
+369204
+770001 3 770005
+770002 3 770005
+770003
+770004
+1067501 2 1067502
+1067503
+1067504
+degree%1:07:00:: 2 degree%1:26:01::
+degree%1:07:01::
+degree%1:09:00::
+degree%1:10:00::
+degree%1:23:00:: 2 degree%1:23:03::
+difference%1:07:00:: 2 difference%1:24:00::
+difference%1:10:00::
+difference%1:11:00::
+difference%1:23:00::
+different%3:00:00:: 2 different%3:00:02::
+different%5:00:00:other:00
+different%5:00:00:unusual:00
+different%5:00:01:other:00
+difficulty%1:04:00:: 3 difficulty%1:26:00::
+difficulty%1:07:00::
+difficulty%1:09:02:: 3 difficulty%1:26:00::
+disc%1:06:00::
+disc%1:06:01::
+disc%1:06:03::
+disc%1:25:00::
+1297001 2 1297006
+1297002 2 1297007
+1297003
+1297004
+1297005
+1353101 3 1353104
+1353102
+1353103 3 1353104
+1440301
+1440302
+1440303
+1446801 2 1446802
+1446803
+1446804
+1892101 3 1892105
+1892102 2 1892106
+1892103 3 1892105
+1892104
+1892107
+hot%3:00:01::
+hot%3:00:02::
+hot%5:00:00:active:01
+hot%5:00:00:charged:00
+hot%5:00:00:eager:00
+hot%5:00:00:fast:01
+hot%5:00:00:fresh:01
+hot%5:00:00:good:01
+hot%5:00:00:illegal:00
+hot%5:00:00:lucky:00
+hot%5:00:00:near:00
+hot%5:00:00:new:00
+hot%5:00:00:popular:00
+hot%5:00:00:pungent:00
+hot%5:00:00:radioactive:00
+hot%5:00:00:sexy:00
+hot%5:00:00:skilled:00
+hot%5:00:00:unpleasant:00
+hot%5:00:00:violent:00
+hot%5:00:00:wanted:00
+hot%5:00:00:warm:03
+hot%5:00:02:fast:01
+image%1:06:00::
+image%1:06:01::
+image%1:07:00:: 2 image%1:18:00::
+image%1:09:00::
+image%1:09:02::
+image%1:10:00::
+important%3:00:00:: 2 important%3:00:02::
+important%3:00:02::
+important%3:00:04::
+important%5:00:00:immodest:02
+important%5:00:00:influential:00
+interest%1:04:01:: 2 interest%1:09:00::
+interest%1:07:01::
+interest%1:07:02::
+interest%1:14:00::
+interest%1:21:00::
+interest%1:21:03::
+judgment%1:04:00::
+judgment%1:04:02::
+judgment%1:07:00:: 3 judgment%1:09:00::
+judgment%1:09:01:: 3 judgment%1:09:00::
+judgment%1:09:04::
+judgment%1:10:00::
+2439901 2 2439902
+2439903
+2439904
+2439905
+2439906
+2439907
+2439908
+2439909
+2555501
+2555502
+2555503
+2555504
+2555505
+2555506
+2555507
+2644301 3 2644307
+2644302
+2644303
+2644304 2 2644308
+2644305 3 2644307
+2644306
+2822011
+2822012
+2822013
+2893201 2 2893205
+2893202
+2893203
+2893204
+organization%1:04:00:: 2 organization%1:04:02::
+organization%1:04:01:: 2 organization%1:09:00::
+organization%1:07:00::
+organization%1:14:00:: 2 organization%1:14:01::
+paper%1:06:00:: 2 paper%1:10:03::
+paper%1:10:00:: 2 paper%1:27:00::
+paper%1:10:01:: 2 paper%1:10:02::
+paper%1:14:00::
+party%1:11:00::
+party%1:14:00:: 2 party%1:14:02::
+party%1:14:01::
+party%1:18:00::
+performance%1:04:00:: 2 performance%1:04:03::
+performance%1:04:01:: 2 performance%1:10:00::
+performance%1:22:00::
+plan%1:06:00:: 2 plan%1:09:01::
+plan%1:09:00::
+3165210
+3165211
+3165212 3 3165214
+3165213 3 3165218
+3165215
+3165216
+3165217 3 3165218
+3165219
+3165220 3 3165214
+3165221
+3288301 2 3288306
+3288302
+3288303
+3288304
+3288305
+3313901 3 3313905
+3313902 3 3313905
+3313903
+3313904
+3313906
+3434801 2 3434806
+3434802
+3434803 2 3434807
+3434804
+3434805
+3434808
+3434809
+3477801
+3477802
+3477803
+3597906
+3597907
+3597908
+3597910
+3597911
+shelter%1:06:00::
+shelter%1:06:01::
+shelter%1:21:00::
+shelter%1:26:00::
+simple%3:00:01::
+simple%3:00:02:: 2 simple%5:00:00:easy:01
+simple%5:00:00:naive:00
+simple%5:00:00:plain:01 2 simple%5:00:02:plain:01
+simple%5:00:00:retarded:00
+3893501 4 3893507
+3893502 4 3893507
+3893503
+3893505 4 3893507
+3893508
+3893509
+solid%3:00:01::
+solid%3:00:02::
+solid%5:00:00:cubic:00
+solid%5:00:00:frozen:00
+solid%5:00:00:good:01
+solid%5:00:00:hard:01
+solid%5:00:00:homogeneous:00
+solid%5:00:00:honorable:00
+solid%5:00:00:opaque:00
+solid%5:00:00:plain:02
+solid%5:00:00:sound:01
+solid%5:00:00:unbroken:02
+solid%5:00:00:undiversified:00
+solid%5:00:00:wholesome:00
+sort%1:07:00::
+sort%1:09:00:: 2 sort%1:18:00::
+sort%1:22:00::
+source%1:06:00::
+source%1:09:00::
+source%1:10:00:: 2 source%1:10:01::
+source%1:15:00::
+source%1:18:00::
+source%1:18:01::
+4155301
+4155302 3 4155307
+4155303 3 4155307
+4155304
+4155305
+4155306
+4198501 3 4198506
+4198502
+4198503 3 4198506
+4198504
+4198505
+4198507
+4198508
+4198509
+4380101
+4380102 2 4380108
+4380103
+4380104 2 4380105
+4380106 2 4380109
+4380107
+4530701
+4530702
+4530703
+4530704
+4530705
+4636101 5 4636109
+4636102 5 4636109
+4636103 2 4636110
+4636104 2 4636111
+4636105
+4636106
+4636107 5 4636109
+4636108 5 4636109
+4636112
+4640501 2 4640507
+4640502
+4640503 2 4640504
+4640508
+4640509
+4711401 2 4711403
+4711402
+4711404 2 4711406
+4711405
+4711407
+4753401 3 4753406
+4753402
+4753404 3 4753406
+4753405
+4753407 2 4753403
+4753408
diff --git a/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.gz b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.gz
new file mode 100644
index 0000000..e1abe66
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.gz
Binary files differ
diff --git a/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.key.gz b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.key.gz
new file mode 100644
index 0000000..c1f20d8
--- /dev/null
+++ b/opennlp-wsd/src/test/resources/senseval3/EnglishLS.train.key.gz
Binary files differ