OPENNLP-796 The two readers now return ObjectStream<WSDSample>. Thanks to Mondher Bouazizi for providing a patch.
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
index eb7a2d5..664f7b3 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
@@ -27,6 +27,8 @@
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -34,7 +36,7 @@
import org.w3c.dom.NodeList;
/**
- * This reads one semcor file. It requires the
+ * This class reads Semcor data.
*
*/
public class SemcorReaderExtended {
@@ -62,10 +64,19 @@
private static final String ELEMENT_PUNCTUATION = "punc";
- private static String path = "src\\test\\resources\\semcor3.0\\";
+ private static String semcorDirectory = "src\\test\\resources\\semcor3.0\\";
private static String[] folders = { "brown1", "brown2", "brownv" };
private static String tagfiles = "\\tagfiles\\";
+
+ public static String getSemcorDirectory() {
+ return semcorDirectory;
+ }
+
+ public static void setSemcorDirectory(String semcorDirectory) {
+ SemcorReaderExtended.semcorDirectory = semcorDirectory;
+ }
+
public SemcorReaderExtended() {
super();
}
@@ -73,7 +84,7 @@
/**
* This serves to read one Semcor XML file
*/
- public ArrayList<Sentence> readFile(String file) {
+ private ArrayList<Sentence> readFile(String file) {
ArrayList<Sentence> result = new ArrayList<Sentence>();
@@ -176,8 +187,18 @@
return result;
}
- public ArrayList<WSDSample> getSemcorOneFileData(String file,
- String wordTag) {
+ /**
+ * One Semcor file reader: This reads one Semcor file, and returns
+ * all the instances in the format {@link WSDSample} of a
+ * specific word
+ *
+ * @param file
+ * the name of the file to read
+ * @param wordTag
+ * The word, of which we are looking for the instances
+ * @return the list of the {@link WSDSample} instances
+ */
+ private ArrayList<WSDSample> getSemcorOneFileData(String file, String wordTag) {
ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>();
@@ -227,11 +248,12 @@
String[] words = sentence.split("\\s");
String[] tags = WSDHelper.getTagger().tag(words);
String[] lemmas = new String[words.length];
-
+
for (int i = 0; i < words.length; i++) {
- lemmas[i] = WSDHelper.getLemmatizer().lemmatize(words[i], tags[i]);
+ lemmas[i] = WSDHelper.getLemmatizer().lemmatize(words[i],
+ tags[i]);
}
-
+
WSDSample wtd = new WSDSample(words, tags, lemmas, index, senses);
setInstances.add(wtd);
}
@@ -251,7 +273,7 @@
/**
* One Semcor folder reader: This reads all the files in one semcor folder,
- * and return all the instances in the format {@link WordToDisambiguate} of a
+ * and returns all the instances in the format {@link WSDSample} of a
* specific word
*
* @param folder
@@ -259,13 +281,13 @@
* are ["brown1", "brown2", "brownv"]
* @param wordTag
* The word, of which we are looking for the instances
- * @return the list of the {@link WordToDisambiguate} instances
+ * @return the list of the {@link WSDSample} instances
*/
- public ArrayList<WSDSample> getSemcorFolderData(String folder, String wordTag) {
+ private ArrayList<WSDSample> getSemcorFolderData(String folder, String wordTag) {
ArrayList<WSDSample> result = new ArrayList<WSDSample>();
- String directory = path + folder + tagfiles;
+ String directory = semcorDirectory + folder + tagfiles;
File tempFolder = new File(directory);
File[] listOfFiles;
@@ -273,8 +295,8 @@
listOfFiles = tempFolder.listFiles();
for (File file : listOfFiles) {
- ArrayList<WSDSample> list = getSemcorOneFileData(directory
- + file.getName(), wordTag);
+ ArrayList<WSDSample> list = getSemcorOneFileData(
+ directory + file.getName(), wordTag);
result.addAll(list);
}
}
@@ -285,11 +307,11 @@
/**
* Semcor reader: This reads all the files in semcor, and return all the
- * instances in the format {@link WordToDisambiguate} of a specific word
+ * instances in the format {@link WSDSample} of a specific word
*
* @param wordTag
* The word, of which we are looking for the instances
- * @return the list of the {@link WordToDisambiguate} instances of the word to
+ * @return the list of the {@link WSDSample} instances of the word to
* disambiguate
*/
public ArrayList<WSDSample> getSemcorData(String wordTag) {
@@ -305,4 +327,16 @@
}
+ /**
+ * Semcor reader: This reads all the files in semcor, and returns a stream of
+ * all the instances in the format {@link WSDSample} of a specific word
+ *
+ * @param wordTag
+ * The word, of which we are looking for the instances
+ * @return the stream of {@link WSDSample} of the word to disambiguate
+ */
+ public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) {
+ return ObjectStreamUtils.createObjectStream(getSemcorData(wordTag));
+ }
+
}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
index 464bc36..4e060b9 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
@@ -38,6 +38,8 @@
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
/**
* This class handles the extraction of Senseval-3 data from the different files
@@ -45,13 +47,24 @@
*/
public class SensevalReader {
- private String resourcesFolder = "src\\test\\resources\\";
- protected String sensevalDirectory = resourcesFolder + "senseval3\\";
+ protected String sensevalDirectory = "src\\test\\resources\\senseval3\\";
protected String data = sensevalDirectory + "EnglishLS.train";
protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+ public String getSensevalDirectory() {
+ return sensevalDirectory;
+ }
+
+ public void setSensevalDirectory(String sensevalDirectory) {
+ this.sensevalDirectory = sensevalDirectory;
+
+ this.data = sensevalDirectory + "EnglishLS.train";
+ this.sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+ this.wordList = sensevalDirectory + "EnglishLS.train.key";
+ }
+
public SensevalReader() {
super();
}
@@ -136,12 +149,12 @@
/**
* Main Senseval Reader: This checks if the data corresponding to the words to
- * disambiguate exist in the folder, and extract the
- * {@link WordToDisambiguate} instances
+ * disambiguate exist in the folder, and extract the {@link WSDSample}
+ * instances
*
* @param wordTag
* The word, of which we are looking for the instances
- * @return the list of the {@link WordToDisambiguate} instances of the word to
+ * @return the list of the {@link WSDSample} instances of the word to
* disambiguate
*/
public ArrayList<WSDSample> getSensevalData(String wordTag) {
@@ -267,4 +280,16 @@
}
+ /**
+ * Main Senseval Reader: This checks if the data corresponding to the words to
+ * disambiguate exist in the folder, and extract the {@link WSDSample} instances as a stream
+ *
+ * @param wordTag
+ * The word, of which we are looking for the instances
+ * @return the stream of {@link WSDSample} of the word to disambiguate
+ */
+ public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) {
+ return ObjectStreamUtils.createObjectStream(getSensevalData(wordTag));
+ }
+
}