blob: 4e060b9e96579c14c582130d1e500402ece7a3fd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator.datareader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import opennlp.tools.disambiguator.WSDHelper;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
/**
* This class handles the extraction of Senseval-3 data from the different files
* (training data, dictionary instances, etc.)
*/
public class SensevalReader {

  /** Name of the XML file holding the training instances. */
  private static final String TRAINING_FILE = "EnglishLS.train";

  /** Name of the file listing the groups of equivalent senses. */
  private static final String SENSEMAP_FILE = "EnglishLS.sensemap";

  /** Name of the key file whose first column is the word tag. */
  private static final String KEY_FILE = "EnglishLS.train.key";

  // Forward slashes are accepted by java.io.File on every platform, whereas
  // the previous backslash-separated default only resolved on Windows.
  protected String sensevalDirectory = "src/test/resources/senseval3/";

  protected String data = resolve(sensevalDirectory, TRAINING_FILE);
  protected String sensemapFile = resolve(sensevalDirectory, SENSEMAP_FILE);
  protected String wordList = resolve(sensevalDirectory, KEY_FILE);

  /**
   * Creates a reader that points at the default Senseval directory.
   */
  public SensevalReader() {
    super();
  }

  /**
   * @return the directory the data files are currently resolved against,
   *         exactly as it was supplied
   */
  public String getSensevalDirectory() {
    return sensevalDirectory;
  }

  /**
   * Changes the directory that holds the Senseval files and recomputes the
   * paths of the training, sense-map and key files accordingly.
   *
   * @param sensevalDirectory
   *          the directory containing the data files; a trailing separator is
   *          optional
   */
  public void setSensevalDirectory(String sensevalDirectory) {
    this.sensevalDirectory = sensevalDirectory;
    this.data = resolve(sensevalDirectory, TRAINING_FILE);
    this.sensemapFile = resolve(sensevalDirectory, SENSEMAP_FILE);
    this.wordList = resolve(sensevalDirectory, KEY_FILE);
  }

  /**
   * Joins a directory and a file name, inserting a separator only when the
   * directory does not already end with one.
   *
   * @param directory
   *          the directory; {@code null} or empty yields the bare file name
   * @param fileName
   *          the name of the file inside the directory
   * @return the combined path
   */
  private static String resolve(String directory, String fileName) {
    if (directory == null || directory.isEmpty()) {
      return fileName;
    }
    char last = directory.charAt(directory.length() - 1);
    // Both separator styles are accepted so that directories written for
    // either platform keep producing the same path as plain concatenation.
    if (last == '/' || last == '\\') {
      return directory + fileName;
    }
    return directory + File.separator + fileName;
  }

  /**
   * Extracts the groups of equivalent senses from the sense-map file. This
   * serves in the case of the coarse-grained disambiguation.
   *
   * <p>
   * Every line of the file produces one entry, keyed by its zero-based line
   * number. The value holds the whitespace-separated tokens of that line that
   * are longer than one character. Read errors are printed and the entries
   * collected so far are returned.
   *
   * @return a {@link HashMap} containing the new sense ID ({@link Integer})
   *         and an {@link ArrayList} of the equivalent senses original IDs
   */
  public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
    HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<>();

    try (BufferedReader reader = new BufferedReader(
        new FileReader(sensemapFile))) {
      int index = 0;
      String line;
      while ((line = reader.readLine()) != null) {
        ArrayList<String> senses = new ArrayList<>();
        for (String sense : line.split("\\s")) {
          if (sense.length() > 1) {
            senses.add(sense);
          }
        }
        mappedSenses.put(index, senses);
        index++;
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return mappedSenses;
  }

  /**
   * Returns the list of words available in the Senseval data, i.e. the
   * distinct first columns of the key file in order of first appearance.
   * Blank lines are ignored. Read errors are printed and the words collected
   * so far are returned.
   *
   * @return {@link ArrayList} of the words available on the current Senseval
   *         set
   */
  public ArrayList<String> getSensevalWords() {
    ArrayList<String> wordTags = new ArrayList<>();

    try (BufferedReader reader = new BufferedReader(
        new FileReader(wordList))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
          // A blank line would otherwise be recorded as an empty word.
          continue;
        }
        String word = trimmed.split("\\s")[0];
        if (!wordTags.contains(word)) {
          wordTags.add(word);
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return wordTags;
  }

  /**
   * Main Senseval Reader: parses the training file, looks up the
   * {@code lexelt} element whose {@code item} attribute equals the given word
   * tag and extracts one {@link WSDSample} per instance found under it.
   *
   * <p>
   * Instances without a usable context are skipped. Any failure while
   * parsing or tagging is printed and the samples collected so far are
   * returned.
   *
   * @param wordTag
   *          The word, of which we are looking for the instances
   * @return the list of the {@link WSDSample} instances of the word to
   *         disambiguate
   */
  public ArrayList<WSDSample> getSensevalData(String wordTag) {
    ArrayList<WSDSample> setInstances = new ArrayList<>();

    try {
      DocumentBuilder builder = DocumentBuilderFactory.newInstance()
          .newDocumentBuilder();
      Document doc = builder.parse(new File(data));
      doc.getDocumentElement().normalize();

      NodeList lexelts = doc.getElementsByTagName("lexelt");
      for (int i = 0; i < lexelts.getLength(); i++) {
        Node lexelt = lexelts.item(i);
        if (lexelt.getNodeType() != Node.ELEMENT_NODE
            || !((Element) lexelt).getAttribute("item").equals(wordTag)) {
          continue;
        }

        // Start at 0: the first child is only a whitespace text node when
        // the file is pretty-printed, so it must not be skipped blindly.
        NodeList instances = lexelt.getChildNodes();
        for (int j = 0; j < instances.getLength(); j++) {
          Node instance = instances.item(j);
          if (instance.getNodeType() == Node.ELEMENT_NODE) {
            WSDSample sample = readInstance(instance);
            if (sample != null) {
              setInstances.add(sample);
            }
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }

    return setInstances;
  }

  /**
   * Main Senseval Reader exposed as a stream: wraps the result of
   * {@link #getSensevalData(String)} in an {@link ObjectStream}. Despite its
   * name, this method reads the Senseval data.
   *
   * @param wordTag
   *          The word, of which we are looking for the instances
   * @return the stream of {@link WSDSample} of the word to disambiguate
   */
  public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) {
    return ObjectStreamUtils.createObjectStream(getSensevalData(wordTag));
  }

  /**
   * Builds a sample from one instance element: collects the {@code senseid}
   * of every {@code answer} child and tokenizes the {@code context} child
   * around its target word.
   *
   * @param instance
   *          the instance element
   * @return the sample, or {@code null} when the instance has no context
   *         containing a target-word element
   */
  private WSDSample readInstance(Node instance) {
    ArrayList<String> senseIDs = new ArrayList<>();
    String[] words = null;
    int index = 0;

    NodeList children = instance.getChildNodes();
    for (int k = 0; k < children.getLength(); k++) {
      Node child = children.item(k);
      if (child.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      String name = child.getNodeName();
      if ("answer".equals(name)) {
        // Read the attribute by name: the order of a NamedNodeMap is not
        // guaranteed, so a positional lookup is fragile.
        Element answer = (Element) child;
        if (answer.hasAttribute("senseid")) {
          senseIDs.add(answer.getAttribute("senseid"));
        }
      } else if ("context".equals(name)) {
        NodeList parts = child.getChildNodes();
        int headPos = firstElementIndex(parts);
        if (headPos < 0) {
          continue;
        }

        String rawWord = parts.item(headPos).getTextContent();
        ArrayList<String> tokens = new ArrayList<>();
        addTokens(tokens, textOf(parts, 0, headPos));
        // The target word sits right after the tokens that precede it.
        index = tokens.size();
        tokens.add(rawWord.toLowerCase(Locale.ROOT));
        addTokens(tokens, textOf(parts, headPos + 1, parts.getLength()));
        words = tokens.toArray(new String[0]);
      }
    }

    if (words == null) {
      return null;
    }

    String[] tags = WSDHelper.getTagger().tag(words);
    String[] lemmas = new String[words.length];
    for (int k = 0; k < words.length; k++) {
      lemmas[k] = WSDHelper.getLemmatizer().lemmatize(words[k], tags[k]);
    }

    return new WSDSample(words, tags, lemmas, index, senseIDs);
  }

  /**
   * Finds the first element among the given nodes; inside a context this is
   * the element wrapping the target word.
   *
   * @param nodes
   *          the nodes to scan
   * @return the position of the first element node, or {@code -1} if there is
   *         none
   */
  private static int firstElementIndex(NodeList nodes) {
    for (int i = 0; i < nodes.getLength(); i++) {
      if (nodes.item(i).getNodeType() == Node.ELEMENT_NODE) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Concatenates the text content of the nodes in {@code [from, to)},
   * leaving out comments and processing instructions.
   *
   * @param nodes
   *          the nodes to read
   * @param from
   *          the first position, inclusive
   * @param to
   *          the last position, exclusive
   * @return the concatenated text
   */
  private static String textOf(NodeList nodes, int from, int to) {
    StringBuilder text = new StringBuilder();
    for (int i = from; i < to; i++) {
      Node node = nodes.item(i);
      short type = node.getNodeType();
      if (type != Node.COMMENT_NODE
          && type != Node.PROCESSING_INSTRUCTION_NODE) {
        text.append(node.getTextContent());
      }
    }
    return text.toString();
  }

  /**
   * Splits the text on single whitespace characters and appends every
   * non-empty piece to the given list.
   *
   * @param tokens
   *          the list receiving the tokens
   * @param text
   *          the text to split
   */
  private static void addTokens(ArrayList<String> tokens, String text) {
    for (String token : text.split("\\s")) {
      if (!token.isEmpty()) {
        tokens.add(token);
      }
    }
  }
}