blob: 3cd27808977b2cd4b2b61e896d6870120793ea0c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import opennlp.tools.disambiguator.DictionaryInstance;
import opennlp.tools.disambiguator.ims.WTDIMS;
/**
 * Extracts the data used by the disambiguator from its different input files:
 * dictionary instances (XML), groups of equivalent senses (plain text),
 * training/test instances (XML) and a list of English words (tab-separated
 * text).
 * <p>
 * All readers are best-effort: when a file cannot be read or parsed, the stack
 * trace is printed and whatever was collected so far is returned.
 */
public class DataExtractor {

  /**
   * Default location of the lemmatizer dictionary, relative to the working
   * directory. Forward slashes are used on purpose: {@link File} accepts them
   * on every platform, whereas backslash separators only resolve on Windows.
   */
  private static final String englishDict = "src/test/resources/models/en-lemmatizer.dict";

  /**
   * Constructor
   */
  public DataExtractor() {
    super();
  }

  /**
   * Parses an XML file into a normalized DOM document.
   *
   * @param location
   *          : path of the XML file
   * @return the parsed document
   * @throws Exception
   *           if the parser cannot be created or the file cannot be read or
   *           parsed; every caller in this class already catches
   *           {@link Exception}
   */
  private Document parseXml(String location) throws Exception {
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(new File(location));
    doc.getDocumentElement().normalize();
    return doc;
  }

  /**
   * Reads every sense listed in the dictionary file. Each element child of a
   * "lexelt" element becomes one {@link DictionaryInstance}; the "item"
   * attribute of the enclosing "lexelt" is used as its word.
   *
   * @param xmlLocation
   *          : location of the file containing the dictionary instances
   * @return the dictionary instances in document order (possibly empty)
   */
  private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {
    ArrayList<DictionaryInstance> dictionary = new ArrayList<DictionaryInstance>();
    try {
      Document doc = parseXml(xmlLocation);
      NodeList nLexelts = doc.getElementsByTagName("lexelt");
      // running counter used as the instance index instead of the "id"
      // attribute found in the file
      int index = 0;
      for (int i = 0; i < nLexelts.getLength(); i++) {
        Node nLexelt = nLexelts.item(i);
        if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }
        Element eLexelt = (Element) nLexelt;
        String word = eLexelt.getAttribute("item");
        NodeList nSenses = eLexelt.getChildNodes();
        for (int j = 0; j < nSenses.getLength(); j++) {
          if (nSenses.item(j).getNodeType() == Node.ELEMENT_NODE) {
            Element eSense = (Element) nSenses.item(j);
            String id = eSense.getAttribute("id");
            String source = eSense.getAttribute("source");
            String[] synset = eSense.getAttribute("synset").split("\\s");
            String gloss = eSense.getAttribute("gloss");
            dictionary.add(new DictionaryInstance(index, word, id, source,
                synset, gloss));
            index++;
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return dictionary;
  }

  /**
   * Reads the groups of equivalent senses. Every line of the file is one
   * group: it is split on whitespace and the tokens longer than one character
   * are kept as sense IDs. The key of a group is its 0-based line number.
   *
   * @param sensemapFile
   *          : location of the file containing the equivalent senses
   * @return the sense IDs of each group, keyed by line number
   */
  private HashMap<Integer, ArrayList<String>> getEquivalentSense(
      String sensemapFile) {
    HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();
    try (BufferedReader wordsList = new BufferedReader(new FileReader(
        sensemapFile))) {
      int index = 0;
      String line;
      while ((line = wordsList.readLine()) != null) {
        ArrayList<String> tempSenses = new ArrayList<String>();
        for (String sense : line.split("\\s")) {
          if (sense.length() > 1) {
            tempSenses.add(sense);
          }
        }
        mappedSenses.put(index, tempSenses);
        index++;
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return mappedSenses;
  }

  /**
   * Groups the dictionary instances by equivalent senses. A group is stored
   * under the key "word_n", where "word" is the word of the last sense of the
   * group found in the dictionary and "n" is the group's key in the sense map.
   * Groups none of whose senses appear in the dictionary are left out.
   *
   * @param xmlLocation
   *          : location of the file containing the dictionary instances
   * @param sensemapFile
   *          : location of the file containing the equivalent senses
   * @return the grouped dictionary instances
   */
  private HashMap<String, ArrayList<DictionaryInstance>> extractCoarseGrainedDictionary(
      String xmlLocation, String sensemapFile) {
    HashMap<String, ArrayList<DictionaryInstance>> optimizedDictionary = new HashMap<String, ArrayList<DictionaryInstance>>();
    HashMap<Integer, ArrayList<String>> equivalentSenses = getEquivalentSense(sensemapFile);
    ArrayList<DictionaryInstance> dictionary = extractDictionary(xmlLocation);

    // Index the dictionary once by sense ID so each lookup is a map access
    // instead of a scan. Only the first instance with a given ID is kept,
    // which is the one a linear search would find.
    HashMap<String, DictionaryInstance> firstById = new HashMap<String, DictionaryInstance>();
    for (DictionaryInstance instance : dictionary) {
      if (!firstById.containsKey(instance.getId())) {
        firstById.put(instance.getId(), instance);
      }
    }

    for (int mapKey : equivalentSenses.keySet()) {
      ArrayList<DictionaryInstance> group = new ArrayList<DictionaryInstance>();
      String word = "";
      for (String senseId : equivalentSenses.get(mapKey)) {
        DictionaryInstance match = firstById.get(senseId);
        if (match != null) {
          group.add(match);
          word = match.getWord() + "_" + mapKey;
        }
      }
      // A group without any known sense has no word to be filed under; adding
      // it would only create a meaningless "" entry holding an empty list.
      if (!group.isEmpty()) {
        optimizedDictionary.put(word, group);
      }
    }
    return optimizedDictionary;
  }

  /**
   * Extract the different senses (those which are equivalent are put together)
   * of a word
   *
   * @param xmlLocation
   *          : location of the file containing the dictionary instances
   * @param sensemapFile
   *          : location of the file containing the equivalent senses in the
   *          case of Coarse-grained disambiguation
   * @param wordTag
   *          : the word to disambiguate. It should be written in the format
   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v"). It is
   *          matched as a prefix of the dictionary keys.
   * @return a {@link HashMap} of {@link DictionaryInstance} groups, keyed by
   *         wordTag followed by "_" and a counter starting at 0; the counter
   *         follows the iteration order of the underlying map
   */
  public HashMap<String, ArrayList<DictionaryInstance>> extractWordSenses(
      String xmlLocation, String sensemapFile, String wordTag) {
    HashMap<String, ArrayList<DictionaryInstance>> wordSenses = new HashMap<String, ArrayList<DictionaryInstance>>();
    HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary = extractCoarseGrainedDictionary(
        xmlLocation, sensemapFile);
    int i = 0;
    for (String key : optimalDictionary.keySet()) {
      if (key.startsWith(wordTag)) {
        String newKey = wordTag + "_" + i;
        wordSenses.put(newKey, optimalDictionary.get(key));
        i++;
      }
    }
    return wordSenses;
  }

  /**
   * Extract the different senses. This method returns only the ID of the sense
   * and the gloss. The synsets and other information are omitted.
   *
   * @param xmlLocation
   *          : location of the file containing the dictionary instances
   * @param sensemapFile
   *          : location of the file containing the equivalent senses in the
   *          case of Coarse-grained disambiguation
   * @param wordTag
   *          : the word to disambiguate. It should be written in the format
   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
   * @return a {@link HashMap} of word senses with their IDs; the gloss of a
   *         group is the gloss of its first instance
   */
  public HashMap<String, String> getDictionaryInstance(String xmlLocation,
      String sensemapFile, String wordTag) {
    HashMap<String, ArrayList<DictionaryInstance>> dict = extractWordSenses(
        xmlLocation, sensemapFile, wordTag);
    HashMap<String, String> senses = new HashMap<String, String>();
    for (String key : dict.keySet()) {
      ArrayList<DictionaryInstance> group = dict.get(key);
      // an empty group has no gloss to offer
      if (!group.isEmpty()) {
        senses.put(key, group.get(0).getGloss());
      }
    }
    return senses;
  }

  /**
   * Extract the training instances from the training/test set File. Every
   * element child of a "lexelt" element yields one instance.
   *
   * @param xmlDataSet
   *          : the file from which the data are to be extracted
   * @return {@link ArrayList} of Word To Disambiguate (WTDIMS) instances
   */
  public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {
    ArrayList<WTDIMS> setInstances = new ArrayList<WTDIMS>();
    try {
      Document doc = parseXml(xmlDataSet);
      NodeList lexelts = doc.getElementsByTagName("lexelt");
      for (int i = 0; i < lexelts.getLength(); i++) {
        Node nLexelt = lexelts.item(i);
        if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }
        // "item" has the form "word.pos"; only the word part is used
        String item = ((Element) nLexelt).getAttribute("item");
        String[] wordPos = item.split("\\.");
        String word = wordPos.length > 0 ? wordPos[0] : item;

        // Start at 0: the node-type test already skips whitespace text nodes,
        // and starting at 1 would drop the first instance of a file written
        // without whitespace between elements.
        NodeList nInstances = nLexelt.getChildNodes();
        for (int j = 0; j < nInstances.getLength(); j++) {
          Node nInstance = nInstances.item(j);
          if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
            setInstances.add(readInstance(word, nInstance));
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return setInstances;
  }

  /**
   * Builds one instance from an instance node: the sense IDs of its "answer"
   * children, the text of its "context" child and the marked word inside that
   * context.
   */
  private WTDIMS readInstance(String word, Node nInstance) {
    ArrayList<String> answers = new ArrayList<String>();
    String sentence = "";
    String rawWord = "";
    NodeList nChildren = nInstance.getChildNodes();
    for (int k = 0; k < nChildren.getLength(); k++) {
      Node nChild = nChildren.item(k);
      if (nChild.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      String name = nChild.getNodeName();
      if (name.equals("answer")) {
        String senseid = readSenseId((Element) nChild);
        if (senseid != null) {
          answers.add(senseid);
        }
      } else if (name.equals("context")) {
        sentence = nChild.getTextContent();
        String marked = readMarkedWord(nChild);
        if (marked != null) {
          rawWord = marked;
        }
      }
    }
    return new WTDIMS(word, answers, sentence, rawWord);
  }

  /**
   * Returns the sense ID of an "answer" element. The attribute is looked up by
   * its name, "senseid"; if it is missing, the second attribute is used as a
   * positional fallback. Returns null when neither is available.
   */
  private String readSenseId(Element eAnswer) {
    if (eAnswer.hasAttribute("senseid")) {
      return eAnswer.getAttribute("senseid");
    }
    if (eAnswer.getAttributes().getLength() > 1) {
      return eAnswer.getAttributes().item(1).getTextContent();
    }
    return null;
  }

  /**
   * Returns the text of the word marked inside a context node, i.e. the text
   * of its first element child. This stays correct when the context starts
   * with the marked word (no text node before it). If the context has no
   * element child, its second child is used when present; otherwise null is
   * returned.
   */
  private String readMarkedWord(Node nContext) {
    NodeList parts = nContext.getChildNodes();
    for (int p = 0; p < parts.getLength(); p++) {
      if (parts.item(p).getNodeType() == Node.ELEMENT_NODE) {
        return parts.item(p).getTextContent();
      }
    }
    return parts.getLength() > 1 ? parts.item(1).getTextContent() : null;
  }

  /**
   * Extract the list of ALL English words. The word is the first tab-separated
   * field of every line of the dictionary file; empty words are ignored.
   *
   * @param dict
   *          : this file is the same that is used in the simple lemmatizer
   *          (i.e.,"en-lemmatizer.dict"). When it is null or does not name an
   *          existing file, the default dictionary location is used instead.
   *
   * @return a map whose keys are all the English words (the values are always
   *         null); empty if no dictionary file could be read
   */
  public HashMap<String, Object> getEnglishWords(String dict) {
    HashMap<String, Object> words = new HashMap<String, Object>();

    File file = (dict == null) ? null : new File(dict);
    if (file == null || !file.isFile()) {
      file = new File(englishDict);
    }
    if (!file.exists()) {
      return words;
    }

    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
      String line;
      // every line is processed, including the first one
      while ((line = br.readLine()) != null) {
        int tab = line.indexOf('\t');
        String word = (tab < 0) ? line : line.substring(0, tab);
        if (!word.isEmpty()) {
          words.put(word, null);
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return words;
  }
}