/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package opennlp.tools.disambiguator.datareader; | |
import java.io.File; | |
import java.util.ArrayList; | |
import javax.xml.parsers.DocumentBuilder; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import opennlp.tools.disambiguator.WSDHelper; | |
import opennlp.tools.disambiguator.WSDSample; | |
import opennlp.tools.util.ObjectStream; | |
import opennlp.tools.util.ObjectStreamUtils; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Element; | |
import org.w3c.dom.Node; | |
import org.w3c.dom.NodeList; | |
/** | |
* This class reads Semcor data. | |
* | |
*/ | |
public class SemcorReaderExtended { | |
private static final String ELEMENT_CONTEXTFILE = "contextfile"; | |
private static final String ATTRIBUTE_CONCORDANCE = "concordance"; | |
private static final String ELEMENT_CONTEXT = "context"; | |
private static final String ATTRIBUTE_FILENAME = "filename"; | |
private static final String ATTRIBUTE_PARAS = "paras"; | |
private static final String ELEMENT_PARAGRAPH = "p"; | |
private static final String ATTRIBUTE_PARAGRAPHNUM = "pnum"; | |
private static final String ELEMENT_SENTENCE = "s"; | |
private static final String ATTRIBUTE_SENTENCENUM = "snum"; | |
private static final String ELEMENT_WORDFORM = "wf"; | |
private static final String ATTRIBUTE_CMD = "cmd"; | |
private static final String ATTRIBUTE_RDF = "rdf"; | |
private static final String ATTRIBUTE_POS = "pos"; | |
private static final String ATTRIBUTE_LEMMA = "lemma"; | |
private static final String ATTRIBUTE_WNSN = "wnsn"; | |
private static final String ATTRIBUTE_LEXSN = "lexsn"; | |
private static final String ELEMENT_PUNCTUATION = "punc"; | |
private static String semcorDirectory = "src/test/resources/semcor3.0/"; | |
private static String[] folders = { "brown1", "brown2", "brownv" }; | |
private static String tagfiles = "/tagfiles/"; | |
public static String getSemcorDirectory() { | |
return semcorDirectory; | |
} | |
public static void setSemcorDirectory(String semcorDirectory) { | |
SemcorReaderExtended.semcorDirectory = semcorDirectory; | |
} | |
public SemcorReaderExtended() { | |
super(); | |
} | |
/** | |
* This serves to read one Semcor XML file | |
*/ | |
private ArrayList<Sentence> readFile(String file) { | |
ArrayList<Sentence> result = new ArrayList<Sentence>(); | |
try { | |
File xmlFile = new File(file); | |
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); | |
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); | |
Document doc = dBuilder.parse(xmlFile); | |
doc.getDocumentElement().normalize(); | |
NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH); | |
for (int i = 0; i < paragraphs.getLength(); i++) { | |
Node nParagraph = paragraphs.item(i); | |
if (nParagraph.getNodeType() == Node.ELEMENT_NODE) { | |
Element eParagraph = (Element) nParagraph; | |
// THE PARAGRAPH ID | |
int paragraphID = Integer.parseInt(eParagraph | |
.getAttribute(ATTRIBUTE_PARAGRAPHNUM)); | |
NodeList nSentences = nParagraph.getChildNodes(); | |
for (int j = 1; j < nSentences.getLength(); j++) { | |
Node nSentence = nSentences.item(j); | |
if (nSentence.getNodeType() == Node.ELEMENT_NODE) { | |
Element eSentence = (Element) nSentence; | |
// THE SENTENCE ID | |
int sentenceID = Integer.parseInt(eSentence | |
.getAttribute(ATTRIBUTE_SENTENCENUM)); | |
Sentence isentence = new Sentence(paragraphID, sentenceID); | |
NodeList nWords = nSentence.getChildNodes(); | |
int wnum = 0; | |
for (int k = 0; k < nWords.getLength(); k++) { | |
Node nWord = nWords.item(k); | |
if (nWord.getNodeType() == Node.ELEMENT_NODE) { | |
if (nWord.getNodeName().equals(ELEMENT_WORDFORM)) { | |
Element eWord = (Element) nWord; | |
if (eWord.getAttribute(ATTRIBUTE_CMD).equals("done")) { | |
// if the word is already disambiguated | |
String word = eWord.getTextContent(); | |
String cmd = eWord.getAttribute(ATTRIBUTE_CMD); | |
String pos = eWord.getAttribute(ATTRIBUTE_POS); | |
String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA); | |
String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN); | |
String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN); | |
Word iword = new Word(paragraphID, sentenceID, wnum, | |
Word.Type.WORD, word, cmd, pos, lemma, wnsn, lexsn); | |
isentence.addIword(iword); | |
wnum++; | |
// System.out.println("*** " + iword.toString() + " ***"); | |
} else { | |
// if the word is not disambiguated | |
String word = eWord.getTextContent(); | |
String cmd = eWord.getAttribute(ATTRIBUTE_CMD); | |
String pos = eWord.getAttribute(ATTRIBUTE_POS); | |
Word iword = new Word(paragraphID, sentenceID, wnum, | |
Word.Type.WORD, word, cmd, pos); | |
isentence.addIword(iword); | |
wnum++; | |
} | |
} else if (nWord.getNodeName().equals(ELEMENT_PUNCTUATION)) { | |
Element eWord = (Element) nWord; | |
String word = eWord.getTextContent(); | |
Word iword = new Word(paragraphID, sentenceID, wnum, | |
Word.Type.PUNCTUATIONMARK, word); | |
isentence.addIword(iword); | |
wnum++; | |
} | |
} | |
} | |
result.add(isentence); | |
} | |
} | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
return result; | |
} | |
/** | |
* One Semcor folder reader: This reads all the files in one semcor folder, | |
* and return all the instances in the format {@link WSDSample} of a | |
* specific word | |
* | |
* @param file | |
* the name of the file to read | |
* @param wordTag | |
* The word, of which we are looking for the instances | |
* @return the list of the {@link WSDSample} instances | |
*/ | |
private ArrayList<WSDSample> getSemcorOneFileData(String file, String wordTag) { | |
ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>(); | |
try { | |
ArrayList<Sentence> isentences = readFile(file); | |
for (int j = 0; j < isentences.size(); j++) { | |
Sentence isentence = isentences.get(j); | |
ArrayList<Word> iwords = isentence.getIwords(); | |
for (int k = 0; k < iwords.size(); k++) { | |
Word iword = iwords.get(k); | |
if (iword.isInstanceOf(wordTag)) { | |
String sentence; | |
int index; | |
if (j == 0) { | |
// case of the first sentence, we consider the current sentence | |
// and the next two ones | |
sentence = isentences.get(j).toString() + " " | |
+ isentences.get(j + 1).toString() + " " | |
+ isentences.get(j + 2).toString(); | |
index = k; | |
} else if (j == isentences.size() - 1) { | |
// case of the last sentence, we consider the current sentence and | |
// the previous two ones | |
sentence = isentences.get(j - 2).toString() + " " | |
+ isentences.get(j - 1).toString() + " " | |
+ isentences.get(j).toString(); | |
index = isentences.get(j - 2).getIwords().size() | |
+ isentences.get(j - 1).getIwords().size() + k; | |
} else { | |
// case of a sentence in the middle, we consider the previous | |
// sentence + the current one + the next one | |
sentence = isentences.get(j - 1).toString() + " " | |
+ isentences.get(j).toString() + " " | |
+ isentences.get(j + 1).toString(); | |
index = isentences.get(j - 1).getIwords().size() + k; | |
} | |
ArrayList<String> senses = new ArrayList<String>(); | |
String sense = iword.getLexsn(); | |
if (sense != null) { | |
senses.add(sense); | |
} | |
if (!senses.isEmpty()) { | |
String[] words = sentence.split("\\s"); | |
String[] tags = WSDHelper.getTagger().tag(words); | |
String[] lemmas = new String[words.length]; | |
for (int i = 0; i < words.length; i++) { | |
lemmas[i] = WSDHelper.getLemmatizer().lemmatize(words[i], | |
tags[i]); | |
} | |
WSDSample wtd = new WSDSample(words, tags, lemmas, index, senses.toArray(new String[0])); | |
setInstances.add(wtd); | |
} | |
} | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
return setInstances; | |
} | |
/** | |
* One Semcor folder reader: This reads all the files in one semcor folder, | |
* and return all the instances in the format {@link WSDSample} of a | |
* specific word | |
* | |
* @param folder | |
* the name of the folder. Three folders exist in Semcor3.0, which | |
* are ["brown1", "brown2", "brownv"] | |
* @param wordTag | |
* The word, of which we are looking for the instances | |
* @return the list of the {@link WSDSample} instances | |
*/ | |
private ArrayList<WSDSample> getSemcorFolderData(String folder, String wordTag) { | |
ArrayList<WSDSample> result = new ArrayList<WSDSample>(); | |
String directory = semcorDirectory + folder + tagfiles; | |
File tempFolder = new File(directory); | |
File[] listOfFiles; | |
if (tempFolder.isDirectory()) { | |
listOfFiles = tempFolder.listFiles(); | |
for (File file : listOfFiles) { | |
ArrayList<WSDSample> list = getSemcorOneFileData( | |
directory + file.getName(), wordTag); | |
result.addAll(list); | |
} | |
} | |
return result; | |
} | |
/** | |
* Semcor reader: This reads all the files in semcor, and return all the | |
* instances in the format {@link WSDSample} of a specific word | |
* | |
* @param wordTag | |
* The word, of which we are looking for the instances | |
* @return the list of the {@link WSDSample} instances of the word to | |
* disambiguate | |
*/ | |
public ArrayList<WSDSample> getSemcorData(String wordTag) { | |
ArrayList<WSDSample> result = new ArrayList<WSDSample>(); | |
for (String folder : folders) { | |
ArrayList<WSDSample> list = getSemcorFolderData(folder, wordTag); | |
result.addAll(list); | |
} | |
return result; | |
} | |
/** | |
* Semcor reader: This reads all the files in semcor, and return all the | |
* instances in the format {@link WSDSample} of a specific word | |
* | |
* @param wordTag | |
* The word, of which we are looking for the instances | |
* @return the stream of {@link WSDSample} of the word to disambiguate | |
*/ | |
public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) { | |
return ObjectStreamUtils.createObjectStream(getSemcorData(wordTag)); | |
} | |
} |