blob: a38ba8236bf732abd4dce43467f17e13dbb1277b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator.ims;
import opennlp.tools.ml.maxent.GIS;
import opennlp.tools.ml.maxent.io.GISModelReader;
import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter;
import opennlp.tools.ml.model.AbstractModel;
import opennlp.tools.ml.model.AbstractModelWriter;
import opennlp.tools.ml.model.DataIndexer;
import opennlp.tools.ml.model.DataReader;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.OnePassDataIndexer;
import opennlp.tools.ml.model.PlainTextFileDataReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.zip.GZIPInputStream;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.disambiguator.FeaturesExtractor;
import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDisambiguator;
import opennlp.tools.disambiguator.WordToDisambiguate;
import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.mfs.MFS;
/**
* Implementation of the <b>It Makes Sense</b> approach originally proposed in
* Senseval-3. The approach relies on the extraction of textual and
* PoS-tag-based features from the sentences surrounding the word to
* disambiguate. 3 main families of features are extracted:
* <ul>
* <li>PoS-tags of the surrounding words</li>
* <li>Local collocations</li>
* <li>Surrounding words</li>
* </ul>
* See <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf</a>
* for details about this approach.
*/
public class IMS implements WSDisambiguator {
public IMSParameters parameters;
private final IMSContextGenerator cg;
private FeaturesExtractor fExtractor = new FeaturesExtractor();
/**
* Sets the input parameters to the default ones
*
* @throws InvalidParameterException
*/
public IMS() {
super();
// Loader loader = new Loader();
this.parameters = new IMSParameters();
this.cg = parameters.createContextGenerator();
}
/**
 * Creates an IMS disambiguator that uses the given parameters and the
 * context generator those parameters create.
 * <p>
 * No validity check is performed here. A {@code null} argument selects the
 * default parameters, which is the same convention the {@code setParams}
 * methods of this class follow.
 *
 * @param parameters
 *          the parameters to be used, or {@code null} for the defaults
 */
public IMS(IMSParameters parameters) {
  // Without this fallback a null argument would fail with a
  // NullPointerException when the context generator is created below.
  this.parameters = (parameters != null) ? parameters : new IMSParameters();
  this.cg = this.parameters.createContextGenerator();
}
/**
 * Gives access to the parameter settings currently held by this IMS object,
 * typed as the generic {@link WSDParameters}.
 *
 * @return the current parameter settings
 */
@Override
public WSDParameters getParams() {
  return parameters;
}
/**
 * Gives access to the parameter settings currently held by this IMS object,
 * typed as {@link IMSParameters}.
 *
 * @return the current parameter settings
 */
public IMSParameters getParameters() {
  return parameters;
}
/**
 * Replaces the parameter settings of this IMS object.
 * <ul>
 * <li>{@code null} selects the default {@link IMSParameters}.</li>
 * <li>A valid {@link IMSParameters} instance is stored as given.</li>
 * <li>Anything else (an invalid instance, or a {@link WSDParameters} that is
 * not an {@link IMSParameters}) resets the settings to the defaults and is
 * reported with an {@link InvalidParameterException}.</li>
 * </ul>
 * Note that the context generator field is final and was created by the
 * constructor, so it is not refreshed by this method.
 *
 * @param parameters
 *          the new parameter settings, or {@code null} for the defaults
 * @throws InvalidParameterException
 *           if the given parameters are not valid IMS parameters
 */
@Override
public void setParams(WSDParameters parameters)
    throws InvalidParameterException {
  if (parameters == null) {
    this.parameters = new IMSParameters();
    return;
  }
  // Check the runtime type before casting so that a foreign WSDParameters
  // implementation is rejected with the documented exception instead of a
  // ClassCastException.
  if (parameters instanceof IMSParameters && parameters.isValid()) {
    this.parameters = (IMSParameters) parameters;
    return;
  }
  this.parameters = new IMSParameters();
  throw new InvalidParameterException("wrong parameters");
}
/**
 * Replaces the parameter settings of this IMS object. A {@code null}
 * argument selects the default parameters; a valid argument is stored as
 * given; an invalid argument resets the settings to the defaults and is
 * then reported with an exception.
 *
 * @param parameters
 *          the new parameter settings, or {@code null} for the defaults
 * @throws InvalidParameterException
 *           if the given parameters are not valid
 */
public void setParams(IMSParameters parameters)
    throws InvalidParameterException {
  if (parameters != null && parameters.isValid()) {
    this.parameters = parameters;
    return;
  }
  // Either nothing was supplied or it was invalid: fall back to defaults.
  this.parameters = new IMSParameters();
  if (parameters != null) {
    throw new InvalidParameterException("wrong parameters");
  }
}
// Internal Methods
/**
 * Reads the surrounding-words list stored for a word tag. The list is read
 * from the file {@code <trainingDataDirectory><wordTag>.sw}, one word per
 * line; duplicate lines are kept only once, in order of first appearance.
 * <p>
 * I/O problems are printed to standard error and whatever was read up to
 * that point is returned.
 * <p>
 * NOTE(review): earlier versions of this method skipped the first line of
 * the file and appended a trailing {@code null}. If the serialized features
 * depend on the position of a word in this list, models trained with the old
 * behaviour may need to be retrained - confirm.
 *
 * @param wordTag
 *          the word tag whose {@code .sw} file should be read
 * @return the words found in the file, or an empty list if the file does not
 *         exist
 */
private ArrayList<String> getAllSurroundingWords(String wordTag) {
  ArrayList<String> surrWords = new ArrayList<String>();
  File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
  if (!file.exists()) {
    return surrWords;
  }
  BufferedReader br = null;
  try {
    br = new BufferedReader(new FileReader(file));
    // Examine every line right after it is read: the first line is no
    // longer dropped and the end-of-stream null is never added to the list.
    for (String line = br.readLine(); line != null; line = br.readLine()) {
      if (!surrWords.contains(line)) {
        surrWords.add(line);
      }
    }
  } catch (IOException e) {
    // FileNotFoundException is an IOException and is handled here as well.
    e.printStackTrace();
  } finally {
    if (br != null) {
      try {
        br.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  return surrWords;
}
/**
 * Extracts the surrounding words of the given training instances and writes
 * them, one per line, to {@code <trainingDataDirectory><wordTag>.sw},
 * replacing any previous content of that file.
 * <p>
 * Failures are reported on standard output / standard error and are not
 * propagated to the caller. "Done" is printed only after the list has been
 * written successfully.
 *
 * @param trainingInstances
 *          the instances handed to the features extractor
 * @param wordTag
 *          the word tag used to name the output file
 */
private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingInstances,
    String wordTag) {
  ArrayList<String> surrWords = fExtractor
      .extractTrainingSurroundingWords(trainingInstances);
  File file = new File(IMSParameters.trainingDataDirectory + wordTag + ".sw");
  BufferedWriter bw = null;
  try {
    // FileWriter creates the file when it is missing, so no separate
    // createNewFile() step is required.
    bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()));
    for (String surrWord : surrWords) {
      bw.write(surrWord);
      bw.newLine();
    }
    // Flush inside the try so that a failing write is reported before
    // success is announced.
    bw.flush();
    System.out.println("Done");
  } catch (IOException e) {
    System.out
        .println("Unable to create the List of Surrounding Words file !");
    e.printStackTrace();
  } finally {
    // Release the file handle even when writing failed half-way.
    if (bw != null) {
      try {
        bw.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/**
 * Runs IMS feature extraction on one word, using the window size and the
 * n-gram setting taken from the current parameters.
 *
 * @param word
 *          the instance handed to the features extractor
 */
private void extractFeature(WTDIMS word) {
  this.fExtractor.extractIMSFeatures(word, parameters.getWindowSize(),
      parameters.getNgram());
}
/**
 * Trains a maxent model for one word and stores it as
 * {@code <trainingDataDirectory><wordTag>.gz}.
 * <p>
 * The steps are: extract the features of every instance, save and re-read
 * the surrounding-words list for the word, serialize the features of every
 * instance against that list, build one training event per instance (the
 * outcome is the first sense ID of the instance) and train with GIS.
 * <p>
 * The number of GIS iterations is currently fixed at 200 and
 * {@code trainParams} is not consulted. An {@link IOException} raised while
 * indexing or persisting is printed and not propagated.
 *
 * @param wordTag
 *          the word to disambiguate. It should be written in the format
 *          "word.p" (e.g. "write.v", "well.r", "smart.a", "go.v")
 * @param trainParams
 *          the parameters used for training (currently unused; may be
 *          {@code null})
 * @param trainingInstances
 *          the training data in the format {@link WTDIMS}
 * @throws IllegalArgumentException
 *           if {@code trainingInstances} is {@code null} or empty
 */
public void train(String wordTag, TrainingParameters trainParams,
    ArrayList<WTDIMS> trainingInstances) {
  // With no instances there would be no event stream at all and the data
  // indexer would be handed null; fail early with a clear message instead.
  if (trainingInstances == null || trainingInstances.isEmpty()) {
    throw new IllegalArgumentException(
        "No training instances supplied for " + wordTag);
  }
  String wordTrainingbinFile = IMSParameters.trainingDataDirectory + wordTag
      + ".gz";
  for (WTDIMS wtd : trainingInstances) {
    extractFeature(wtd);
  }
  // The surrounding-words list is written to disk and read back, so that
  // training sees the list in the same form disambiguation reads later.
  saveAllSurroundingWords(trainingInstances, wordTag);
  ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
  for (WTDIMS wtd : trainingInstances) {
    fExtractor.serializeIMSFeatures(wtd, surrWords);
  }
  ArrayList<Event> events = new ArrayList<Event>(trainingInstances.size());
  for (WTDIMS wtd : trainingInstances) {
    String sense = wtd.getSenseIDs().get(0);
    String[] context = cg.getContext(wtd);
    events.add(new Event(sense + "", context));
  }
  // Create the stream once, after all events have been collected.
  ObjectStream<Event> eventStream = ObjectStreamUtils
      .createObjectStream(events);
  try {
    DataIndexer indexer = new OnePassDataIndexer(eventStream);
    MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
    File outFile = new File(wordTrainingbinFile);
    AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
        (AbstractModel) trainedMaxentModel, outFile);
    writer.persist();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
/**
 * Loads an existing model from a gzip-compressed plain-text model file.
 * <p>
 * I/O problems (including a missing file) are printed to standard error and
 * result in a {@code null} return value. The underlying streams are closed
 * before the method returns, whether or not loading succeeded.
 *
 * @param trainedModel
 *          name of the file of the already trained model
 * @return the loaded model, or {@code null} if it could not be read
 */
public MaxentModel load(String trainedModel) {
  MaxentModel loadedMaxentModel = null;
  // Always refers to the outermost stream opened so far, so a single
  // close() in the finally block releases the whole chain.
  InputStream toClose = null;
  try {
    FileInputStream inputStream = new FileInputStream(trainedModel);
    toClose = inputStream;
    InputStream decodedInputStream = new GZIPInputStream(inputStream);
    toClose = decodedInputStream;
    DataReader modelReader = new PlainTextFileDataReader(decodedInputStream);
    loadedMaxentModel = new GISModelReader(modelReader).getModel();
  } catch (IOException e) {
    // FileNotFoundException is an IOException and is handled here as well.
    e.printStackTrace();
  } finally {
    if (toClose != null) {
      try {
        toClose.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  return loadedMaxentModel;
}
/**
 * The disambiguation method for a single word.
 * <p>
 * If a trained model file {@code <trainingDataDirectory><wordTag>.gz}
 * exists it is used directly. Otherwise training data is fetched from the
 * source selected in the parameters, a model is trained and then used. When
 * no outcome can be obtained (no training data, or the model file cannot be
 * loaded) the result of {@code MFS.getMostFrequentSense} is returned.
 *
 * @param inputText
 *          the text containing the word to disambiguate
 * @param inputWordIndex
 *          the index of the word to disambiguate
 * @return a one-element array of the form
 *         {@code "<source name> <word>%<outcome>"}, or the most frequent
 *         sense fallback described above
 */
@Override
public String[] disambiguate(String[] inputText, int inputWordIndex) {
  String trainingDataDirectory = IMSParameters.trainingDataDirectory;
  File directory = new File(trainingDataDirectory);
  if (!directory.exists()) {
    directory.mkdirs();
  }
  WTDIMS word = new WTDIMS(inputText, inputWordIndex);
  extractFeature(word);
  String wordTag = word.getWordTag();
  String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
  File modelFile = new File(wordTrainingbinFile);
  String outcome = "";
  if (modelFile.exists() && !modelFile.isDirectory()) {
    // If the trained model exists
    outcome = predictSense(word, wordTag, wordTrainingbinFile);
  } else {
    // Depending on the source, go fetch the training data
    ArrayList<WTDIMS> trainingInstances = fetchTrainingInstances(wordTag);
    if (!trainingInstances.isEmpty()) {
      train(wordTag, null, trainingInstances);
      outcome = predictSense(word, wordTag, wordTrainingbinFile);
    }
  }
  if (outcome.equals("")) {
    // if no training data exist, or no model could be loaded
    return MFS.getMostFrequentSense(word);
  }
  outcome = parameters.source.name() + " " + wordTag.split("\\.")[0] + "%"
      + outcome;
  return new String[] { outcome };
}

/**
 * Serializes the features of {@code word} against the stored
 * surrounding-words list, loads the model file and returns its best outcome.
 * Returns the empty string when the model cannot be loaded, because
 * {@code load} reports I/O failures by returning {@code null}.
 */
private String predictSense(WTDIMS word, String wordTag, String modelFile) {
  ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
  fExtractor.serializeIMSFeatures(word, surrWords);
  MaxentModel loadedMaxentModel = load(modelFile);
  if (loadedMaxentModel == null) {
    return "";
  }
  String[] context = cg.getContext(word);
  double[] outcomeProbs = loadedMaxentModel.eval(context);
  return loadedMaxentModel.getBestOutcome(outcomeProbs);
}

/**
 * Collects the training instances for {@code wordTag} from the data source
 * selected in the parameters; the list is empty when the source provides
 * nothing or is not supported yet.
 */
private ArrayList<WTDIMS> fetchTrainingInstances(String wordTag) {
  ArrayList<WTDIMS> trainingInstances = new ArrayList<WTDIMS>();
  // NOTE(review): train() extracts the features of every instance again;
  // the extractFeature calls below look redundant - confirm that feature
  // extraction is idempotent before removing them.
  switch (this.parameters.getSource().code) {
  case 1: {
    SemcorReaderExtended sReader = new SemcorReaderExtended();
    for (WordToDisambiguate ti : sReader.getSemcorData(wordTag)) {
      WTDIMS imsIT = new WTDIMS(ti);
      extractFeature(imsIT);
      trainingInstances.add(imsIT);
    }
    break;
  }
  case 2: {
    SensevalReader sReader = new SensevalReader();
    for (WordToDisambiguate ti : sReader.getSensevalData(wordTag)) {
      // NOTE(review): unlike the Semcor branch this casts instead of
      // wrapping - presumably the Senseval reader already yields WTDIMS
      // objects; verify against SensevalReader.
      WTDIMS imsIT = (WTDIMS) ti;
      extractFeature(imsIT);
      trainingInstances.add(imsIT);
    }
    break;
  }
  case 3: {
    // TODO check the case when the user selects his own data set (make an
    // interface to collect training data)
    break;
  }
  }
  return trainingInstances;
}
/**
 * Not implemented yet: all arguments are ignored and {@code null} is always
 * returned.
 *
 * @return always {@code null}
 */
@Override
public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
int ambiguousTokenIndex, String ambiguousTokenLemma) {
// TODO Update: implement this overload; callers currently receive null
return null;
}
/**
 * Not implemented yet: all arguments are ignored and {@code null} is always
 * returned.
 *
 * @return always {@code null}
 */
@Override
public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags,
Span ambiguousTokenIndexSpan, String ambiguousTokenLemma) {
// TODO Update: implement this overload; callers currently receive null
return null;
}
/**
 * Not implemented yet: the sample is ignored and {@code null} is always
 * returned.
 *
 * @return always {@code null}
 */
@Override
public String[] disambiguate(WSDSample sample) {
// TODO Update: implement this overload; callers currently receive null
return null;
}
}