blob: d6f55a6894cbe0b959f622f08a0a38a05fb5b745 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
import opennlp.tools.disambiguator.oscc.OSCCFactory;
import opennlp.tools.disambiguator.oscc.OSCCME;
import opennlp.tools.disambiguator.oscc.OSCCModel;
import opennlp.tools.disambiguator.oscc.OSCCParameters;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
/**
* This is the test class for {@link OSCCME}.
*
* The scope of this test is to make sure that the OSCC disambiguator code can
* be executed. This test can not detect mistakes which lead to incorrect
* feature generation or other mistakes which decrease the disambiguation
* performance of the disambiguator.
*
* In this test the {@link OSCCME} is trained with Semcor and then the computed
* model is used to predict sentences from the training sentences.
*/
public class OSCCMETester {
  // TODO write more tests
  // TODO modify when we fix the parameter model

  // Paths use '/' rather than '\\': java.io accepts forward slashes on every
  // platform, whereas a backslash is an ordinary file-name character outside
  // of Windows and would make these resources impossible to locate there.
  static String modelsDir = "src/test/resources/models/";
  static String trainingDataDirectory = "src/test/resources/supervised/models/";

  static OSCCParameters OSCCParams;
  static OSCCME oscc;
  static OSCCFactory osccFactory;
  static OSCCModel model;

  // Key handed to the Semcor reader; also used as the base name of the model
  // file written and re-read during setup.
  static String test = "please.v";
  static File outFile;

  static String test1 = "We need to discuss an important topic, please write to me soon.";
  static String test2 = "The component was highly radioactive to the point that"
      + " it has been activated the second it touched water";
  static String test3 = "The summer is almost over and I did not go to the beach even once";

  // Tokens, POS tags and lemmas of test1..test3, filled in by setUpAndTraining().
  static String[] sentence1;
  static String[] sentence2;
  static String[] sentence3;
  static String[] tags1;
  static String[] tags2;
  static String[] tags3;
  static String[] lemmas1;
  static String[] lemmas2;
  static String[] lemmas3;

  /**
   * Sets up the testing variables and trains the disambiguator.
   *
   * Loads the tokenizer, lemmatizer and tagger through {@link WSDHelper},
   * pre-processes the three test sentences, trains a model from the Semcor
   * sample stream, writes it to the training data directory and reads it back
   * to build the {@link OSCCME} instance shared by all tests.
   */
  @BeforeClass
  public static void setUpAndTraining() {
    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

    sentence1 = WSDHelper.getTokenizer().tokenize(test1);
    sentence2 = WSDHelper.getTokenizer().tokenize(test2);
    sentence3 = WSDHelper.getTokenizer().tokenize(test3);

    tags1 = WSDHelper.getTagger().tag(sentence1);
    tags2 = WSDHelper.getTagger().tag(sentence2);
    tags3 = WSDHelper.getTagger().tag(sentence3);

    lemmas1 = lemmatize(sentence1, tags1);
    lemmas2 = lemmatize(sentence2, tags2);
    lemmas3 = lemmatize(sentence3, tags3);

    OSCCParams = new OSCCParameters("");
    OSCCParams.setTrainingDataDirectory(trainingDataDirectory);
    osccFactory = new OSCCFactory();
    TrainingParameters trainingParams = new TrainingParameters();
    SemcorReaderExtended sr = new SemcorReaderExtended();
    ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);

    OSCCModel writeModel = null;
    /*
     * Tests training the disambiguator. We test both writing and reading a
     * model file trained by Semcor.
     */
    try {
      writeModel = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
          osccFactory);
      assertNotNull("Checking the model to be written", writeModel);
      writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + test);
      // The model is read back from "<training dir><test>.oscc.model".
      outFile = new File(
          OSCCParams.getTrainingDataDirectory() + test + ".oscc.model");
      model = new OSCCModel(outFile);
      assertNotNull("Checking the read model", model);
      oscc = new OSCCME(model, OSCCParams);
      assertNotNull("Checking the disambiguator", oscc);
    } catch (IOException e1) {
      // Keep the stack trace for diagnosis; fail() only carries a message.
      e1.printStackTrace();
      fail("Exception in training: " + e1.getMessage());
    }
  }

  /**
   * Lemmatizes every token with its POS tag using the lemmatizer loaded in
   * {@link WSDHelper}.
   *
   * @param tokens the tokens of one sentence
   * @param tags the POS tags, one per token and in the same order
   * @return the lemmas, one per token and in the same order
   */
  private static String[] lemmatize(String[] tokens, String[] tags) {
    String[] lemmas = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      lemmas[i] = WSDHelper.getLemmatizer().lemmatize(tokens[i], tags[i]);
    }
    return lemmas;
  }

  /**
   * Tests disambiguating only one word: the ambiguous word "please".
   */
  @Test
  public void testOneWordDisambiguation() {
    // Index 8 is presumably the token "please" in test1; this depends on the
    // tokenizer output.
    String[] senses = oscc.disambiguate(sentence1, tags1, lemmas1, 8);
    assertEquals("Check number of senses", 1, senses.length);
  }

  /**
   * Tests disambiguating a word span. In this case we test a mix of monosemous
   * and polysemous words as well as words that do not need disambiguation,
   * such as determiners.
   */
  @Test
  public void testWordSpanDisambiguation() {
    Span span = new Span(3, 7);
    List<String[]> senses = oscc.disambiguate(sentence2, tags2, lemmas2, span);
    assertEquals("Check number of returned words", 5, senses.size());
    assertEquals("Check number of senses", 1, senses.get(0).length);
    assertEquals("Check monosemous word", 1, senses.get(1).length);
    // Words that need no disambiguation are expected to come back as a
    // "WSDHELPER <category>" placeholder string.
    assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]);
    assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]);
  }

  /**
   * Tests disambiguating all the words of a sentence.
   */
  @Test
  public void testAllWordsDisambiguation() {
    List<String[]> senses = oscc.disambiguate(sentence3, tags3, lemmas3);
    assertEquals("Check number of returned words", 15, senses.size());
    assertEquals("Check personal pronoun", "WSDHELPER personal pronoun",
        senses.get(6)[0]);
  }
}