/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package opennlp.tools.disambiguator; | |
import static org.junit.Assert.assertEquals; | |
import static org.junit.Assert.assertNotNull; | |
import static org.junit.Assert.fail; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.junit.BeforeClass; | |
import org.junit.Test; | |
import opennlp.tools.disambiguator.datareader.SemcorReaderExtended; | |
import opennlp.tools.disambiguator.oscc.OSCCFactory; | |
import opennlp.tools.disambiguator.oscc.OSCCME; | |
import opennlp.tools.disambiguator.oscc.OSCCModel; | |
import opennlp.tools.disambiguator.oscc.OSCCParameters; | |
import opennlp.tools.util.ObjectStream; | |
import opennlp.tools.util.Span; | |
import opennlp.tools.util.TrainingParameters; | |
/**
 * This is the test class for {@link OSCCME}.
 *
 * The scope of this test is to make sure that the OSCC disambiguator code can
 * be executed. This test cannot detect mistakes which lead to incorrect
 * feature generation or other mistakes which decrease the disambiguation
 * performance of the disambiguator.
 *
 * In this test the {@link OSCCME} is trained with Semcor and then the computed
 * model is used to predict sentences from the training sentences.
 */
public class OSCCMETester { | |
// TODO write more tests | |
// TODO modify when we fix the parameter model | |
static String modelsDir = "src\\test\\resources\\models\\"; | |
static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\"; | |
static OSCCParameters OSCCParams; | |
static OSCCME oscc; | |
static OSCCFactory osccFactory; | |
static OSCCModel model; | |
static String test = "please.v"; | |
static File outFile; | |
static String test1 = "We need to discuss an important topic, please write to me soon."; | |
static String test2 = "The component was highly radioactive to the point that" | |
+ " it has been activated the second it touched water"; | |
static String test3 = "The summer is almost over and I did not go to the beach even once"; | |
static String[] sentence1; | |
static String[] sentence2; | |
static String[] sentence3; | |
static String[] tags1; | |
static String[] tags2; | |
static String[] tags3; | |
static String[] lemmas1; | |
static String[] lemmas2; | |
static String[] lemmas3; | |
/* | |
* Setup the testing variables | |
*/ | |
@BeforeClass | |
public static void setUpAndTraining() { | |
WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); | |
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); | |
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); | |
sentence1 = WSDHelper.getTokenizer().tokenize(test1); | |
sentence2 = WSDHelper.getTokenizer().tokenize(test2); | |
sentence3 = WSDHelper.getTokenizer().tokenize(test3); | |
tags1 = WSDHelper.getTagger().tag(sentence1); | |
tags2 = WSDHelper.getTagger().tag(sentence2); | |
tags3 = WSDHelper.getTagger().tag(sentence3); | |
List<String> tempLemmas1 = new ArrayList<String>(); | |
for (int i = 0; i < sentence1.length; i++) { | |
tempLemmas1 | |
.add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i])); | |
} | |
lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); | |
List<String> tempLemmas2 = new ArrayList<String>(); | |
for (int i = 0; i < sentence2.length; i++) { | |
tempLemmas2 | |
.add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i])); | |
} | |
lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); | |
List<String> tempLemmas3 = new ArrayList<String>(); | |
for (int i = 0; i < sentence3.length; i++) { | |
tempLemmas3 | |
.add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i])); | |
} | |
lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); | |
OSCCParams = new OSCCParameters(""); | |
OSCCParams.setTrainingDataDirectory(trainingDataDirectory); | |
osccFactory = new OSCCFactory(); | |
TrainingParameters trainingParams = new TrainingParameters(); | |
SemcorReaderExtended sr = new SemcorReaderExtended(); | |
ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test); | |
OSCCModel writeModel = null; | |
/* | |
* Tests training the disambiguator We test both writing and reading a model | |
* file trained by semcor | |
*/ | |
try { | |
writeModel = OSCCME.train("en", sampleStream, trainingParams, OSCCParams, | |
osccFactory); | |
assertNotNull("Checking the model to be written", writeModel); | |
writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + test); | |
outFile = new File( | |
OSCCParams.getTrainingDataDirectory() + test + ".oscc.model"); | |
model = new OSCCModel(outFile); | |
assertNotNull("Checking the read model", model); | |
oscc = new OSCCME(model, OSCCParams); | |
assertNotNull("Checking the disambiguator", oscc); | |
} catch (IOException e1) { | |
e1.printStackTrace(); | |
fail("Exception in training"); | |
} | |
} | |
/* | |
* Tests disambiguating only one word : The ambiguous word "please" | |
*/ | |
@Test | |
public void testOneWordDisambiguation() { | |
String[] senses = oscc.disambiguate(sentence1, tags1, lemmas1, 8); | |
assertEquals("Check number of senses", 1, senses.length); | |
} | |
/* | |
* Tests disambiguating a word Span In this case we test a mix of monosemous | |
* and polysemous words as well as words that do not need disambiguation such | |
* as determiners | |
*/ | |
@Test | |
public void testWordSpanDisambiguation() { | |
Span span = new Span(3, 7); | |
List<String[]> senses = oscc.disambiguate(sentence2, tags2, lemmas2, span); | |
assertEquals("Check number of returned words", 5, senses.size()); | |
assertEquals("Check number of senses", 1, senses.get(0).length); | |
assertEquals("Check monosemous word", 1, senses.get(1).length); | |
assertEquals("Check preposition", "WSDHELPER to", senses.get(2)[0]); | |
assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3)[0]); | |
} | |
/* | |
* Tests disambiguating all the words | |
*/ | |
@Test | |
public void testAllWordsDisambiguation() { | |
List<String[]> senses = oscc.disambiguate(sentence3, tags3, lemmas3); | |
assertEquals("Check number of returned words", 15, senses.size()); | |
assertEquals("Check preposition", "WSDHELPER personal pronoun", | |
senses.get(6)[0]); | |
} | |
} |