blob: 80a0a74e2b9eaeece1ee7b59474c03165ae202a8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import opennlp.tools.HighMemoryUsage;
import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
import opennlp.tools.chunker.ChunkerEvaluator;
import opennlp.tools.chunker.ChunkerFactory;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
/**
 * Evaluates the chunker against the English CoNLL-2000 corpus.
 * <p>
 * Download the train and test gz files from the CoNLL-2000 shared task
 * <a href="https://www.clips.uantwerpen.be/conll2000/chunking/">site</a>
 * and decompress them into the directory {@code $OPENNLP_DATA_DIR/conll00}.
 */
public class Conll00ChunkerEval extends AbstractEvalTest {

  private static File TEST_DATA_FILE;
  private static File TRAIN_DATA_FILE;

  /**
   * Trains a chunker on a CoNLL-2000 formatted training file.
   *
   * @param trainFile the CoNLL-2000 training data file
   * @param params the training parameters to use
   * @return the trained {@link ChunkerModel}
   * @throws IOException if the training data cannot be read
   */
  private static ChunkerModel train(File trainFile, TrainingParameters params)
      throws IOException {
    // try-with-resources: the sample stream was previously never closed,
    // leaking the underlying file handle if training threw.
    try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
        new PlainTextByLineStream(
            new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8))) {
      return ChunkerME.train("eng", samples, params, new ChunkerFactory());
    }
  }

  /**
   * Evaluates the given model against a CoNLL-2000 formatted test file and
   * asserts the resulting F-measure matches the expected value.
   *
   * @param model the chunker model under evaluation
   * @param testData the CoNLL-2000 test data file
   * @param expectedFMeasure the expected F-measure (compared with 1e-4 tolerance)
   * @throws IOException if the test data cannot be read
   */
  private static void eval(ChunkerModel model, File testData,
      double expectedFMeasure) throws IOException {
    // Close the sample stream deterministically; see train(..) above for rationale.
    try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
        new PlainTextByLineStream(
            new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8))) {
      ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
      evaluator.evaluate(samples);
      Assert.assertEquals(expectedFMeasure,
          evaluator.getFMeasure().getFMeasure(), 0.0001);
    }
  }

  /**
   * Locates the corpus files and verifies the test data checksum before any
   * test runs, so a corrupt or wrong-version corpus fails fast.
   */
  @BeforeClass
  public static void verifyTrainingData() throws Exception {

    TEST_DATA_FILE = new File(getOpennlpDataDir(), "conll00/test.txt");
    TRAIN_DATA_FILE = new File(getOpennlpDataDir(), "conll00/train.txt");

    verifyTrainingData(new ChunkSampleStream(
        new PlainTextByLineStream(new MarkableFileInputStreamFactory(TEST_DATA_FILE),
            StandardCharsets.UTF_8)),
        new BigInteger("84610235226433393380477662908529306002"));

    // NOTE(review): the original code verified TEST_DATA_FILE twice with the
    // identical checksum — an apparent copy-paste slip. The redundant second
    // pass has been removed. TRAIN_DATA_FILE is still not checksum-verified;
    // TODO: add a verifyTrainingData call for it once its hash is known.
  }

  @Test
  public void evalEnglishPerceptron() throws IOException {
    // Local renamed from 'maxentModel': this test trains a perceptron model.
    ChunkerModel perceptronModel = train(TRAIN_DATA_FILE, createPerceptronParams());

    eval(perceptronModel, TEST_DATA_FILE, 0.9295018353434714d);
  }

  @Test
  public void evalEnglishMaxentGis() throws IOException {
    ChunkerModel maxentModel = train(TRAIN_DATA_FILE, ModelUtil.createDefaultTrainingParameters());

    eval(maxentModel, TEST_DATA_FILE, 0.9239687473746113d);
  }

  // Note: Don't try to run this on your MacBook
  @Test
  @Category(HighMemoryUsage.class)
  public void evalEnglishMaxentQn() throws IOException {
    TrainingParameters params = createMaxentQnParams();
    params.put("Threads", 4);
    ChunkerModel maxentModel = train(TRAIN_DATA_FILE, params);

    eval(maxentModel, TEST_DATA_FILE, 0.9302599230947028d);
  }
}