blob: 3ab1a7d33e8f5961dda73c7c4891b4c300f32d35 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.conllu.ConlluLemmaSampleStream;
import opennlp.tools.formats.conllu.ConlluStream;
import opennlp.tools.formats.conllu.ConlluTagset;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.lemmatizer.LemmatizerEvaluator;
import opennlp.tools.lemmatizer.LemmatizerFactory;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
/**
 * Evaluation test that trains a lemmatizer on the Spanish AnCora CoNLL-U
 * training file found under {@code ud20/UD_Spanish-AnCora} in the OpenNLP data
 * directory and compares the word accuracy measured on the matching dev file
 * against a fixed reference value.
 *
 * <p>The data files are not part of the source tree; they are resolved relative
 * to the directory returned by {@code getOpennlpDataDir()} (not defined in this
 * file; presumably inherited from {@link AbstractEvalTest}).
 */
public class UniversalDependency20Eval extends AbstractEvalTest {

  // Assigned once in verifyTrainingData(); they cannot be final because the data
  // directory is only resolved when the @BeforeClass hook runs.
  private static File SPA_ANCORA_TRAIN;
  private static File SPA_ANCORA_DEV;

  /**
   * Resolves the train and dev corpus files and passes each one, together with
   * its expected checksum, to {@code verifyFileChecksum}.
   *
   * <p>NOTE(review): {@code verifyFileChecksum} is not defined in this file; it is
   * presumably inherited from {@link AbstractEvalTest} and presumably fails when
   * the file content does not match the given checksum - confirm.
   *
   * @throws Exception if resolving the files or verifying a checksum fails
   */
  @BeforeClass
  public static void verifyTrainingData() throws Exception {
    SPA_ANCORA_TRAIN = new File(getOpennlpDataDir(),
        "ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
    SPA_ANCORA_DEV = new File(getOpennlpDataDir(),
        "ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");

    verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
        new BigInteger("224942804200733453179524127037951530195"));
    verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
        new BigInteger("280996187464384493180190898172297941708"));
  }

  /**
   * Trains a lemmatizer model from {@code trainFile} and evaluates it against
   * {@code evalFile}.
   *
   * <p>Both CoNLL-U sample streams are opened in try-with-resources blocks so
   * that they are closed even when training or evaluation throws.
   *
   * @param lang      language code handed to {@code LemmatizerME.train}
   * @param trainFile CoNLL-U file the training samples are read from
   * @param params    training parameters handed to {@code LemmatizerME.train}
   * @param evalFile  CoNLL-U file the evaluation samples are read from
   * @return the word accuracy reported by the evaluator after processing
   *         {@code evalFile}
   * @throws IOException if reading either file, training, or closing a stream fails
   */
  private double trainAndEval(String lang, File trainFile, TrainingParameters params,
                              File evalFile) throws IOException {
    // The same tagset is used for the training and the evaluation stream.
    ConlluTagset tagset = ConlluTagset.X;

    LemmatizerModel model;
    try (ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(
        new ConlluStream(new MarkableFileInputStreamFactory(trainFile)), tagset)) {
      model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
    }

    LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
    try (ObjectStream<LemmaSample> evalSamples = new ConlluLemmaSampleStream(
        new ConlluStream(new MarkableFileInputStreamFactory(evalFile)), tagset)) {
      evaluator.evaluate(evalSamples);
    }

    return evaluator.getWordAccuracy();
  }

  /**
   * Trains on the Spanish AnCora train file with the default training
   * parameters (plus a "Threads" setting of 4) and asserts that the word
   * accuracy on the dev file matches the recorded reference value within
   * {@code ACCURACY_DELTA} (a constant not defined in this file).
   *
   * @throws IOException if training or evaluation fails to read its data
   */
  @Test
  public void trainAndEvalSpanishAncora() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
    params.put("Threads", "4");

    double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
        params, SPA_ANCORA_DEV);

    Assert.assertEquals(0.9057341692068787d, wordAccuracy, ACCURACY_DELTA);
  }
}