opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.eval;

 import java.io.File;
 import java.io.IOException;
 import java.math.BigInteger;

 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;

 import opennlp.tools.formats.conllu.ConlluLemmaSampleStream;
 import opennlp.tools.formats.conllu.ConlluStream;
 import opennlp.tools.formats.conllu.ConlluTagset;
 import opennlp.tools.lemmatizer.LemmaSample;
 import opennlp.tools.lemmatizer.LemmatizerEvaluator;
 import opennlp.tools.lemmatizer.LemmatizerFactory;
 import opennlp.tools.lemmatizer.LemmatizerME;
 import opennlp.tools.lemmatizer.LemmatizerModel;
 import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelUtil;

 public class UniversalDependency20Eval extends AbstractEvalTest {

   private static File SPA_ANCORA_TRAIN;
   private static File SPA_ANCORA_DEV;

   @BeforeClass
   public static void verifyTrainingData() throws Exception {

     SPA_ANCORA_TRAIN = new File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
     SPA_ANCORA_DEV = new File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");

     verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
         new BigInteger("224942804200733453179524127037951530195"));
     verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
         new BigInteger("280996187464384493180190898172297941708"));
   }

   private double trainAndEval(String lang, File trainFile, TrainingParameters params,
                                      File evalFile) throws IOException {
     ConlluTagset tagset = ConlluTagset.X;

     ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(new ConlluStream(
         new MarkableFileInputStreamFactory(trainFile)), tagset);

     LemmatizerModel model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
     LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));

     evaluator.evaluate(new ConlluLemmaSampleStream(new ConlluStream(
         new MarkableFileInputStreamFactory(evalFile)), tagset));

     return evaluator.getWordAccuracy();
   }

   @Test
   public void trainAndEvalSpanishAncora() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put("Threads", "4");

     double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
         params, SPA_ANCORA_DEV);

     Assert.assertEquals(0.9057341692068787d, wordAccuracy, ACCURACY_DELTA);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.eval;

	import java.io.File;
	import java.io.IOException;
	import java.math.BigInteger;

	import org.junit.Assert;
	import org.junit.BeforeClass;
	import org.junit.Test;

	import opennlp.tools.formats.conllu.ConlluLemmaSampleStream;
	import opennlp.tools.formats.conllu.ConlluStream;
	import opennlp.tools.formats.conllu.ConlluTagset;
	import opennlp.tools.lemmatizer.LemmaSample;
	import opennlp.tools.lemmatizer.LemmatizerEvaluator;
	import opennlp.tools.lemmatizer.LemmatizerFactory;
	import opennlp.tools.lemmatizer.LemmatizerME;
	import opennlp.tools.lemmatizer.LemmatizerModel;
	import opennlp.tools.util.MarkableFileInputStreamFactory;
	import opennlp.tools.util.ObjectStream;
	import opennlp.tools.util.TrainingParameters;
	import opennlp.tools.util.model.ModelUtil;

	public class UniversalDependency20Eval extends AbstractEvalTest {

	private static File SPA_ANCORA_TRAIN;
	private static File SPA_ANCORA_DEV;

	@BeforeClass
	public static void verifyTrainingData() throws Exception {

	SPA_ANCORA_TRAIN = new File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
	SPA_ANCORA_DEV = new File(getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");

	verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
	new BigInteger("224942804200733453179524127037951530195"));
	verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
	new BigInteger("280996187464384493180190898172297941708"));
	}

	private double trainAndEval(String lang, File trainFile, TrainingParameters params,
	File evalFile) throws IOException {
	ConlluTagset tagset = ConlluTagset.X;

	ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(new ConlluStream(
	new MarkableFileInputStreamFactory(trainFile)), tagset);

	LemmatizerModel model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
	LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));

	evaluator.evaluate(new ConlluLemmaSampleStream(new ConlluStream(
	new MarkableFileInputStreamFactory(evalFile)), tagset));

	return evaluator.getWordAccuracy();
	}

	@Test
	public void trainAndEvalSpanishAncora() throws IOException {
	TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
	params.put("Threads", "4");

	double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
	params, SPA_ANCORA_DEV);

	Assert.assertEquals(0.9057341692068787d, wordAccuracy, ACCURACY_DELTA);
	}
	}