| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.eval; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.math.BigInteger; |
| import java.nio.charset.StandardCharsets; |
| |
| import org.junit.Assert; |
| import org.junit.BeforeClass; |
| import org.junit.Test; |
| |
| import opennlp.tools.chunker.ChunkerCrossValidator; |
| import opennlp.tools.chunker.ChunkerFactory; |
| import opennlp.tools.formats.ad.ADChunkSampleStream; |
| import opennlp.tools.formats.ad.ADNameSampleStream; |
| import opennlp.tools.formats.ad.ADSentenceSampleStream; |
| import opennlp.tools.formats.convert.NameToTokenSampleStream; |
| import opennlp.tools.namefind.NameSample; |
| import opennlp.tools.sentdetect.SDCrossValidator; |
| import opennlp.tools.sentdetect.SentenceDetectorFactory; |
| import opennlp.tools.sentdetect.lang.Factory; |
| import opennlp.tools.tokenize.DetokenizationDictionary; |
| import opennlp.tools.tokenize.DictionaryDetokenizer; |
| import opennlp.tools.tokenize.TokenSample; |
| import opennlp.tools.tokenize.TokenizerCrossValidator; |
| import opennlp.tools.tokenize.TokenizerFactory; |
| import opennlp.tools.util.MarkableFileInputStreamFactory; |
| import opennlp.tools.util.ObjectStream; |
| import opennlp.tools.util.PlainTextByLineStream; |
| import opennlp.tools.util.TrainingParameters; |
| import opennlp.tools.util.model.ModelUtil; |
| |
| /** |
| * Cross validation of Sentence Detector, Tokenizer and Chunker against the |
| * Portugues corpus. |
| * <p> |
| * Download the gz files from the Floresta Sintactica project <a |
| * href="http://www.linguateca.pt/floresta/corpus.html"> site </a> and |
| * decompress it into this directory: $OPENNLP_DATA_DIR/ad. |
| * <ul> |
| * <li><a href= |
| * "http://www.linguateca.pt/floresta/ficheiros/gz/FlorestaVirgem_CF_3.0_ad.txt.gz" |
| * > FlorestaVirgem_CF_3.0_ad.txt.gz </a></li> |
| * <li><a href= |
| * "http://www.linguateca.pt/floresta/ficheiros/gz/Bosque_CF_8.0.ad.txt.gz"> |
| * Bosque_CF_8.0.ad.txt.gz </a></li> |
| * </ul> |
| */ |
| public class ArvoresDeitadasEval extends AbstractEvalTest { |
| |
| private static final String BOSQUE = "ad/Bosque_CF_8.0.ad.txt"; |
| private static final String FLORESTA_VIRGEM = "ad/FlorestaVirgem_CF_3.0_ad.txt"; |
| |
| private static final String LANG = "por"; |
| |
| private static ObjectStream<String> getLineSample(String corpus) |
| throws IOException { |
| return new PlainTextByLineStream(new MarkableFileInputStreamFactory( |
| new File(getOpennlpDataDir(), corpus)), StandardCharsets.ISO_8859_1); |
| } |
| |
| @BeforeClass |
| public static void verifyTrainingData() throws Exception { |
| |
| verifyTrainingData(new ADSentenceSampleStream(getLineSample(BOSQUE), false), |
| new BigInteger("140568367548727787313497336739085858596")); |
| |
| verifyTrainingData(new ADSentenceSampleStream(getLineSample(FLORESTA_VIRGEM), false), |
| new BigInteger("2614161133949079191933514776652602918")); |
| |
| } |
| |
| private void sentenceCrossEval(TrainingParameters params, |
| double expectedScore) throws IOException { |
| |
| ADSentenceSampleStream samples = new ADSentenceSampleStream( |
| getLineSample(FLORESTA_VIRGEM), false); |
| |
| SDCrossValidator cv = new SDCrossValidator(LANG, params, |
| new SentenceDetectorFactory(LANG, true, null, |
| new Factory().getEOSCharacters(LANG))); |
| |
| cv.evaluate(samples, 10); |
| |
| System.out.println(cv.getFMeasure()); |
| Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.0001d); |
| } |
| |
| private void tokenizerCrossEval(TrainingParameters params, |
| double expectedScore) throws IOException { |
| |
| ObjectStream<NameSample> nameSamples = new ADNameSampleStream( |
| getLineSample(FLORESTA_VIRGEM), true); |
| |
| DictionaryDetokenizer detokenizer = new DictionaryDetokenizer( |
| new DetokenizationDictionary(new File("lang/pt/tokenizer/pt-detokenizer.xml"))); |
| |
| ObjectStream<TokenSample> samples = new NameToTokenSampleStream( |
| detokenizer, nameSamples); |
| |
| TokenizerCrossValidator validator; |
| |
| TokenizerFactory tokFactory = TokenizerFactory.create(null, LANG, null, |
| true, null); |
| validator = new opennlp.tools.tokenize.TokenizerCrossValidator(params, |
| tokFactory); |
| |
| validator.evaluate(samples, 10); |
| |
| System.out.println(validator.getFMeasure()); |
| Assert.assertEquals(expectedScore, validator.getFMeasure().getFMeasure(), |
| 0.0001d); |
| } |
| |
| private void chunkerCrossEval(TrainingParameters params, |
| double expectedScore) throws IOException { |
| |
| ADChunkSampleStream samples = new ADChunkSampleStream(getLineSample(BOSQUE)); |
| |
| ChunkerCrossValidator cv = new ChunkerCrossValidator(LANG, params, |
| new ChunkerFactory()); |
| |
| cv.evaluate(samples, 10); |
| Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.0001d); |
| } |
| |
| @Test |
| public void evalPortugueseSentenceDetectorPerceptron() throws IOException { |
| sentenceCrossEval(createPerceptronParams(), 0.9892778840089301d); |
| } |
| |
| @Test |
| public void evalPortugueseSentenceDetectorGis() throws IOException { |
| sentenceCrossEval(ModelUtil.createDefaultTrainingParameters(), 0.987270070655111d); |
| } |
| |
| @Test |
| public void evalPortugueseSentenceDetectorMaxentQn() throws IOException { |
| sentenceCrossEval(createMaxentQnParams(), 0.9924715809679968d); |
| } |
| |
| @Test |
| public void evalPortugueseSentenceDetectorNaiveBayes() throws IOException { |
| sentenceCrossEval(createNaiveBayesParams(), 0.9672196206048099d); |
| } |
| |
| @Test |
| public void evalPortugueseTokenizerPerceptron() throws IOException { |
| tokenizerCrossEval(createPerceptronParams(), 0.9994887308380267d); |
| } |
| |
| @Test |
| public void evalPortugueseTokenizerGis() throws IOException { |
| tokenizerCrossEval(ModelUtil.createDefaultTrainingParameters(), 0.9992539405481062d); |
| } |
| |
| @Test |
| public void evalPortugueseTokenizerMaxentQn() throws IOException { |
| tokenizerCrossEval(createMaxentQnParams(), 0.9996017148748251d); |
| } |
| |
| @Test |
| public void evalPortugueseTokenizerNaiveBayes() throws IOException { |
| tokenizerCrossEval(createNaiveBayesParams(), 0.9962358244502717d); |
| } |
| |
| @Test |
| public void evalPortugueseTokenizerMaxentQnMultipleThreads() throws IOException { |
| TrainingParameters params = createMaxentQnParams(); |
| params.put("Threads", 4); |
| tokenizerCrossEval(params, 0.9996017148748251d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerPerceptron() throws IOException { |
| chunkerCrossEval(createPerceptronParams(), |
| 0.9638122825015589d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerGis() throws IOException { |
| chunkerCrossEval(ModelUtil.createDefaultTrainingParameters(), |
| 0.9573860781121228d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerGisMultipleThreads() throws IOException { |
| TrainingParameters params = ModelUtil.createDefaultTrainingParameters(); |
| params.put("Threads", 4); |
| chunkerCrossEval(params, 0.9573860781121228d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerQn() throws IOException { |
| chunkerCrossEval(createMaxentQnParams(), |
| 0.9648211936491359d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerQnMultipleThreads() throws IOException { |
| TrainingParameters params = createMaxentQnParams(); |
| params.put("Threads", 4); |
| |
| // NOTE: Should be the same as without multiple threads!!! |
| chunkerCrossEval(params, 0.9649180953528779d); |
| } |
| |
| @Test |
| public void evalPortugueseChunkerNaiveBayes() throws IOException { |
| chunkerCrossEval(createNaiveBayesParams(), 0.9041507736043933d); |
| } |
| } |