/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.eval;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
 * This test ensures that the existing SourceForge models perform
 * as expected.
 * <p>
 * To run this test the external Leipzig sentences file is needed:
 * leipzig/eng_news_2010_300K-sentences.txt. The file can be
 * obtained from the Leipzig Corpora project. <br>
 * <p>
 * In addition, all the SourceForge models are required:<br>
 * - models-sf/en-sent.bin<br>
 * - models-sf/en-token.bin<br>
 * - models-sf/en-ner-date.bin<br>
 * - models-sf/en-ner-location.bin<br>
 * - models-sf/en-ner-money.bin<br>
 * - models-sf/en-ner-organization.bin<br>
 * - models-sf/en-ner-percentage.bin<br>
 * - models-sf/en-ner-person.bin<br>
 * - models-sf/en-ner-time.bin<br>
 * - models-sf/en-chunker.bin<br>
 * - models-sf/en-pos-maxent.bin<br>
 * - models-sf/en-pos-perceptron.bin<br>
 * - models-sf/en-parser-chunking.bin<br>
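 * <p>
 * Every evaluation below follows the same pattern: run a model over the
 * corpus, feed the model output into a message digest, and compare the
 * digest against a recorded reference value. A minimal sketch of that
 * pattern (the {@code outputs} collection is illustrative, not part of
 * the test code):
 * <pre>{@code
 * MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
 * for (String output : outputs) {
 *   digest.update(output.getBytes(StandardCharsets.UTF_8));
 * }
 * Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
 * }</pre>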
 */
public class SourceForgeModelEval extends AbstractEvalTest {

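  /**
   * A test sample that holds the tokens of one document assembled from
   * Leipzig corpus sentences.
   */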
  private static class LeipzigTestSample {
    private final List<String> text;

    private LeipzigTestSample(String[] text) {
      Objects.requireNonNull(text, "text must not be null");
      this.text = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(text)));
    }

    public String[] getText() {
      return text.toArray(new String[text.size()]);
    }

    @Override
    public String toString() {

      StringBuilder sampleString = new StringBuilder("eng");

      sampleString.append('\t');

      for (String s : text) {
        sampleString.append(s).append(' ');
      }

      if (sampleString.length() > 0) {
        // remove last space
        sampleString.setLength(sampleString.length() - 1);
      }

      return sampleString.toString();
    }
  }

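  /**
   * Streams the Leipzig sentences file and groups a fixed number of
   * sentences into one {@link LeipzigTestSample}; the leading sentence
   * number of each line is dropped.
   */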
  private static class LeipzigTestSampleStream extends FilterObjectStream<String, LeipzigTestSample> {

    private final int sentencePerDocument;
    private final Tokenizer tokenizer;

    private LeipzigTestSampleStream(int sentencePerDocument, Tokenizer tokenizer, InputStreamFactory in)
        throws IOException {
      super(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
      this.sentencePerDocument = sentencePerDocument;
      this.tokenizer = tokenizer;
    }

    @Override
    public LeipzigTestSample read() throws IOException {
      int count = 0;
      List<String> tokensList = new ArrayList<>();

      String line;
      while (count < sentencePerDocument && (line = samples.read()) != null) {

        String[] tokens = tokenizer.tokenize(line);

        if (tokens.length == 0) {
          throw new IOException("Empty lines are not allowed!");
        }

        // Always skip first token, that is the sentence number!
        tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length));

        count++;
      }

      if (tokensList.size() > 0) {
        return new LeipzigTestSample(tokensList.toArray(new String[tokensList.size()]));
      }

      return null;
    }
  }

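  /**
   * Guards the evaluation against a modified corpus: the checksum of the
   * Leipzig test data must match the recorded value before any model is
   * evaluated.
   */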
  @BeforeClass
  public static void verifyTrainingData() throws Exception {
    verifyTrainingData(new LeipzigTestSampleStream(25, SimpleTokenizer.INSTANCE,
            new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
                "leipzig/eng_news_2010_300K-sentences.txt"))),
        new BigInteger("172812413483919324675263268750583851712"));
  }

  @Test
  public void evalSentenceModel() throws Exception {

    SentenceModel model = new SentenceModel(
        new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    SentenceDetector sentenceDetector = new SentenceDetectorME(model);

    StringBuilder text = new StringBuilder();

    try (ObjectStream<LeipzigTestSample> lineBatches = new LeipzigTestSampleStream(25,
        SimpleTokenizer.INSTANCE,
        new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")))) {

      LeipzigTestSample lineBatch;
      while ((lineBatch = lineBatches.read()) != null) {
        text.append(String.join(" ", lineBatch.getText())).append(" ");
      }
    }

    String[] sentences = sentenceDetector.sentDetect(text.toString());

    for (String sentence : sentences) {
      digest.update(sentence.getBytes(StandardCharsets.UTF_8));
    }

    Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
        new BigInteger(1, digest.digest()));
  }

  @Test
  public void evalTokenModel() throws Exception {

    // The input stream is currently tokenized; it should be detokenized first
    // (or the stream extended to accept a tokenizer, so that a whitespace
    // tokenizer can be passed in) and then tokenized again here.

    TokenizerModel model = new TokenizerModel(
        new File(getOpennlpDataDir(), "models-sf/en-token.bin"));

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    Tokenizer tokenizer = new TokenizerME(model);

    try (ObjectStream<LeipzigTestSample> lines = new LeipzigTestSampleStream(1,
        WhitespaceTokenizer.INSTANCE,
        new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")))) {

      LeipzigTestSample line;
      while ((line = lines.read()) != null) {
        String[] tokens = tokenizer.tokenize(String.join(" ", line.getText()));
        for (String token : tokens) {
          digest.update(token.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
        new BigInteger(1, digest.digest()));
  }

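  /**
   * Creates a stream that emits one {@link LeipzigTestSample} per corpus
   * sentence, tokenized with the {@link SimpleTokenizer}.
   */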
  private ObjectStream<LeipzigTestSample> createLineWiseStream() throws IOException {
    return new LeipzigTestSampleStream(1,
        SimpleTokenizer.INSTANCE,
        new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
            "leipzig/eng_news_2010_300K-sentences.txt")));
  }

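  /**
   * Runs the given name finder model over the corpus and hashes the type,
   * start, and end of every detected name span before comparing the digest
   * against the expected value.
   */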
  private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
      throws Exception {

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    TokenNameFinder nameFinder = new NameFinderME(model);

    try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {

      LeipzigTestSample line;
      while ((line = lines.read()) != null) {
        Span[] names = nameFinder.find(line.getText());
        for (Span name : names) {
          digest.update((name.getType() + name.getStart()
              + name.getEnd()).getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
  }

  @Test
  public void evalNerDateModel() throws Exception {
    TokenNameFinderModel dateModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-date.bin"));

    evalNameFinder(dateModel, new BigInteger("116570003910213570906062355532299200317"));
  }

  @Test
  public void evalNerLocationModel() throws Exception {
    TokenNameFinderModel locationModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-location.bin"));

    evalNameFinder(locationModel, new BigInteger("44810593886021404716125849669208680993"));
  }

  @Test
  public void evalNerMoneyModel() throws Exception {
    TokenNameFinderModel moneyModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-money.bin"));

    evalNameFinder(moneyModel, new BigInteger("65248897509365807977219790824670047287"));
  }

  @Test
  public void evalNerOrganizationModel() throws Exception {
    TokenNameFinderModel organizationModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));

    evalNameFinder(organizationModel, new BigInteger("50454559690338630659278005157657197233"));
  }

  @Test
  public void evalNerPercentageModel() throws Exception {
    TokenNameFinderModel percentageModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));

    evalNameFinder(percentageModel, new BigInteger("320996882594215344113023719117249515343"));
  }

  @Test
  public void evalNerPersonModel() throws Exception {
    TokenNameFinderModel personModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-person.bin"));

    evalNameFinder(personModel, new BigInteger("143619582249937129618340838626447763744"));
  }

  @Test
  public void evalNerTimeModel() throws Exception {
    TokenNameFinderModel timeModel = new TokenNameFinderModel(
        new File(getOpennlpDataDir(), "models-sf/en-ner-time.bin"));

    evalNameFinder(timeModel, new BigInteger("282941772380683328816791801782579055940"));
  }

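  /**
   * Evaluates the chunker model: each sentence is first POS tagged with the
   * perceptron tagger, the tagged sentence is then chunked, and all chunk
   * tags are hashed.
   */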
  @Test
  public void evalChunkerModel() throws Exception {

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    POSTagger tagger = new POSTaggerME(new POSModel(
        new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));

    Chunker chunker = new ChunkerME(new ChunkerModel(
        new File(getOpennlpDataDir(), "models-sf/en-chunker.bin")));

    try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {

      LeipzigTestSample line;
      while ((line = lines.read()) != null) {
        POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));

        String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
        for (String chunk : chunks) {
          digest.update(chunk.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
        new BigInteger(1, digest.digest()));
  }

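  /**
   * Tags the corpus with the given POS model and compares a digest of all
   * predicted tags against the expected value.
   */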
  private void evalPosModel(POSModel model, BigInteger expectedHash) throws Exception {

    // The input stream is split into one sentence per line and is already
    // tokenized, so it can be processed here directly.

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    POSTagger tagger = new POSTaggerME(model);

    try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {

      LeipzigTestSample line;
      while ((line = lines.read()) != null) {
        String[] tags = tagger.tag(line.getText());
        for (String tag : tags) {
          digest.update(tag.getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(expectedHash, new BigInteger(1, digest.digest()));
  }


  @Test
  public void evalMaxentModel() throws Exception {
    POSModel maxentModel = new POSModel(
        new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));

    evalPosModel(maxentModel, new BigInteger("231995214522232523777090597594904492687"));
  }

  @Test
  public void evalPerceptronModel() throws Exception {
    POSModel perceptronModel = new POSModel(
        new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));

    evalPosModel(perceptronModel, new BigInteger("209440430718727101220960491543652921728"));
  }

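  /**
   * Parses each corpus line with the chunking parser and hashes the
   * serialized top parse; a line without a parse contributes the fixed
   * marker "empty" instead.
   */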
  @Test
  public void evalParserModel() throws Exception {

    ParserModel model = new ParserModel(
        new File(getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));

    MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

    Parser parser = ParserFactory.create(model);

    try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {

      LeipzigTestSample line;
      while ((line = lines.read()) != null) {
        Parse[] parse = ParserTool.parseLine(String.join(" ", line.getText()), parser, 1);
        if (parse.length > 0) {
          StringBuffer sb = new StringBuffer();
          parse[0].show(sb);
          digest.update(sb.toString().getBytes(StandardCharsets.UTF_8));
        } else {
          digest.update("empty".getBytes(StandardCharsets.UTF_8));
        }
      }
    }

    Assert.assertEquals(new BigInteger("68039262350771988792233880373220954061"),
        new BigInteger(1, digest.digest()));
  }
}