lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.opennlp;

 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
 import org.apache.lucene.util.ClasspathResourceLoader;

 /**
  * Needs the OpenNLP Tokenizer because it creates full streams of punctuation. Needs the OpenNLP POS
  * tagger for the POS tags.
  *
  * <p>Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not
  * very accurate.
  */
 public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {

   private static final String SENTENCES =
       "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
   private static final String[] SENTENCES_punc = {
     "Sentence",
     "number",
     "1",
     "has",
     "6",
     "words",
     ".",
     "Sentence",
     "number",
     "2",
     ",",
     "5",
     "words",
     "."
   };
   private static final int[] SENTENCES_startOffsets = {
     0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57
   };
   private static final int[] SENTENCES_endOffsets = {
     8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
   };
   private static final String[] SENTENCES_chunks = {
     "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
     "I-NP", "O"
   };

   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
   private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
   private static final String chunkerModelFile = "en-test-chunker.bin";

   private static byte[][] toPayloads(String... strings) {
     return Arrays.stream(strings)
         .map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8))
         .toArray(byte[][]::new);
   }

   public void testBasic() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
             .withTokenizer(
                 "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
             .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
             .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
             .build();
     assertAnalyzesTo(
         analyzer,
         SENTENCES,
         SENTENCES_punc,
         SENTENCES_startOffsets,
         SENTENCES_endOffsets,
         SENTENCES_chunks,
         null,
         null,
         true);
   }

   public void testPayloads() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
             .withTokenizer(
                 "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
             .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
             .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
             .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
             .build();
     assertAnalyzesTo(
         analyzer,
         SENTENCES,
         SENTENCES_punc,
         SENTENCES_startOffsets,
         SENTENCES_endOffsets,
         null,
         null,
         null,
         true,
         toPayloads(SENTENCES_chunks));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.analysis.opennlp;

	import java.nio.charset.StandardCharsets;
	import java.util.Arrays;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.custom.CustomAnalyzer;
	import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
	import org.apache.lucene.util.ClasspathResourceLoader;

	/**
	* Needs the OpenNLP Tokenizer because it creates full streams of punctuation. Needs the OpenNLP POS
	* tagger for the POS tags.
	*
	* <p>Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not
	* very accurate.
	*/
	public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {

	private static final String SENTENCES =
	"Sentence number 1 has 6 words. Sentence number 2, 5 words.";
	private static final String[] SENTENCES_punc = {
	"Sentence",
	"number",
	"1",
	"has",
	"6",
	"words",
	".",
	"Sentence",
	"number",
	"2",
	",",
	"5",
	"words",
	"."
	};
	private static final int[] SENTENCES_startOffsets = {
	0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57
	};
	private static final int[] SENTENCES_endOffsets = {
	8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
	};
	private static final String[] SENTENCES_chunks = {
	"B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
	"I-NP", "O"
	};

	private static final String sentenceModelFile = "en-test-sent.bin";
	private static final String tokenizerModelFile = "en-test-tokenizer.bin";
	private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
	private static final String chunkerModelFile = "en-test-chunker.bin";

	private static byte[][] toPayloads(String... strings) {
	return Arrays.stream(strings)
	.map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8))
	.toArray(byte[][]::new);
	}

	public void testBasic() throws Exception {
	CustomAnalyzer analyzer =
	CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer(
	"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
	.build();
	assertAnalyzesTo(
	analyzer,
	SENTENCES,
	SENTENCES_punc,
	SENTENCES_startOffsets,
	SENTENCES_endOffsets,
	SENTENCES_chunks,
	null,
	null,
	true);
	}

	public void testPayloads() throws Exception {
	CustomAnalyzer analyzer =
	CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer(
	"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
	.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
	.build();
	assertAnalyzesTo(
	analyzer,
	SENTENCES,
	SENTENCES_punc,
	SENTENCES_startOffsets,
	SENTENCES_endOffsets,
	null,
	null,
	null,
	true,
	toPayloads(SENTENCES_chunks));
	}
	}