lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.opennlp;

 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
 import org.apache.lucene.analysis.util.ClasspathResourceLoader;

 /**
  * Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
  * The POS model is based on this tokenization.
  *
  * Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
  */
 public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {

   private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
   private static final String[] SENTENCES_punc
       = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
   private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
   private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
   private static final String[] SENTENCES_posTags
       = {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};

   private static final String NO_BREAK = "No period";
   private static final String[] NO_BREAK_terms = {"No", "period"};
   private static final int[] NO_BREAK_startOffsets = {0, 3};
   private static final int[] NO_BREAK_endOffsets = {2, 9};

   private static final String sentenceModelFile = "en-test-sent.bin";
   private static final String tokenizerModelFile = "en-test-tokenizer.bin";
   private static final String posTaggerModelFile = "en-test-pos-maxent.bin";


   private static byte[][] toPayloads(String... strings) {
     return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
   }

   public void testBasic() throws IOException {
     CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
         .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
         .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
         .build();
     assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
   }

   public void testPOS() throws Exception {
     CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
         .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
         .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
         .build();
     assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
         SENTENCES_posTags, null, null, true);

     analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
         .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
         .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
         .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
         .build();
     assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
         null, null, null, true, toPayloads(SENTENCES_posTags));
   }

   public void testNoBreak() throws Exception {
     CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
         .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
         .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
         .build();
     assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
         null, null, null, true);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.analysis.opennlp;

	import java.io.IOException;
	import java.nio.charset.StandardCharsets;
	import java.util.Arrays;

	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.custom.CustomAnalyzer;
	import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
	import org.apache.lucene.analysis.util.ClasspathResourceLoader;

	/**
	* Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
	* The POS model is based on this tokenization.
	*
	* Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
	*/
	public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {

	private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
	private static final String[] SENTENCES_punc
	= {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
	private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
	private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
	private static final String[] SENTENCES_posTags
	= {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};

	private static final String NO_BREAK = "No period";
	private static final String[] NO_BREAK_terms = {"No", "period"};
	private static final int[] NO_BREAK_startOffsets = {0, 3};
	private static final int[] NO_BREAK_endOffsets = {2, 9};

	private static final String sentenceModelFile = "en-test-sent.bin";
	private static final String tokenizerModelFile = "en-test-tokenizer.bin";
	private static final String posTaggerModelFile = "en-test-pos-maxent.bin";


	private static byte[][] toPayloads(String... strings) {
	return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
	}

	public void testBasic() throws IOException {
	CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.build();
	assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
	}

	public void testPOS() throws Exception {
	CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.build();
	assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
	SENTENCES_posTags, null, null, true);

	analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
	.build();
	assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
	null, null, null, true, toPayloads(SENTENCES_posTags));
	}

	public void testNoBreak() throws Exception {
	CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
	.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
	.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
	.build();
	assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
	null, null, null, true);
	}
	}