/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.opennlp;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
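
/**
 * Tests for the OpenNLP lemmatizer token filter factory: dictionary-based and
 * statistical (MaxEnt) lemmatization, used separately and combined, plus
 * KeywordAttribute awareness.
 */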
public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
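// Expected lemmas and POS tags. The values reflect the small test models
// bundled with this module (e.g. the MaxEnt lemma "runn" and the NNP tag for
// "They"), not linguistically correct analyses.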
private static final String SENTENCE = "They sent him running in the evening.";
private static final String[] SENTENCE_dict_punc = {"they", "send", "he", "run", "in", "the", "evening", "."};
private static final String[] SENTENCE_maxent_punc = {"they", "send", "he", "runn", "in", "the", "evening", "."};
private static final String[] SENTENCE_posTags = {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."};
private static final String SENTENCES = "They sent him running in the evening. He did not come back.";
private static final String[] SENTENCES_dict_punc
= {"they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_maxent_punc
= {"they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_posTags
= {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."};
private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
private static final String[] SENTENCE_both_punc
= {"konstantin", "kalashnitsov", "constantly", "caliph", "."};
private static final String[] SENTENCE_both_posTags
= {"IN", "JJ", "NN", "VBN", "."};
private static final String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
private static final String[] SENTENCES_both_punc
= {"konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "."};
private static final String[] SENTENCES_both_posTags
= {"IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "."};
private static final String[] SENTENCES_dict_keep_orig_punc
= {"They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_max_ent_keep_orig_punc
= {"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
private static final String[] SENTENCES_keep_orig_posTags
= {"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", "VBD", "VBD", "RB", "VB", "RB", "."};
private static final String[] SENTENCES_both_keep_orig_punc
= {"Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "."};
private static final String[] SENTENCES_both_keep_orig_posTags
= {"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."};
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
private static final String lemmatizerDictFile = "en-test-lemmas.dict";

public void test1SentenceDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
SENTENCE_posTags, null, null, true);
}

public void test2SentencesDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
SENTENCES_posTags, null, null, true);
}

public void test1SentenceMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
SENTENCE_posTags, null, null, true);
}

public void test2SentencesMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
SENTENCES_posTags, null, null, true);
}

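// When both a dictionary and a MaxEnt model are configured, the dictionary is
// consulted first and the MaxEnt model handles tokens the dictionary misses.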
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
SENTENCE_both_posTags, null, null, true);
}

public void test2SentencesDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
SENTENCES_both_posTags, null, null, true);
}

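// KeywordRepeatFilter emits each token twice, marking the first copy with
// KeywordAttribute so the lemmatizer leaves it unchanged; RemoveDuplicatesTokenFilter
// then drops the second copy whenever the lemma is identical to the original.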
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}

public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}

public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
SENTENCES_both_keep_orig_posTags, null, null, true);
}
}