| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.opennlp; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.custom.CustomAnalyzer; |
| import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory; |
| import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory; |
| import org.apache.lucene.util.ClasspathResourceLoader; |
| |
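/**
 * Tests for {@link OpenNLPLemmatizerFilterFactory}: dictionary-based lemmatization, maxent
 * (statistical) lemmatization, and the combination of both, using the miniature OpenNLP test
 * models shipped with this module's test resources.
 */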
| public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase { |
| |
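  // Single-sentence fixture: the lemmas expected from the dictionary lemmatizer, the lemmas
  // expected from the maxent lemmatizer (the tiny test model yields "runn" for "running"), and
  // the POS tags assigned by the test tagger model.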
| private static final String SENTENCE = "They sent him running in the evening."; |
| private static final String[] SENTENCE_dict_punc = { |
| "they", "send", "he", "run", "in", "the", "evening", "." |
| }; |
| private static final String[] SENTENCE_maxent_punc = { |
| "they", "send", "he", "runn", "in", "the", "evening", "." |
| }; |
| private static final String[] SENTENCE_posTags = { |
| "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "." |
| }; |
| |
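  // Two-sentence fixture, verifying that lemmatization works across sentence boundaries.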
| private static final String SENTENCES = |
| "They sent him running in the evening. He did not come back."; |
| private static final String[] SENTENCES_dict_punc = { |
| "they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "." |
| }; |
| private static final String[] SENTENCES_maxent_punc = { |
| "they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", |
| "." |
| }; |
| private static final String[] SENTENCES_posTags = { |
| "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "." |
| }; |
| |
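  // Fixtures for the combined configuration: the dictionary is consulted first and the maxent
  // model serves as a fallback for tokens the dictionary does not cover.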
| private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed."; |
| private static final String[] SENTENCE_both_punc = { |
| "konstantin", "kalashnitsov", "constantly", "caliph", "." |
| }; |
| private static final String[] SENTENCE_both_posTags = {"IN", "JJ", "NN", "VBN", "."}; |
| |
| private static final String SENTENCES_both = |
| "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely."; |
| private static final String[] SENTENCES_both_punc = { |
| "konstantin", |
| "kalashnitsov", |
| "constantly", |
| "caliph", |
| ".", |
| "coreena", |
| "could", |
| "care", |
| ",", |
| "completely", |
| "." |
| }; |
| private static final String[] SENTENCES_both_posTags = { |
| "IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "." |
| }; |
| |
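  // Expected output with KeywordRepeatFilter ahead of the lemmatizer: each original token is
  // emitted with KeywordAttribute set, so the lemmatizer leaves it intact next to its lemma;
  // RemoveDuplicatesTokenFilter then drops the copy whenever the lemma equals the original.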
| private static final String[] SENTENCES_dict_keep_orig_punc = { |
| "They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", |
| "He", "he", "did", "do", "not", "come", "back", "." |
| }; |
  private static final String[] SENTENCES_maxent_keep_orig_punc = {
| "They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", |
| "He", "he", "did", "do", "not", "come", "back", "." |
| }; |
| private static final String[] SENTENCES_keep_orig_posTags = { |
| "NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", |
| "VBD", "VBD", "RB", "VB", "RB", "." |
| }; |
| |
| private static final String[] SENTENCES_both_keep_orig_punc = { |
| "Konstantin", |
| "konstantin", |
| "Kalashnitsov", |
| "kalashnitsov", |
| "constantly", |
| "caliphed", |
| "caliph", |
| ".", |
| "Coreena", |
| "coreena", |
| "could", |
| "care", |
| ",", |
| "completely", |
| "." |
| }; |
| private static final String[] SENTENCES_both_keep_orig_posTags = { |
| "IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "." |
| }; |
| |
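  // File names of the miniature OpenNLP models and lemma dictionary on the test classpath.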
| private static final String tokenizerModelFile = "en-test-tokenizer.bin"; |
| private static final String sentenceModelFile = "en-test-sent.bin"; |
| private static final String posTaggerModelFile = "en-test-pos-maxent.bin"; |
| private static final String lemmatizerModelFile = "en-test-lemmatizer.bin"; |
| private static final String lemmatizerDictFile = "en-test-lemmas.dict"; |
| |
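  // All tests build a CustomAnalyzer from the OpenNLP tokenizer (sentence detector + tokenizer
  // models), the OpenNLP POS tagger, and the lemmatizer under test, resolving resources through
  // a ClasspathResourceLoader relative to this class.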
| public void test1SentenceDictionaryOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin") |
| .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict") |
| .build(); |
| assertAnalyzesTo( |
| analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags, null, null, true); |
| } |
| |
| public void test2SentencesDictionaryOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, SENTENCES, SENTENCES_dict_punc, null, null, SENTENCES_posTags, null, null, true); |
| } |
| |
| public void test1SentenceMaxEntOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, SENTENCE, SENTENCE_maxent_punc, null, null, SENTENCE_posTags, null, null, true); |
| } |
| |
| public void test2SentencesMaxEntOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCES, |
| SENTENCES_maxent_punc, |
| null, |
| null, |
| SENTENCES_posTags, |
| null, |
| null, |
| true); |
| } |
| |
| public void test1SentenceDictionaryAndMaxEnt() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin") |
| .addTokenFilter( |
| "opennlplemmatizer", |
| "dictionary", |
| "en-test-lemmas.dict", |
| "lemmatizerModel", |
| lemmatizerModelFile) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCE_both, |
| SENTENCE_both_punc, |
| null, |
| null, |
| SENTENCE_both_posTags, |
| null, |
| null, |
| true); |
| } |
| |
| public void test2SentencesDictionaryAndMaxEnt() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter( |
| "opennlplemmatizer", |
| "dictionary", |
| lemmatizerDictFile, |
| "lemmatizerModel", |
| lemmatizerModelFile) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCES_both, |
| SENTENCES_both_punc, |
| null, |
| null, |
| SENTENCES_both_posTags, |
| null, |
| null, |
| true); |
| } |
| |
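  // The following tests verify KeywordAttribute awareness: originals repeated by
  // KeywordRepeatFilter and flagged as keywords must pass through the lemmatizer unchanged.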
| public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter(KeywordRepeatFilterFactory.class) |
| .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile) |
| .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCES, |
| SENTENCES_dict_keep_orig_punc, |
| null, |
| null, |
| SENTENCES_keep_orig_posTags, |
| null, |
| null, |
| true); |
| } |
| |
| public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter(KeywordRepeatFilterFactory.class) |
| .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile) |
| .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCES, |
        SENTENCES_maxent_keep_orig_punc,
| null, |
| null, |
| SENTENCES_keep_orig_posTags, |
| null, |
| null, |
| true); |
| } |
| |
| public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception { |
| CustomAnalyzer analyzer = |
| CustomAnalyzer.builder(new ClasspathResourceLoader(getClass())) |
| .withTokenizer( |
| "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile) |
| .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile) |
| .addTokenFilter(KeywordRepeatFilterFactory.class) |
| .addTokenFilter( |
| "opennlplemmatizer", |
| "dictionary", |
| lemmatizerDictFile, |
| "lemmatizerModel", |
| lemmatizerModelFile) |
| .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class) |
| .build(); |
| assertAnalyzesTo( |
| analyzer, |
| SENTENCES_both, |
| SENTENCES_both_keep_orig_punc, |
| null, |
| null, |
| SENTENCES_both_keep_orig_posTags, |
| null, |
| null, |
| true); |
| } |
| } |