/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
import org.apache.lucene.util.ClasspathResourceLoader;
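
/**
 * Tests for the OpenNLP lemmatizer token filter factory, covering dictionary-based lemmatization,
 * MaxEnt (model-based) lemmatization, the combination of the two, and KeywordAttribute awareness.
 */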
public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
private static final String SENTENCE = "They sent him running in the evening.";
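  // Expected lemmas from the dictionary-based lemmatizer: "sent" -> "send", "him" -> "he",
  // "running" -> "run".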
private static final String[] SENTENCE_dict_punc = {
"they", "send", "he", "run", "in", "the", "evening", "."
};
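  // Expected lemmas from the MaxEnt (statistical) lemmatizer; the small test model produces the
  // imperfect "runn" for "running".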
private static final String[] SENTENCE_maxent_punc = {
"they", "send", "he", "runn", "in", "the", "evening", "."
};
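  // POS tags (Penn Treebank tag set) assigned by the test tagger model, one per token.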
private static final String[] SENTENCE_posTags = {
"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."
};
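
  // Two-sentence variants of the same input and expected output.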
private static final String SENTENCES =
"They sent him running in the evening. He did not come back.";
private static final String[] SENTENCES_dict_punc = {
"they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."
};
private static final String[] SENTENCES_maxent_punc = {
"they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back",
"."
};
private static final String[] SENTENCES_posTags = {
"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."
};
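
  // Input built from invented words, presumably so that lemmas cannot all be resolved from the
  // dictionary and the MaxEnt model is exercised as well.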
private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
private static final String[] SENTENCE_both_punc = {
"konstantin", "kalashnitsov", "constantly", "caliph", "."
};
private static final String[] SENTENCE_both_posTags = {"IN", "JJ", "NN", "VBN", "."};
private static final String SENTENCES_both =
"Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
private static final String[] SENTENCES_both_punc = {
"konstantin",
"kalashnitsov",
"constantly",
"caliph",
".",
"coreena",
"could",
"care",
",",
"completely",
"."
};
private static final String[] SENTENCES_both_posTags = {
"IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "."
};
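
  // Expected output with KeywordRepeatFilter in the chain: each original token is kept alongside
  // its lemma, and tokens whose lemma equals the original (e.g. "in", "the") appear only once
  // after RemoveDuplicatesTokenFilter.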
private static final String[] SENTENCES_dict_keep_orig_punc = {
"They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".",
"He", "he", "did", "do", "not", "come", "back", "."
};
private static final String[] SENTENCES_max_ent_keep_orig_punc = {
"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".",
"He", "he", "did", "do", "not", "come", "back", "."
};
private static final String[] SENTENCES_keep_orig_posTags = {
"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP",
"VBD", "VBD", "RB", "VB", "RB", "."
};
private static final String[] SENTENCES_both_keep_orig_punc = {
"Konstantin",
"konstantin",
"Kalashnitsov",
"kalashnitsov",
"constantly",
"caliphed",
"caliph",
".",
"Coreena",
"coreena",
"could",
"care",
",",
"completely",
"."
};
private static final String[] SENTENCES_both_keep_orig_posTags = {
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
};
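
  // Test model and dictionary resources, resolved from the classpath relative to this class.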
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
private static final String lemmatizerDictFile = "en-test-lemmas.dict";
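
  // Dictionary-only lemmatization of a single sentence: sentence detection and tokenization,
  // then POS tagging, then lemma lookup in the dictionary.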
public void test1SentenceDictionaryOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(
analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags, null, null, true);
}
public void test2SentencesDictionaryOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.build();
assertAnalyzesTo(
analyzer, SENTENCES, SENTENCES_dict_punc, null, null, SENTENCES_posTags, null, null, true);
}
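
  // MaxEnt-only lemmatization: the statistical model predicts a lemma from each token and its
  // POS tag, which yields the imperfect "runn" noted above.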
public void test1SentenceMaxEntOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(
analyzer, SENTENCE, SENTENCE_maxent_punc, null, null, SENTENCE_posTags, null, null, true);
}
public void test2SentencesMaxEntOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
.build();
assertAnalyzesTo(
analyzer,
SENTENCES,
SENTENCES_maxent_punc,
null,
null,
SENTENCES_posTags,
null,
null,
true);
}
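
  // Dictionary and MaxEnt model configured together: the dictionary is consulted first and the
  // MaxEnt model serves as a fallback for words the dictionary does not cover.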
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter(
"opennlplemmatizer",
"dictionary",
"en-test-lemmas.dict",
"lemmatizerModel",
lemmatizerModelFile)
.build();
assertAnalyzesTo(
analyzer,
SENTENCE_both,
SENTENCE_both_punc,
null,
null,
SENTENCE_both_posTags,
null,
null,
true);
}
public void test2SentencesDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(
"opennlplemmatizer",
"dictionary",
lemmatizerDictFile,
"lemmatizerModel",
lemmatizerModelFile)
.build();
assertAnalyzesTo(
analyzer,
SENTENCES_both,
SENTENCES_both_punc,
null,
null,
SENTENCES_both_posTags,
null,
null,
true);
}
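
  // KeywordAttribute awareness: KeywordRepeatFilter emits each token twice, marking one copy as
  // a keyword so the lemmatizer leaves it unchanged; RemoveDuplicatesTokenFilter then drops the
  // repeated copy whenever the lemma comes out identical to the original token.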
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(
analyzer,
SENTENCES,
SENTENCES_dict_keep_orig_punc,
null,
null,
SENTENCES_keep_orig_posTags,
null,
null,
true);
}
public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(
analyzer,
SENTENCES,
SENTENCES_max_ent_keep_orig_punc,
null,
null,
SENTENCES_keep_orig_posTags,
null,
null,
true);
}
public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter(
"opennlplemmatizer",
"dictionary",
lemmatizerDictFile,
"lemmatizerModel",
lemmatizerModelFile)
.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
.build();
assertAnalyzesTo(
analyzer,
SENTENCES_both,
SENTENCES_both_keep_orig_punc,
null,
null,
SENTENCES_both_keep_orig_posTags,
null,
null,
true);
}
}