// Lucene version compatibility level 8.2.0
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;

namespace Lucene.Net.Analysis.OpenNlp
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
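/// <summary>
/// Tests for <see cref="OpenNLPLemmatizerFilterFactory"/>: dictionary-based and
/// maxent-model-based lemmatization, each alone and combined, plus
/// KeywordAttribute awareness when chained with KeywordRepeatFilter and
/// RemoveDuplicatesTokenFilter.
/// </summary>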
public class TestOpenNLPLemmatizerFilterFactory : BaseTokenStreamTestCase
{
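// Expected-output naming: *_dict_punc = lemmas produced by the dictionary,
// *_maxent_punc = lemmas produced by the maxent model, *_posTags = POS tags carried
// in the token type attribute, and *_keep_orig_* = original tokens interleaved with
// lemmas (for the KeywordRepeatFilter chains). Note that the tiny test maxent model
// emits "runn" for "running"; the arrays assert the model's actual output rather
// than ideal English lemmas.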
private const String SENTENCE = "They sent him running in the evening.";
private static readonly String[] SENTENCE_dict_punc = { "they", "send", "he", "run", "in", "the", "evening", "." };
private static readonly String[] SENTENCE_maxent_punc = { "they", "send", "he", "runn", "in", "the", "evening", "." };
private static readonly String[] SENTENCE_posTags = { "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "." };
private const String SENTENCES = "They sent him running in the evening. He did not come back.";
private static readonly String[] SENTENCES_dict_punc
= { "they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "." };
private static readonly String[] SENTENCES_maxent_punc
= { "they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "." };
private static readonly String[] SENTENCES_posTags
= { "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "." };
private static readonly String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
private static readonly String[] SENTENCE_both_punc
= { "konstantin", "kalashnitsov", "constantly", "caliph", "." };
private static readonly String[] SENTENCE_both_posTags
= { "IN", "JJ", "NN", "VBN", "." };
private const String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
private static readonly String[] SENTENCES_both_punc
= { "konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "." };
private static readonly String[] SENTENCES_both_posTags
= { "IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "." };
private static readonly String[] SENTENCES_dict_keep_orig_punc
= { "They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "." };
private static readonly String[] SENTENCES_max_ent_keep_orig_punc
= { "They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "." };
private static readonly String[] SENTENCES_keep_orig_posTags
= { "NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", "VBD", "VBD", "RB", "VB", "RB", "." };
private static readonly String[] SENTENCES_both_keep_orig_punc
= { "Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "." };
private static readonly String[] SENTENCES_both_keep_orig_posTags
= { "IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "." };
private const String tokenizerModelFile = "en-test-tokenizer.bin";
private const String sentenceModelFile = "en-test-sent.bin";
private const String posTaggerModelFile = "en-test-pos-maxent.bin";
private const String lemmatizerModelFile = "en-test-lemmatizer.bin";
private const String lemmatizerDictFile = "en-test-lemmas.dict";
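
// Chain: OpenNLPTokenizer (sentence detection + tokenization) -> OpenNLPPOSFilter ->
// OpenNLPLemmatizerFilter configured with the lemma dictionary only. POS tags are
// verified through the token type attribute (the 'types' argument of AssertAnalyzesTo).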
[Test]
public void Test1SentenceDictionaryOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
//.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
//.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
//.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
//.build();
AssertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
SENTENCE_posTags, null, null, true);
}
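
// Same dictionary-only chain, over a two-sentence input.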
[Test]
public void Test2SentencesDictionaryOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
//.withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
//.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
//.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
//.build();
AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
SENTENCES_posTags, null, null, true);
}
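
// Lemmatizer configured with the maxent lemmatizer model only (no dictionary).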
[Test]
public void Test1SentenceMaxEntOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
// .build();
AssertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
SENTENCE_posTags, null, null, true);
}
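
// Same maxent-only chain, over a two-sentence input.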
[Test]
public void Test2SentencesMaxEntOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
// .build();
AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
SENTENCES_posTags, null, null, true);
}
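
// Lemmatizer configured with both the dictionary and the maxent model; per the
// factory's documented behavior, the dictionary is consulted first and the maxent
// model serves as the fallback for forms the dictionary doesn't cover.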
[Test]
public void Test1SentenceDictionaryAndMaxEnt()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile }, { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
// .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
// .build();
AssertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
SENTENCE_both_posTags, null, null, true);
}
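
// Both dictionary and maxent model, over a two-sentence input.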
[Test]
public void Test2SentencesDictionaryAndMaxEnt()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile }, { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(opennlpPOSFilter);
return new TokenStreamComponents(opennlp, opennlpLemmatizerFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
// .build();
AssertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
SENTENCES_both_posTags, null, null, true);
}
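
// KeywordRepeatFilter emits each token twice (the first copy marked with
// KeywordAttribute); the lemmatizer leaves keyword-marked tokens unchanged, so
// original and lemma appear side by side, and RemoveDuplicatesTokenFilter then
// drops the second copy wherever the lemma is identical to the original.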
[Test]
public void TestKeywordAttributeAwarenessDictionaryOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var keywordRepeatFilterFactory = new KeywordRepeatFilterFactory(new Dictionary<string, string>());
var keywordRepeatFilter = keywordRepeatFilterFactory.Create(opennlpPOSFilter);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(keywordRepeatFilter);
var removeDuplicatesTokenFilterFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary<string, string>());
var removeDuplicatesTokenFilter = removeDuplicatesTokenFilterFactory.Create(opennlpLemmatizerFilter);
return new TokenStreamComponents(opennlp, removeDuplicatesTokenFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter(KeywordRepeatFilterFactory.class)
// .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
// .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
// .build();
AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}
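
// Same keyword-awareness chain, with the maxent model instead of the dictionary.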
[Test]
public void TestKeywordAttributeAwarenessMaxEntOnly()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var keywordRepeatFilterFactory = new KeywordRepeatFilterFactory(new Dictionary<string, string>());
var keywordRepeatFilter = keywordRepeatFilterFactory.Create(opennlpPOSFilter);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(keywordRepeatFilter);
var removeDuplicatesTokenFilterFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary<string, string>());
var removeDuplicatesTokenFilter = removeDuplicatesTokenFilterFactory.Create(opennlpLemmatizerFilter);
return new TokenStreamComponents(opennlp, removeDuplicatesTokenFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter(KeywordRepeatFilterFactory.class)
// .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
// .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
// .build();
AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
SENTENCES_keep_orig_posTags, null, null, true);
}
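
// Same keyword-awareness chain, with both the dictionary and the maxent model.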
[Test]
public void TestKeywordAttributeAwarenessDictionaryAndMaxEnt()
{
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
{
var loader = new ClasspathResourceLoader(GetType());
var opennlpFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string> { { "tokenizerModel", tokenizerModelFile }, { "sentenceModel", sentenceModelFile } });
opennlpFactory.Inform(loader);
var opennlp = opennlpFactory.Create(reader);
var opennlpPOSFilterFactory = new OpenNLPPOSFilterFactory(new Dictionary<string, string> { { "posTaggerModel", posTaggerModelFile } });
opennlpPOSFilterFactory.Inform(loader);
var opennlpPOSFilter = opennlpPOSFilterFactory.Create(opennlp);
var keywordRepeatFilterFactory = new KeywordRepeatFilterFactory(new Dictionary<string, string>());
var keywordRepeatFilter = keywordRepeatFilterFactory.Create(opennlpPOSFilter);
var opennlpLemmatizerFilterFactory = new OpenNLPLemmatizerFilterFactory(new Dictionary<string, string> { { "dictionary", lemmatizerDictFile }, { "lemmatizerModel", lemmatizerModelFile } });
opennlpLemmatizerFilterFactory.Inform(loader);
var opennlpLemmatizerFilter = opennlpLemmatizerFilterFactory.Create(keywordRepeatFilter);
var removeDuplicatesTokenFilterFactory = new RemoveDuplicatesTokenFilterFactory(new Dictionary<string, string>());
var removeDuplicatesTokenFilter = removeDuplicatesTokenFilterFactory.Create(opennlpLemmatizerFilter);
return new TokenStreamComponents(opennlp, removeDuplicatesTokenFilter);
});
//CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
// .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
// .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
// .addTokenFilter(KeywordRepeatFilterFactory.class)
// .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
// .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
// .build();
AssertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
SENTENCES_both_keep_orig_posTags, null, null, true);
}
}
}