blob: 8c9d2cf9e8054f0cf5c2cc178215103771092fe7 [file] [log] [blame]
// Lucene version compatibility level 8.2.0
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Analysis.OpenNlp
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Tests the tokenizer factory and, through it, the tokenizer itself. The tokenizer needs
/// the OpenNLP model files, which this test can load from src/test-files/opennlp/solr/conf.
/// </summary>
public class TestOpenNLPTokenizerFactory : BaseTokenStreamTestCase
// Two-sentence input shared by the tests below.
private const string SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";

// Expected tokens for SENTENCES; punctuation marks are separate tokens.
private static string[] SENTENCES_punc =
{
    "Sentence", "number", "1", "has", "6", "words", ".",
    "Sentence", "number", "2", ",", "5", "words", "."
};

// Character offset in SENTENCES at which each expected token begins.
private static int[] SENTENCES_startOffsets =
{
    0, 9, 16, 18, 22, 24, 29,
    31, 40, 47, 48, 50, 52, 57
};

// Character offset in SENTENCES just past the last character of each expected token.
private static int[] SENTENCES_endOffsets =
{
    8, 15, 17, 21, 23, 29, 30,
    39, 46, 48, 49, 51, 57, 58
};

// Single-sentence input (the first sentence of SENTENCES).
private const string SENTENCE1 = "Sentence number 1 has 6 words.";

// Expected tokens for SENTENCE1.
private static string[] SENTENCE1_punc =
{
    "Sentence", "number", "1", "has", "6", "words", "."
};
/// <summary>
/// Builds an analyzer around a tokenizer created by <see cref="OpenNLPTokenizerFactory"/>
/// (configured with both a sentence model and a tokenizer model) and checks the tokens it
/// produces for <c>SENTENCES</c> (terms plus start/end offsets) and for <c>SENTENCE1</c>
/// (terms only).
/// </summary>
[Test]
public void TestTokenizer()
{
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
    {
        var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string>
        {
            { "sentenceModel", "en-test-sent.bin" },
            { "tokenizerModel", "en-test-tokenizer.bin" }
        });
        // Hand the factory a resource loader before creating the tokenizer, exactly as
        // TestClose does; the model names above are resolved through that loader.
        tokenizerFactory.Inform(new ClasspathResourceLoader(GetType()));
        var tokenizer = tokenizerFactory.Create(reader);
        return new TokenStreamComponents(tokenizer);
    });

    // Upstream Java equivalent, kept for reference:
    //CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
    //    .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
    //    .build();

    AssertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
    AssertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
}
/// <summary>
/// Checks that configuring <see cref="OpenNLPTokenizerFactory"/> without a
/// <c>sentenceModel</c> argument results in an <see cref="ArgumentException"/> whose
/// message names the missing parameter.
/// </summary>
[Test]
public void TestTokenizerNoSentenceDetector()
{
    var expected = Assert.Throws<ArgumentException>(() =>
    {
        Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
        {
            // Only the tokenizer model is supplied; "sentenceModel" is deliberately omitted.
            var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string>
            {
                { "tokenizerModel", "en-test-tokenizer.bin" }
            });
            // Same setup sequence as the other tests in this class.
            tokenizerFactory.Inform(new ClasspathResourceLoader(GetType()));
            var tokenizer = tokenizerFactory.Create(reader);
            return new TokenStreamComponents(tokenizer);
        });
        // Requesting a token stream makes the analyzer run the createComponents callback.
        analyzer.GetTokenStream("", "");
    });

    // Upstream Java equivalent, kept for reference:
    // IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    //     CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
    //         .withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
    //         .build();
    // });

    assertTrue(expected.Message.Contains("Configuration Error: missing parameter 'sentenceModel'"));
}
/// <summary>
/// Checks that configuring <see cref="OpenNLPTokenizerFactory"/> without a
/// <c>tokenizerModel</c> argument results in an <see cref="ArgumentException"/> whose
/// message names the missing parameter.
/// </summary>
[Test]
public void TestTokenizerNoTokenizer()
{
    var expected = Assert.Throws<ArgumentException>(() =>
    {
        Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldname, reader) =>
        {
            // Only the sentence model is supplied; "tokenizerModel" is deliberately omitted.
            var tokenizerFactory = new OpenNLPTokenizerFactory(new Dictionary<string, string>
            {
                { "sentenceModel", "en-test-sent.bin" }
            });
            // Same setup sequence as the other tests in this class.
            tokenizerFactory.Inform(new ClasspathResourceLoader(GetType()));
            var tokenizer = tokenizerFactory.Create(reader);
            return new TokenStreamComponents(tokenizer);
        });
        // Requesting a token stream makes the analyzer run the createComponents callback.
        analyzer.GetTokenStream("", "");
    });

    // Upstream Java equivalent, kept for reference:
    // IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    //     CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
    //         .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
    //         .build();
    // });

    assertTrue(expected.Message.Contains("Configuration Error: missing parameter 'tokenizerModel'"));
}
/// <summary>
/// Tests reuse of a single tokenizer instance, as happens when an analyzer caches its
/// tokenizer: the same tokenizer is given a fresh reader over <c>SENTENCES</c> twice and
/// must produce the expected tokens each time.
/// </summary>
[Test]
public void TestClose()
{
    IDictionary<String, String> args = new Dictionary<String, String>()
    {
        { "sentenceModel", "en-test-sent.bin" },
        { "tokenizerModel", "en-test-tokenizer.bin" }
    };
    OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
    factory.Inform(new ClasspathResourceLoader(GetType()));

    Tokenizer ts = factory.Create(NewAttributeFactory(), new StringReader(SENTENCES));

    // First pass over the input.
    ts.SetReader(new StringReader(SENTENCES));
    AssertTokenStreamContents(ts, SENTENCES_punc);

    // Second pass: same tokenizer instance, new reader.
    ts.SetReader(new StringReader(SENTENCES));
    AssertTokenStreamContents(ts, SENTENCES_punc);
}
internal static void assertTrue(bool condition)