// blob: 65a5509ce65a785747c6d83a42329ccf0fc751ef [file] [log] [blame]
// Lucene version compatibility level 8.2.0
using opennlp.tools.lemmatizer;
using System;
using System.Diagnostics;
using System.IO;
namespace Lucene.Net.Analysis.OpenNlp.Tools
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Supply OpenNLP Lemmatizer tools.
/// <para/>
/// Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported.
/// If both are configured, the dictionary-based lemmatizer is tried first,
/// and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
/// <para/>
/// The MaxEnt implementation requires binary models from OpenNLP project on SourceForge.
/// </summary>
public class NLPLemmatizerOp
{
    /// <summary>Value the dictionary lemmatizer returns for a word that is not in the dictionary.</summary>
    private const string DictionaryMiss = "O";

    /// <summary>Value the MaxEnt lemmatizer returns when no lemma is found.</summary>
    private const string MaxEntMiss = "_";

    private readonly DictionaryLemmatizer dictionaryLemmatizer;
    private readonly LemmatizerME lemmatizerME;

    /// <summary>
    /// Creates a lemmatizer backed by a dictionary, a MaxEnt model, or both.
    /// </summary>
    /// <param name="dictionary">Stream to read the lemma dictionary from, or <c>null</c> to disable the dictionary-based lemmatizer.</param>
    /// <param name="lemmatizerModel">MaxEnt lemmatizer model, or <c>null</c> to disable the MaxEnt lemmatizer.</param>
    /// <exception cref="ArgumentException">Both <paramref name="dictionary"/> and <paramref name="lemmatizerModel"/> are <c>null</c>.</exception>
    public NLPLemmatizerOp(Stream dictionary, LemmatizerModel lemmatizerModel)
    {
        // Validate with a real exception instead of Debug.Assert: the assert is compiled out
        // of release builds, which would defer the failure to a NullReferenceException
        // on the first call to Lemmatize.
        if (dictionary == null && lemmatizerModel == null)
        {
            throw new ArgumentException("At least one parameter must be non-null");
        }

        dictionaryLemmatizer = dictionary == null ? null : new DictionaryLemmatizer(new ikvm.io.InputStreamWrapper(dictionary));
        lemmatizerME = lemmatizerModel == null ? null : new LemmatizerME(lemmatizerModel);
    }

    /// <summary>
    /// Lemmatizes the given words. Any position for which no lemma is found
    /// receives the original word.
    /// </summary>
    /// <param name="words">The words to lemmatize.</param>
    /// <param name="postags">Part-of-speech tags handed to the underlying lemmatizers together with <paramref name="words"/>.</param>
    /// <returns>
    /// The array produced by the underlying lemmatizer, with unresolved entries
    /// replaced in place by the corresponding element of <paramref name="words"/>.
    /// </returns>
    public virtual string[] Lemmatize(string[] words, string[] postags)
    {
        if (dictionaryLemmatizer == null)
        {
            // there is only a MaxEnt lemmatizer
            string[] maxEntOnly = lemmatizerME.lemmatize(words, postags);
            for (int i = 0; i < maxEntOnly.Length; ++i)
            {
                if (MaxEntMiss.Equals(maxEntOnly[i]))
                {
                    maxEntOnly[i] = words[i]; // put back the original word if no lemma is found
                }
            }
            return maxEntOnly;
        }

        string[] lemmas = dictionaryLemmatizer.lemmatize(words, postags);
        string[] maxEntLemmas = null; // computed lazily, only once a dictionary miss is seen
        for (int i = 0; i < lemmas.Length; ++i)
        {
            if (!lemmas[i].Equals(DictionaryMiss))
            {
                continue; // the dictionary supplied a lemma for this word
            }

            if (lemmatizerME == null)
            {
                // there is no MaxEnt lemmatizer to fall back to
                lemmas[i] = words[i]; // put back the original word if no lemma is found
                continue;
            }

            // fall back to the MaxEnt lemmatizer; it runs at most once per call
            if (maxEntLemmas == null)
            {
                maxEntLemmas = lemmatizerME.lemmatize(words, postags);
            }

            lemmas[i] = MaxEntMiss.Equals(maxEntLemmas[i])
                ? words[i] // put back the original word if no lemma is found
                : maxEntLemmas[i];
        }
        return lemmas;
    }
}
}