| // Lucene version compatibility level 8.2.0 |
| using Lucene.Net.Analysis.Util; |
| using opennlp.tools.chunker; |
| using opennlp.tools.lemmatizer; |
| using opennlp.tools.namefind; |
| using opennlp.tools.postag; |
| using opennlp.tools.sentdetect; |
| using opennlp.tools.tokenize; |
| using System.Collections.Concurrent; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using System.Text; |
| |
| namespace Lucene.Net.Analysis.OpenNlp.Tools |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Supplies OpenNLP operation wrappers (sentence detector, tokenizer, POS tagger,
/// chunker, NER tagger and lemmatizer) and caches the model objects and lemmatizer
/// dictionaries they are built from, keyed by resource name.
/// Assumes model files are thread-safe.
/// </summary>
/// <remarks>
/// The <c>Get*Model</c> / <see cref="GetLemmatizerDictionary"/> methods load a resource through an
/// <see cref="IResourceLoader"/> and store it in the cache; the other <c>Get*</c> methods only read
/// the cache. Loading is a check-then-store sequence, so two concurrent callers asking for the same
/// uncached name may each load it; the last store wins.
/// </remarks>
public static class OpenNLPOpsFactory // LUCENENET: Made static because all members are static
{
    private static readonly IDictionary<string, SentenceModel> sentenceModels = new ConcurrentDictionary<string, SentenceModel>();
    private static readonly ConcurrentDictionary<string, TokenizerModel> tokenizerModels = new ConcurrentDictionary<string, TokenizerModel>();
    private static readonly ConcurrentDictionary<string, POSModel> posTaggerModels = new ConcurrentDictionary<string, POSModel>();
    private static readonly ConcurrentDictionary<string, ChunkerModel> chunkerModels = new ConcurrentDictionary<string, ChunkerModel>();
    private static readonly IDictionary<string, TokenNameFinderModel> nerModels = new ConcurrentDictionary<string, TokenNameFinderModel>();
    private static readonly IDictionary<string, LemmatizerModel> lemmatizerModels = new ConcurrentDictionary<string, LemmatizerModel>();
    private static readonly IDictionary<string, string> lemmaDictionaries = new ConcurrentDictionary<string, string>();

    /// <summary>
    /// Creates a sentence detector op for the cached model named <paramref name="modelName"/>.
    /// </summary>
    /// <param name="modelName">Cache key of the sentence model, or <c>null</c> to create the op without a model.</param>
    /// <returns>
    /// A new <see cref="NLPSentenceDetectorOp"/>. If <paramref name="modelName"/> is non-null but not cached,
    /// the op is constructed with a <c>null</c> model.
    /// </returns>
    public static NLPSentenceDetectorOp GetSentenceDetector(string modelName)
    {
        if (modelName != null)
        {
            sentenceModels.TryGetValue(modelName, out SentenceModel model);
            return new NLPSentenceDetectorOp(model);
        }
        else
        {
            return new NLPSentenceDetectorOp();
        }
    }

    /// <summary>
    /// Returns the cached sentence model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="SentenceModel"/>.</returns>
    public static SentenceModel GetSentenceModel(string modelName, IResourceLoader loader)
    {
        if (!sentenceModels.TryGetValue(modelName, out SentenceModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new SentenceModel(new ikvm.io.InputStreamWrapper(resource));
            }
            sentenceModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Creates a tokenizer op for the cached model named <paramref name="modelName"/>.
    /// </summary>
    /// <param name="modelName">Cache key of the tokenizer model, or <c>null</c> to create the op without a model.</param>
    /// <returns>
    /// A new <see cref="NLPTokenizerOp"/>. If <paramref name="modelName"/> is non-null but not cached,
    /// the op is constructed with a <c>null</c> model.
    /// </returns>
    public static NLPTokenizerOp GetTokenizer(string modelName)
    {
        if (modelName == null)
        {
            return new NLPTokenizerOp();
        }
        else
        {
            // Use TryGetValue rather than the indexer so that an unloaded model yields a null
            // model (as in the other Get* op methods) instead of a KeyNotFoundException.
            tokenizerModels.TryGetValue(modelName, out TokenizerModel model);
            return new NLPTokenizerOp(model);
        }
    }

    /// <summary>
    /// Returns the cached tokenizer model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="TokenizerModel"/>.</returns>
    public static TokenizerModel GetTokenizerModel(string modelName, IResourceLoader loader)
    {
        if (!tokenizerModels.TryGetValue(modelName, out TokenizerModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new TokenizerModel(new ikvm.io.InputStreamWrapper(resource));
            }
            tokenizerModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Creates a part-of-speech tagger op for the cached model named <paramref name="modelName"/>.
    /// </summary>
    /// <param name="modelName">Cache key of the POS model.</param>
    /// <returns>A new <see cref="NLPPOSTaggerOp"/>; constructed with a <c>null</c> model if the name is not cached.</returns>
    public static NLPPOSTaggerOp GetPOSTagger(string modelName)
    {
        posTaggerModels.TryGetValue(modelName, out POSModel model);
        return new NLPPOSTaggerOp(model);
    }

    /// <summary>
    /// Returns the cached POS model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="POSModel"/>.</returns>
    public static POSModel GetPOSTaggerModel(string modelName, IResourceLoader loader)
    {
        if (!posTaggerModels.TryGetValue(modelName, out POSModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new POSModel(new ikvm.io.InputStreamWrapper(resource));
            }
            posTaggerModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Creates a chunker op for the cached model named <paramref name="modelName"/>.
    /// </summary>
    /// <param name="modelName">Cache key of the chunker model.</param>
    /// <returns>A new <see cref="NLPChunkerOp"/>; constructed with a <c>null</c> model if the name is not cached.</returns>
    public static NLPChunkerOp GetChunker(string modelName)
    {
        chunkerModels.TryGetValue(modelName, out ChunkerModel model);
        return new NLPChunkerOp(model);
    }

    /// <summary>
    /// Returns the cached chunker model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="ChunkerModel"/>.</returns>
    public static ChunkerModel GetChunkerModel(string modelName, IResourceLoader loader)
    {
        if (!chunkerModels.TryGetValue(modelName, out ChunkerModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new ChunkerModel(new ikvm.io.InputStreamWrapper(resource));
            }
            chunkerModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Creates a named-entity tagger op for the cached model named <paramref name="modelName"/>.
    /// </summary>
    /// <param name="modelName">Cache key of the name-finder model.</param>
    /// <returns>A new <see cref="NLPNERTaggerOp"/>; constructed with a <c>null</c> model if the name is not cached.</returns>
    public static NLPNERTaggerOp GetNERTagger(string modelName)
    {
        nerModels.TryGetValue(modelName, out TokenNameFinderModel model);
        return new NLPNERTaggerOp(model);
    }

    /// <summary>
    /// Returns the cached name-finder model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="TokenNameFinderModel"/>.</returns>
    public static TokenNameFinderModel GetNERTaggerModel(string modelName, IResourceLoader loader)
    {
        if (!nerModels.TryGetValue(modelName, out TokenNameFinderModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new TokenNameFinderModel(new ikvm.io.InputStreamWrapper(resource));
            }
            nerModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Creates a lemmatizer op from a cached dictionary and/or a cached lemmatizer model.
    /// </summary>
    /// <param name="dictionaryFile">Cache key of the lemmatizer dictionary text, or <c>null</c> for no dictionary.</param>
    /// <param name="lemmatizerModelFile">Cache key of the lemmatizer model, or <c>null</c> for no model.</param>
    /// <returns>
    /// A new <see cref="NLPLemmatizerOp"/> built from a UTF-8 stream over the cached dictionary text
    /// (or <c>null</c>) and the cached model (or <c>null</c>).
    /// </returns>
    /// <exception cref="KeyNotFoundException">
    /// A non-null <paramref name="dictionaryFile"/> or <paramref name="lemmatizerModelFile"/> has not been cached.
    /// </exception>
    public static NLPLemmatizerOp GetLemmatizer(string dictionaryFile, string lemmatizerModelFile)
    {
        Debug.Assert(dictionaryFile != null || lemmatizerModelFile != null, "At least one parameter must be non-null");
        Stream dictionaryInputStream = null;
        if (dictionaryFile != null)
        {
            // Indexer lookup on purpose: a missing dictionary cannot be turned into a stream, so fail fast.
            string dictionary = lemmaDictionaries[dictionaryFile];
            // The stream is handed to NLPLemmatizerOp and is not disposed here.
            dictionaryInputStream = new MemoryStream(Encoding.UTF8.GetBytes(dictionary));
        }
        LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels[lemmatizerModelFile];
        return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
    }

    /// <summary>
    /// Returns the cached lemmatizer dictionary text for <paramref name="dictionaryFile"/>,
    /// reading it as UTF-8 and caching it first if necessary.
    /// </summary>
    /// <param name="dictionaryFile">Resource name of the dictionary; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the dictionary resource when it is not cached.</param>
    /// <returns>The full text of the dictionary resource.</returns>
    public static string GetLemmatizerDictionary(string dictionaryFile, IResourceLoader loader)
    {
        if (!lemmaDictionaries.TryGetValue(dictionaryFile, out string dictionary) || dictionary == null)
        {
            // Disposing the reader also disposes the underlying resource stream.
            using (TextReader reader = new StreamReader(loader.OpenResource(dictionaryFile), Encoding.UTF8))
            {
                dictionary = reader.ReadToEnd();
            }
            lemmaDictionaries[dictionaryFile] = dictionary;
        }
        return dictionary;
    }

    /// <summary>
    /// Returns the cached lemmatizer model for <paramref name="modelName"/>, loading and caching it first if necessary.
    /// </summary>
    /// <param name="modelName">Resource name of the model; also used as the cache key.</param>
    /// <param name="loader">Loader used to open the model resource when it is not cached.</param>
    /// <returns>The cached or newly loaded <see cref="LemmatizerModel"/>.</returns>
    public static LemmatizerModel GetLemmatizerModel(string modelName, IResourceLoader loader)
    {
        if (!lemmatizerModels.TryGetValue(modelName, out LemmatizerModel model) || model == null)
        {
            using (Stream resource = loader.OpenResource(modelName))
            {
                model = new LemmatizerModel(new ikvm.io.InputStreamWrapper(resource));
            }
            lemmatizerModels[modelName] = model;
        }
        return model;
    }

    /// <summary>
    /// Empties every cache held by this class (all model caches and the lemmatizer dictionary cache).
    /// </summary>
    // keeps unit test from blowing out memory
    public static void ClearModels()
    {
        sentenceModels.Clear();
        tokenizerModels.Clear();
        posTaggerModels.Clear();
        chunkerModels.Clear();
        nerModels.Clear();
        lemmatizerModels.Clear(); // previously omitted, so lemmatizer models were never released
        lemmaDictionaries.Clear();
    }
}
| } |