/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp.tools;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.lucene.analysis.util.ResourceLoader;
/**
 * Supplies OpenNLP analysis operations: sentence detection, tokenization,
 * part-of-speech tagging, chunking, named entity recognition, and lemmatization.
 * Caches model file objects. Assumes model files are thread-safe.
 */
public class OpenNLPOpsFactory {
private static Map<String,SentenceModel> sentenceModels = new ConcurrentHashMap<>();
private static Map<String,TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
private static Map<String,POSModel> posTaggerModels = new ConcurrentHashMap<>();
private static Map<String,ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
private static Map<String,TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
private static Map<String,LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
private static Map<String,String> lemmaDictionaries = new ConcurrentHashMap<>();
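// Illustrative usage (a sketch; "en-sent.bin" and the loader variable are hypothetical):
//   OpenNLPOpsFactory.getSentenceModel("en-sent.bin", loader);                       // load and cache the model
//   NLPSentenceDetectorOp op = OpenNLPOpsFactory.getSentenceDetector("en-sent.bin"); // wrap the cached model
/**
 * Returns a sentence detector op wrapping the cached model named {@code modelName},
 * or an op constructed without a model when {@code modelName} is null. The model must
 * already have been loaded via {@link #getSentenceModel(String, ResourceLoader)}.
 */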
public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
if (modelName != null) {
SentenceModel model = sentenceModels.get(modelName);
return new NLPSentenceDetectorOp(model);
} else {
return new NLPSentenceDetectorOp();
}
}
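/**
 * Loads the sentence detection model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */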
public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
SentenceModel model = sentenceModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new SentenceModel(resource);
}
sentenceModels.put(modelName, model);
}
return model;
}
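/**
 * Returns a tokenizer op wrapping the cached model named {@code modelName}, or an op
 * constructed without a model when {@code modelName} is null. The model must already
 * have been loaded via {@link #getTokenizerModel(String, ResourceLoader)}.
 */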
public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
if (modelName == null) {
return new NLPTokenizerOp();
} else {
TokenizerModel model = tokenizerModels.get(modelName);
return new NLPTokenizerOp(model);
}
}
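/**
 * Loads the tokenizer model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */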
public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
TokenizerModel model = tokenizerModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new TokenizerModel(resource);
}
tokenizerModels.put(modelName, model);
}
return model;
}
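/**
 * Returns a part-of-speech tagger op wrapping the cached model named {@code modelName}.
 * The model must already have been loaded via {@link #getPOSTaggerModel(String, ResourceLoader)}.
 */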
public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
POSModel model = posTaggerModels.get(modelName);
return new NLPPOSTaggerOp(model);
}
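/**
 * Loads the part-of-speech tagging model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */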
public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
POSModel model = posTaggerModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new POSModel(resource);
}
posTaggerModels.put(modelName, model);
}
return model;
}
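/**
 * Returns a chunker op wrapping the cached model named {@code modelName}.
 * The model must already have been loaded via {@link #getChunkerModel(String, ResourceLoader)}.
 */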
public static NLPChunkerOp getChunker(String modelName) throws IOException {
ChunkerModel model = chunkerModels.get(modelName);
return new NLPChunkerOp(model);
}
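/**
 * Loads the chunker model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */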
public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
ChunkerModel model = chunkerModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new ChunkerModel(resource);
}
chunkerModels.put(modelName, model);
}
return model;
}
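/**
 * Returns a named entity recognition op wrapping the cached model named {@code modelName}.
 * The model must already have been loaded via {@link #getNERTaggerModel(String, ResourceLoader)}.
 */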
public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
TokenNameFinderModel model = nerModels.get(modelName);
return new NLPNERTaggerOp(model);
}
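/**
 * Loads the named entity recognition model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */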
public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
TokenNameFinderModel model = nerModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new TokenNameFinderModel(resource);
}
nerModels.put(modelName, model);
}
return model;
}
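/**
 * Returns a lemmatizer op built from a cached dictionary and/or a cached statistical model.
 * At least one of {@code dictionaryFile} and {@code lemmatizerModelFile} must be non-null,
 * and each non-null resource must already have been cached via
 * {@link #getLemmatizerDictionary(String, ResourceLoader)} or
 * {@link #getLemmatizerModel(String, ResourceLoader)}.
 */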
public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
InputStream dictionaryInputStream = null;
if (dictionaryFile != null) {
String dictionary = lemmaDictionaries.get(dictionaryFile);
dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
}
LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
}
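/**
 * Reads the lemmatization dictionary resource {@code dictionaryFile} as UTF-8 text and
 * caches its contents; subsequent calls return the cached string.
 */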
public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
String dictionary = lemmaDictionaries.get(dictionaryFile);
if (dictionary == null) {
try (Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
StringBuilder builder = new StringBuilder();
char[] chars = new char[8192];
int numRead = 0;
do {
numRead = reader.read(chars, 0, chars.length);
if (numRead > 0) {
builder.append(chars, 0, numRead);
}
} while (numRead > 0);
dictionary = builder.toString();
lemmaDictionaries.put(dictionaryFile, dictionary);
}
}
return dictionary;
}
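/**
 * Loads the lemmatizer model {@code modelName} via the given {@link ResourceLoader}
 * and caches it; subsequent calls return the cached model.
 */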
public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
LemmatizerModel model = lemmatizerModels.get(modelName);
if (model == null) {
try (InputStream resource = loader.openResource(modelName)) {
model = new LemmatizerModel(resource);
}
lemmatizerModels.put(modelName, model);
}
return model;
}
/** Clears all cached models and dictionaries; keeps unit tests from blowing out memory. */
public static void clearModels() {
sentenceModels.clear();
tokenizerModels.clear();
posTaggerModels.clear();
chunkerModels.clear();
nerModels.clear();
lemmatizerModels.clear();
lemmaDictionaries.clear();
}
}