blob: 8855fda037d58f9102a2c4da3651e110e018b29e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ner.corenlp;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import com.github.openjson.JSONException;
import com.github.openjson.JSONObject;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.parser.ner.NERecogniser;
/**
* This class offers an implementation of {@link NERecogniser} based on
* CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
* due to runtime binding to Stanford CoreNLP.
* See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
* Tika NER Wiki</a> for configuring this recogniser.
*
* @see NERecogniser
*/
public class CoreNLPNERecogniser implements NERecogniser {
//default model paths
public static final String NER_3CLASS_MODEL =
"edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
public static final String NER_4CLASS_MODEL =
"edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
public static final String NER_7CLASS_MODEL =
"edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
/**
* default Model path
*/
public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
public static final String MODEL_PROP_NAME = "ner.corenlp.model";
public static final Set<String> ENTITY_TYPES = new HashSet<String>() {
{
add(PERSON);
add(TIME);
add(LOCATION);
add(ORGANIZATION);
add(MONEY);
add(PERCENT);
add(DATE);
}
};
private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);
private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";
private boolean available = false;
private Field firstField;
private Field secondField;
private Field thirdField;
private Object classifierInstance;
private Method classifyMethod;
public CoreNLPNERecogniser() {
this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
}
/**
* Creates a NERecogniser by loading model from given path
*
* @param modelPath path to NER model file
*/
public CoreNLPNERecogniser(String modelPath) {
try {
Properties props = new Properties();
Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
Method loadMethod =
classifierClass.getMethod("getClassifier", String.class, Properties.class);
classifierInstance = loadMethod.invoke(classifierClass, modelPath, props);
classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class);
//these fields are for accessing result
Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
this.firstField = tripleClass.getField("first");
this.secondField = tripleClass.getField("second");
this.thirdField = tripleClass.getField("third");
this.available = true;
} catch (Exception e) {
LOG.warn("{} while trying to load the model from {}", e.getMessage(), modelPath);
}
LOG.info("Available for service ? {}", available);
}
public static void main(String[] args) throws IOException, JSONException {
if (args.length != 1) {
System.err.println("Error: Invalid Args");
System.err.println("This tool finds names inside text");
System.err.println("Usage: <path/to/text/file>");
return;
}
try (FileInputStream stream = new FileInputStream(args[0])) {
String text = IOUtils.toString(stream, StandardCharsets.UTF_8);
CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
Map<String, Set<String>> names = ner.recognise(text);
JSONObject jNames = new JSONObject(names);
System.out.println(jNames.toString(2));
}
}
/**
* @return {@code true} if model was available, valid and was able to initialise the classifier.
* returns {@code false} when this recogniser is not available for service.
*/
public boolean isAvailable() {
return available;
}
/**
* Gets set of entity types recognised by this recogniser
*
* @return set of entity classes/types
*/
public Set<String> getEntityTypes() {
return ENTITY_TYPES;
}
/**
* recognises names of entities in the text
*
* @param text text which possibly contains names
* @return map of entity type -&gt; set of names
*/
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> names = new HashMap<>();
try {
Object result = classifyMethod.invoke(classifierInstance, text);
List entries = (List) result;
for (Object entry : entries) {
String entityType = (String) firstField.get(entry);
if (!names.containsKey(entityType)) {
names.put(entityType, new HashSet<>());
}
Integer start = (Integer) secondField.get(entry);
Integer end = (Integer) thirdField.get(entry);
String name = text.substring(start, end);
//Clean repeating spaces, replace line breaks and tabs with single space
name = name.trim().replaceAll("(\\s\\s+)|\n|\t", " ");
if (!name.isEmpty()) {
names.get(entityType).add(name);
}
}
} catch (Exception e) {
LOG.warn(e.getMessage(), e);
}
return names;
}
}