blob: 86864764202d4e51a611622e59195cd737d2c77d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ner.mitie;
import java.io.File;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.parser.ner.NERecogniser;
/**
* This class offers an implementation of {@link NERecogniser} based on
* trained models using state-of-the-art information extraction tools. \This NER requires
* additional setup,
* due to runtime binding to MIT Information Extraction.
* See <a href="http://wiki.apache.org/tika/TikaAndMITIE">
* Tika MITIE Wiki</a> for configuring this recogniser.
*
* @see NERecogniser
*/
public class MITIENERecogniser implements NERecogniser {
public static final String MODEL_PROP_NAME = "ner.mitie.model";
public static final Set<String> ENTITY_TYPES = new HashSet<String>() {
{
add(PERSON);
add(LOCATION);
add(ORGANIZATION);
add("MISC");
}
};
private static final Logger LOG = LoggerFactory.getLogger(MITIENERecogniser.class);
private static final String NamedEntityExtractor_Class =
"edu.mit.ll.mitie.NamedEntityExtractor";
private boolean available = false;
private Object extractorInstance;
public MITIENERecogniser() {
this(System.getProperty(MODEL_PROP_NAME));
}
/**
* Creates a NERecogniser by loading model from given path
*
* @param modelPath path to NER model file
*/
public MITIENERecogniser(String modelPath) {
try {
if (!(new File(modelPath)).exists()) {
LOG.warn("{} does not exist", modelPath);
} else {
Class<?> namedEntityExtractorClass = Class.forName(NamedEntityExtractor_Class);
extractorInstance =
namedEntityExtractorClass.getDeclaredConstructor(new Class[]{String.class})
.newInstance(modelPath);
this.available = true;
}
} catch (Exception e) {
LOG.warn("{} while trying to load the model from {}", e.getMessage(), modelPath);
}
LOG.info("Available for service ? {}", available);
}
/**
* @return {@code true} if model was available, valid and was able to initialise the classifier.
* returns {@code false} when this recogniser is not available for service.
*/
public boolean isAvailable() {
return available;
}
/**
* Gets set of entity types recognised by this recogniser
*
* @return set of entity classes/types
*/
public Set<String> getEntityTypes() {
return ENTITY_TYPES;
}
/**
* recognises names of entities in the text
*
* @param text text which possibly contains names
* @return map of entity type -&gt; set of names
*/
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> names = new HashMap<>();
try {
Class<?> stringVectorClass = Class.forName("edu.mit.ll.mitie.StringVector");
Class<?> entityMentionVectorClass =
Class.forName("edu.mit.ll.mitie.EntityMentionVector");
Class<?> entityMentionClass = Class.forName("edu.mit.ll.mitie.EntityMention");
Object entityMentionObject = null;
Class<?> globalClass = Class.forName("edu.mit.ll.mitie.global");
Object stringVectorObject = extractorInstance.getClass().getMethod("getPossibleNerTags")
.invoke(extractorInstance);
long size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject);
ArrayList<String> possibleTags = new ArrayList<>();
for (long i = 0; i < size; i++) {
String t = (String) stringVectorClass.getMethod("get", Integer.TYPE)
.invoke(stringVectorObject, (int) i);
possibleTags.add(t);
}
Method tokenize = globalClass.getMethod("tokenize", String.class);
stringVectorObject = tokenize.invoke(globalClass, text);
ArrayList<String> stringVector = new ArrayList<>();
size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject);
for (long i = 0; i < size; i++) {
String t = (String) stringVectorClass.getMethod("get", Integer.TYPE)
.invoke(stringVectorObject, (int) i);
stringVector.add(t);
}
Method extractEntities =
extractorInstance.getClass().getMethod("extractEntities", stringVectorClass);
Object entities = extractEntities.invoke(extractorInstance, stringVectorObject);
size = (Long) entityMentionVectorClass.getMethod("size").invoke(entities);
for (long i = 0; i < size; i++) {
entityMentionObject = entityMentionVectorClass.getMethod("get", Integer.TYPE)
.invoke(entities, (int) i);
int tag_index = (Integer) entityMentionClass.getMethod("getTag")
.invoke(entityMentionObject);
String tag = possibleTags.get(tag_index);
Set<String> x = new HashSet<>();
if (names.containsKey(tag)) {
x = names.get(tag);
} else {
names.put(tag, x);
}
int start = (Integer) entityMentionClass.getMethod("getStart")
.invoke(entityMentionObject);
int end = (Integer) entityMentionClass.getMethod("getEnd")
.invoke(entityMentionObject);
StringBuilder match = new StringBuilder();
while (start < end) {
match.append(stringVector.get(start)).append(" ");
start++;
}
x.add(match.toString().trim());
}
} catch (Exception e) {
LOG.debug(e.getMessage(), e);
}
return names;
}
}