/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ner.mitie;


import java.io.File;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.parser.ner.NERecogniser;
| |
| /** |
| * This class offers an implementation of {@link NERecogniser} based on |
| * trained models using state-of-the-art information extraction tools. \This NER requires |
| * additional setup, |
| * due to runtime binding to MIT Information Extraction. |
| * See <a href="http://wiki.apache.org/tika/TikaAndMITIE"> |
| * Tika MITIE Wiki</a> for configuring this recogniser. |
| * |
| * @see NERecogniser |
| */ |
| public class MITIENERecogniser implements NERecogniser { |
| public static final String MODEL_PROP_NAME = "ner.mitie.model"; |
| public static final Set<String> ENTITY_TYPES = new HashSet<String>() { |
| { |
| add(PERSON); |
| add(LOCATION); |
| add(ORGANIZATION); |
| add("MISC"); |
| } |
| }; |
| private static final Logger LOG = LoggerFactory.getLogger(MITIENERecogniser.class); |
| private static final String NamedEntityExtractor_Class = |
| "edu.mit.ll.mitie.NamedEntityExtractor"; |
| private boolean available = false; |
| private Object extractorInstance; |
| |
| public MITIENERecogniser() { |
| this(System.getProperty(MODEL_PROP_NAME)); |
| } |
| |
| /** |
| * Creates a NERecogniser by loading model from given path |
| * |
| * @param modelPath path to NER model file |
| */ |
| public MITIENERecogniser(String modelPath) { |
| try { |
| if (!(new File(modelPath)).exists()) { |
| LOG.warn("{} does not exist", modelPath); |
| } else { |
| Class<?> namedEntityExtractorClass = Class.forName(NamedEntityExtractor_Class); |
| extractorInstance = |
| namedEntityExtractorClass.getDeclaredConstructor(new Class[]{String.class}) |
| .newInstance(modelPath); |
| this.available = true; |
| } |
| } catch (Exception e) { |
| LOG.warn("{} while trying to load the model from {}", e.getMessage(), modelPath); |
| } |
| LOG.info("Available for service ? {}", available); |
| } |
| |
| /** |
| * @return {@code true} if model was available, valid and was able to initialise the classifier. |
| * returns {@code false} when this recogniser is not available for service. |
| */ |
| public boolean isAvailable() { |
| return available; |
| } |
| |
| /** |
| * Gets set of entity types recognised by this recogniser |
| * |
| * @return set of entity classes/types |
| */ |
| public Set<String> getEntityTypes() { |
| return ENTITY_TYPES; |
| } |
| |
| /** |
| * recognises names of entities in the text |
| * |
| * @param text text which possibly contains names |
| * @return map of entity type -> set of names |
| */ |
| public Map<String, Set<String>> recognise(String text) { |
| Map<String, Set<String>> names = new HashMap<>(); |
| |
| try { |
| |
| Class<?> stringVectorClass = Class.forName("edu.mit.ll.mitie.StringVector"); |
| Class<?> entityMentionVectorClass = |
| Class.forName("edu.mit.ll.mitie.EntityMentionVector"); |
| Class<?> entityMentionClass = Class.forName("edu.mit.ll.mitie.EntityMention"); |
| Object entityMentionObject = null; |
| Class<?> globalClass = Class.forName("edu.mit.ll.mitie.global"); |
| Object stringVectorObject = extractorInstance.getClass().getMethod("getPossibleNerTags") |
| .invoke(extractorInstance); |
| long size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject); |
| ArrayList<String> possibleTags = new ArrayList<>(); |
| for (long i = 0; i < size; i++) { |
| String t = (String) stringVectorClass.getMethod("get", Integer.TYPE) |
| .invoke(stringVectorObject, (int) i); |
| possibleTags.add(t); |
| } |
| Method tokenize = globalClass.getMethod("tokenize", String.class); |
| stringVectorObject = tokenize.invoke(globalClass, text); |
| |
| ArrayList<String> stringVector = new ArrayList<>(); |
| size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject); |
| for (long i = 0; i < size; i++) { |
| String t = (String) stringVectorClass.getMethod("get", Integer.TYPE) |
| .invoke(stringVectorObject, (int) i); |
| stringVector.add(t); |
| } |
| Method extractEntities = |
| extractorInstance.getClass().getMethod("extractEntities", stringVectorClass); |
| Object entities = extractEntities.invoke(extractorInstance, stringVectorObject); |
| size = (Long) entityMentionVectorClass.getMethod("size").invoke(entities); |
| for (long i = 0; i < size; i++) { |
| entityMentionObject = entityMentionVectorClass.getMethod("get", Integer.TYPE) |
| .invoke(entities, (int) i); |
| int tag_index = (Integer) entityMentionClass.getMethod("getTag") |
| .invoke(entityMentionObject); |
| String tag = possibleTags.get(tag_index); |
| Set<String> x = new HashSet<>(); |
| if (names.containsKey(tag)) { |
| x = names.get(tag); |
| } else { |
| names.put(tag, x); |
| } |
| int start = (Integer) entityMentionClass.getMethod("getStart") |
| .invoke(entityMentionObject); |
| int end = (Integer) entityMentionClass.getMethod("getEnd") |
| .invoke(entityMentionObject); |
| StringBuilder match = new StringBuilder(); |
| while (start < end) { |
| match.append(stringVector.get(start)).append(" "); |
| start++; |
| } |
| x.add(match.toString().trim()); |
| } |
| |
| } catch (Exception e) { |
| |
| LOG.debug(e.getMessage(), e); |
| } |
| return names; |
| } |
| |
| } |