blob: 813ff986b4557d71dd7b1eb3062c7b6318f9aaf5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ner.nltk;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.parser.ner.NERecogniser;
/**
* This class offers an implementation of {@link NERecogniser} based on
* ne_chunk() module of NLTK. This NER requires additional setup,
* due to Http requests to an endpoint server that runs NLTK.
* See <a href="http://wiki.apache.org/tika/TikaAndNLTK">
*/
public class NLTKNERecogniser implements NERecogniser {
/**
* some common entities identified by NLTK
*/
public static final Set<String> ENTITY_TYPES = new HashSet<String>() {
{
add("NAMES");
}
};
private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class);
private static final String NLTK_REST_HOST = "http://localhost:8881";
private static boolean available = false;
private String restHostUrlStr;
public NLTKNERecogniser() {
try {
String restHostUrlStr = "";
try {
restHostUrlStr = readRestUrl();
} catch (IOException e) {
LOG.warn("Can't read rest url", e);
}
if (restHostUrlStr == null || restHostUrlStr.equals("")) {
this.restHostUrlStr = NLTK_REST_HOST;
} else {
this.restHostUrlStr = restHostUrlStr;
}
Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get();
int responseCode = response.getStatus();
if (responseCode == 200) {
available = true;
} else {
LOG.info("NLTKRest Server is not running");
}
} catch (Exception e) {
LOG.warn(e.getMessage(), e);
}
}
private static String readRestUrl() throws IOException {
Properties nltkProperties = new Properties();
nltkProperties.load(NLTKNERecogniser.class.getResourceAsStream("NLTKServer.properties"));
return nltkProperties.getProperty("nltk.server.url");
}
/**
* @return {@code true} if server endpoint is available.
* returns {@code false} if server endpoint is not avaliable for service.
*/
public boolean isAvailable() {
return available;
}
/**
* Gets set of entity types recognised by this recogniser
*
* @return set of entity classes/types
*/
public Set<String> getEntityTypes() {
return ENTITY_TYPES;
}
/**
* recognises names of entities in the text
*
* @param text text which possibly contains names
* @return map of entity type -&gt; set of names
*/
public Map<String, Set<String>> recognise(String text) {
Map<String, Set<String>> entities = new HashMap<>();
try {
String url = restHostUrlStr + "/nltk";
Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text);
int responseCode = response.getStatus();
if (responseCode == 200) {
String result = response.readEntity(String.class);
JSONParser parser = new JSONParser();
JSONObject j = (JSONObject) parser.parse(result);
for (Object o : j.keySet()) {
String key = (String) o;
if (!key.equals("result")) {
ENTITY_TYPES.add(key);
entities.put(key.toUpperCase(Locale.ENGLISH),
new HashSet((Collection) j.get(key)));
}
}
}
} catch (Exception e) {
LOG.debug(e.getMessage(), e);
}
return entities;
}
}