| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.tika.parser.geo; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.MalformedURLException; |
| import java.net.URISyntaxException; |
| import java.net.URL; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import opennlp.tools.namefind.NameFinderME; |
| import opennlp.tools.namefind.TokenNameFinderModel; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.config.Field; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.AbstractParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.geo.gazetteer.GeoGazetteerClient; |
| import org.apache.tika.parser.geo.gazetteer.Location; |
| |
| public class GeoParser extends AbstractParser { |
| private static final long serialVersionUID = -2241391757440215491L; |
| private static final Logger LOG = LoggerFactory.getLogger(GeoParser.class); |
| private static final MediaType MEDIA_TYPE = MediaType.application("geotopic"); |
| private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); |
| |
| private GeoParserConfig defaultConfig = new GeoParserConfig(); |
| private GeoGazetteerClient gazetteerClient; |
| |
| private boolean initialized; |
| private URL modelUrl; |
| private NameFinderME nameFinder; |
| private boolean available; |
| |
| @Override |
| public Set<MediaType> getSupportedTypes(ParseContext parseContext) { |
| return SUPPORTED_TYPES; |
| } |
| |
| /** |
| * Initializes this parser |
| * |
| * @param geoParserConfig config to load the url model from and set the gazetteer client |
| */ |
| public void initialize(GeoParserConfig geoParserConfig) { |
| try { |
| if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) { |
| return; |
| } |
| } catch (URISyntaxException e1) { |
| throw new RuntimeException(e1.getMessage()); |
| } |
| |
| this.modelUrl = geoParserConfig.getNerModelUrl(); |
| gazetteerClient = new GeoGazetteerClient(geoParserConfig); |
| |
| // Check if the NER model is available, and if the |
| // lucene-geo-gazetteer is available |
| this.available = modelUrl != null && gazetteerClient.checkAvail(); |
| |
| if (this.available) { |
| try { |
| TokenNameFinderModel model = new TokenNameFinderModel(modelUrl); |
| this.nameFinder = new NameFinderME(model); |
| } catch (Exception e) { |
| LOG.warn("Named Entity Extractor setup failed: {}", e.getMessage(), e); |
| this.available = false; |
| } |
| } |
| initialized = true; |
| } |
| |
| @Override |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| |
| /*----------------configure this parser by ParseContext Object---------------------*/ |
| |
| GeoParserConfig geoParserConfig = context.get(GeoParserConfig.class, defaultConfig); |
| initialize(geoParserConfig); |
| if (!isAvailable(geoParserConfig)) { |
| return; |
| } |
| NameEntityExtractor extractor = null; |
| |
| try { |
| extractor = new NameEntityExtractor(nameFinder); |
| } catch (Exception e) { |
| LOG.warn("Named Entity Extractor setup failed: {}", e.getMessage(), e); |
| return; |
| } |
| |
| /*----------------get locationNameEntities and best nameEntity for the |
| input stream---------------------*/ |
| extractor.getAllNameEntitiesfromInput(stream); |
| extractor.getBestNameEntity(); |
| ArrayList<String> locationNameEntities = extractor.locationNameEntities; |
| String bestner = extractor.bestNameEntity; |
| |
| /*------------------------resolve geonames for each ner, |
| store results in a hashmap---------------------*/ |
| Map<String, List<Location>> resolvedGeonames = searchGeoNames(locationNameEntities); |
| |
| /*----------------store locationNameEntities and their geonames in a |
| geotag, each input has one geotag---------------------*/ |
| GeoTag geotag = new GeoTag(); |
| geotag.toGeoTag(resolvedGeonames, bestner); |
| |
| /* add resolved entities in metadata */ |
| |
| metadata.add("Geographic_NAME", geotag.location.getName()); |
| metadata.add("Geographic_LONGITUDE", geotag.location.getLongitude()); |
| metadata.add("Geographic_LATITUDE", geotag.location.getLatitude()); |
| for (int i = 0; i < geotag.alternatives.size(); ++i) { |
| GeoTag alter = (GeoTag) geotag.alternatives.get(i); |
| metadata.add("Optional_NAME" + (i + 1), alter.location.getName()); |
| metadata.add("Optional_LONGITUDE" + (i + 1), alter.location.getLongitude()); |
| metadata.add("Optional_LATITUDE" + (i + 1), alter.location.getLatitude()); |
| } |
| } |
| |
| public Map<String, List<Location>> searchGeoNames(ArrayList<String> locationNameEntities) { |
| return gazetteerClient.getLocations(locationNameEntities); |
| } |
| |
| public boolean isAvailable(GeoParserConfig geoParserConfig) { |
| if (!initialized) { |
| initialize(geoParserConfig); |
| } |
| return this.available; |
| } |
| |
| public String getGazetteerRestEndpoint() { |
| return defaultConfig.getGazetteerRestEndpoint(); |
| } |
| |
| @Field |
| public void setGazetteerRestEndpoint(String gazetteerRestEndpoint) { |
| defaultConfig.setGazetteerRestEndpoint(gazetteerRestEndpoint); |
| } |
| |
| public URL getNerModelUrl() { |
| return defaultConfig.getNerModelUrl(); |
| } |
| |
| /** |
| * @param nerModelUrl url for the NER model |
| * @throws IllegalArgumentException for a malformed URL |
| */ |
| @Field |
| public void setNerModelUrl(String nerModelUrl) { |
| try { |
| defaultConfig.setNerModelUrl(new URL(nerModelUrl)); |
| } catch (MalformedURLException e) { |
| throw new IllegalArgumentException("malformed url " + nerModelUrl, e); |
| } |
| } |
| } |