| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitycoreference.impl; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Properties; |
| |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.stanbol.enhancer.engines.entitycoreference.Constants; |
| import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase; |
| import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival; |
| import org.apache.stanbol.enhancer.nlp.model.Span; |
| import org.osgi.service.cm.ConfigurationException; |
| |
| /** |
| * Contains information about several terms and properties of words we use in the {@link CoreferenceFinder}. |
| * |
| * @author Cristian Petroaca |
| * |
| */ |
| class Dictionaries { |
| private static final String PLACE_ADJECTIVALS_CONFIG = "config.properties"; |
| private static final String ENTITY_BASE_URI = "entity.uri.base"; |
| |
| /** |
| * Contains the list of place adjectivals in the form: language -> adjectival -> UriRef -> adjectival -> |
| * UriRef There are Places that have multiple adjectivals so in this map there are adjectivals that point |
| * to the same UriRef but that ensures a fast lookup. |
| */ |
| private Map<String,Map<String,UriRef>> placeAdjectivalsMap; |
| |
| public Dictionaries(String[] languages) throws ConfigurationException { |
| Properties props = new Properties(); |
| InputStream in = null; |
| |
| try { |
| in = Dictionaries.class.getResourceAsStream(Constants.PLACE_ADJECTIVALS_FOLDER + "/" |
| + PLACE_ADJECTIVALS_CONFIG); |
| props.load(in); |
| } catch (IOException e) { |
| throw new ConfigurationException("", "Could not read " + PLACE_ADJECTIVALS_CONFIG); |
| } finally { |
| if (in != null) { |
| try { |
| in.close(); |
| } catch (IOException e) {} |
| } |
| } |
| |
| String entityBaseUri = props.getProperty(ENTITY_BASE_URI); |
| if (entityBaseUri == null || entityBaseUri.equals("")) { |
| throw new ConfigurationException(ENTITY_BASE_URI, "Missing property in " |
| + PLACE_ADJECTIVALS_CONFIG); |
| } |
| |
| placeAdjectivalsMap = new HashMap<>(); |
| |
| for (String language : languages) { |
| String line = null; |
| Map<String,UriRef> languagePlaceAdjMap = new HashMap<>(); |
| InputStream langIn = null; |
| BufferedReader reader = null; |
| |
| try { |
| langIn = Dictionaries.class.getResourceAsStream(Constants.PLACE_ADJECTIVALS_FOLDER + "/" |
| + language); |
| reader = new BufferedReader(new InputStreamReader(langIn)); |
| |
| while ((line = reader.readLine()) != null) { |
| String[] splittedLine = line.split("\t"); |
| String place = splittedLine[0]; |
| String adjectivals = splittedLine[1]; |
| UriRef ref = new UriRef(entityBaseUri + place.trim()); |
| String[] adjectivalsArray = adjectivals.split(","); |
| |
| for (String adjectival : adjectivalsArray) { |
| languagePlaceAdjMap.put(adjectival.trim().toLowerCase(), ref); |
| } |
| } |
| |
| placeAdjectivalsMap.put(language, languagePlaceAdjMap); |
| } catch (IOException ioe) { |
| throw new ConfigurationException("", "Could not read " + Constants.PLACE_ADJECTIVALS_FOLDER |
| + "/" + language, ioe); |
| } finally { |
| if (langIn != null) { |
| try { |
| langIn.close(); |
| } catch (IOException e) {} |
| } |
| |
| if (reader != null) { |
| try { |
| reader.close(); |
| } catch (IOException e) {} |
| } |
| } |
| } |
| } |
| |
| /** |
| * Checks whether a {@link NounPhrase} contains a place adjectival and returns it. |
| * |
| * @param language |
| * @param nounPhrase |
| * @return the {@link PlaceAdjectival} if the {@link NounPhrase} contains one or null if not. |
| */ |
| public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) { |
| List<Span> tokens = nounPhrase.getTokens(); |
| Map<String,UriRef> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language); |
| /* |
| * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map. 2-grams |
| * should be good enough since there are no 3-gram places at least from what I saw. |
| */ |
| for (int i = 0; i < tokens.size(); i++) { |
| Span currentToken = tokens.get(i); |
| String currentTokenString = currentToken.getSpan().toLowerCase(); |
| // First the current 1-gram |
| if (langPlaceAdjectivalsMap.containsKey(currentTokenString)) { |
| return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), |
| langPlaceAdjectivalsMap.get(currentTokenString)); |
| } |
| |
| // Then use the 2-gram with the token before it |
| StringBuilder concatTokens = new StringBuilder(); |
| String concatTokensString = null; |
| |
| if (i > 0) { |
| Span previousToken = tokens.get(i - 1); |
| String previousTokenString = previousToken.getSpan().toLowerCase(); |
| concatTokens = new StringBuilder(); |
| concatTokens.append(previousTokenString); |
| concatTokens.append(" "); |
| concatTokens.append(currentTokenString); |
| concatTokensString = concatTokens.toString(); |
| |
| if (langPlaceAdjectivalsMap.containsKey(concatTokensString.toLowerCase())) { |
| return new PlaceAdjectival(previousToken.getStart(), currentToken.getEnd(), |
| langPlaceAdjectivalsMap.get(concatTokensString)); |
| } |
| } |
| |
| // Now use the 2-gram with the token after it |
| if (i < tokens.size() - 1) { |
| Span nextToken = tokens.get(i + 1); |
| String nextTokenString = nextToken.getSpan().toLowerCase(); |
| concatTokens = new StringBuilder(); |
| concatTokens.append(currentTokenString); |
| concatTokens.append(" "); |
| concatTokens.append(nextTokenString); |
| |
| concatTokensString = concatTokens.toString(); |
| |
| if (langPlaceAdjectivalsMap.containsKey(concatTokens.toString())) { |
| return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), |
| langPlaceAdjectivalsMap.get(concatTokensString)); |
| } |
| } |
| } |
| |
| return null; |
| } |
| } |