blob: 20021ce31d8d7145e530cb33335c75ef127c368d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.osgi.service.cm.ConfigurationException;
/**
* Contains information about several terms and properties of words we use in the {@link CoreferenceFinder}.
*
* @author Cristian Petroaca
*
*/
class Dictionaries {
    private static final String PLACE_ADJECTIVALS_CONFIG = "config.properties";
    private static final String ENTITY_BASE_URI = "entity.uri.base";

    /**
     * Place adjectivals in the form: language -> (lower-cased adjectival -> UriRef of the place).
     * A place that has several adjectivals gets one entry per adjectival, all pointing to the same
     * UriRef; the duplication allows a direct lookup by adjectival.
     */
    private final Map<String,Map<String,UriRef>> placeAdjectivalsMap;

    /**
     * Loads the place adjectival dictionaries for the given languages from the class path.
     * <p>
     * The entity base URI is read from the {@code config.properties} resource in
     * {@link Constants#PLACE_ADJECTIVALS_FOLDER}. For every language a resource named after the language
     * in the same folder is parsed; each non-blank line must have the form
     * {@code <place>TAB<adjectival>[,<adjectival>...]}.
     *
     * @param languages
     *            the languages for which a dictionary resource is loaded
     * @throws ConfigurationException
     *             if the configuration or a language resource is missing, cannot be read, lacks the
     *             entity base URI property or contains a malformed line
     */
    public Dictionaries(String[] languages) throws ConfigurationException {
        String entityBaseUri = readEntityBaseUri();
        placeAdjectivalsMap = new HashMap<>();
        for (String language : languages) {
            placeAdjectivalsMap.put(language, loadPlaceAdjectivals(language, entityBaseUri));
        }
    }

    /**
     * Reads the entity base URI from the place adjectivals configuration resource.
     */
    private static String readEntityBaseUri() throws ConfigurationException {
        String configPath = Constants.PLACE_ADJECTIVALS_FOLDER + "/" + PLACE_ADJECTIVALS_CONFIG;
        Properties props = new Properties();
        try (InputStream in = Dictionaries.class.getResourceAsStream(configPath)) {
            // getResourceAsStream signals a missing resource with null, not with an exception.
            if (in == null) {
                throw new ConfigurationException("", "Could not find " + configPath);
            }
            props.load(in);
        } catch (IOException e) {
            throw new ConfigurationException("", "Could not read " + PLACE_ADJECTIVALS_CONFIG, e);
        }
        String entityBaseUri = props.getProperty(ENTITY_BASE_URI);
        if (entityBaseUri == null || entityBaseUri.isEmpty()) {
            throw new ConfigurationException(ENTITY_BASE_URI, "Missing property in "
                    + PLACE_ADJECTIVALS_CONFIG);
        }
        return entityBaseUri;
    }

    /**
     * Parses the dictionary resource of one language into an adjectival -> place UriRef map.
     */
    private static Map<String,UriRef> loadPlaceAdjectivals(String language, String entityBaseUri)
            throws ConfigurationException {
        String resourcePath = Constants.PLACE_ADJECTIVALS_FOLDER + "/" + language;
        Map<String,UriRef> adjectivalToPlace = new HashMap<>();
        try (InputStream langIn = Dictionaries.class.getResourceAsStream(resourcePath)) {
            if (langIn == null) {
                throw new ConfigurationException("", "Could not find " + resourcePath);
            }
            // Fixed charset so parsing does not depend on the platform default.
            // NOTE(review): assumes the bundled dictionary files are UTF-8 encoded - confirm.
            BufferedReader reader = new BufferedReader(new InputStreamReader(langIn,
                    StandardCharsets.UTF_8));
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.trim().isEmpty()) {
                    // Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue;
                }
                String[] columns = line.split("\t");
                if (columns.length < 2) {
                    throw new ConfigurationException("", "Malformed line " + lineNumber + " in "
                            + resourcePath + ": expected <place>TAB<adjectival>[,<adjectival>...]");
                }
                UriRef ref = new UriRef(entityBaseUri + columns[0].trim());
                for (String adjectival : columns[1].split(",")) {
                    // Locale.ROOT keeps the keys independent of the JVM default locale; lookups in
                    // findPlaceAdjectival lower-case the same way.
                    String key = adjectival.trim().toLowerCase(Locale.ROOT);
                    if (!key.isEmpty()) {
                        adjectivalToPlace.put(key, ref);
                    }
                }
            }
        } catch (IOException ioe) {
            throw new ConfigurationException("", "Could not read " + resourcePath, ioe);
        }
        return adjectivalToPlace;
    }

    /**
     * Checks whether a {@link NounPhrase} contains a place adjectival and returns it.
     *
     * @param language
     *            the language whose dictionary is searched
     * @param nounPhrase
     *            the noun phrase whose tokens are inspected
     * @return the {@link PlaceAdjectival} for the first match, or null if the {@link NounPhrase} contains
     *         none or no dictionary was loaded for the language.
     */
    public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) {
        Map<String,UriRef> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language);
        if (langPlaceAdjectivalsMap == null) {
            // No dictionary for this language, hence nothing can match.
            return null;
        }
        List<Span> tokens = nounPhrase.getTokens();
        /*
         * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map.
         * Only the 2-gram (current, next) is tested: the 2-gram (previous, current) was already tested
         * as (current, next) in the previous iteration, so testing it again can never match.
         */
        for (int i = 0; i < tokens.size(); i++) {
            Span currentToken = tokens.get(i);
            String currentTokenString = currentToken.getSpan().toLowerCase(Locale.ROOT);
            // First the current 1-gram
            UriRef place = langPlaceAdjectivalsMap.get(currentTokenString);
            if (place != null) {
                return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), place);
            }
            // Then the 2-gram with the token after it
            if (i < tokens.size() - 1) {
                Span nextToken = tokens.get(i + 1);
                String twoGram = currentTokenString + " "
                        + nextToken.getSpan().toLowerCase(Locale.ROOT);
                place = langPlaceAdjectivalsMap.get(twoGram);
                if (place != null) {
                    return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), place);
                }
            }
        }
        return null;
    }
}