blob: da72dc89fdd0252ad7f35b30b677054ee30068e9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.osgi.service.cm.ConfigurationException;
/**
* Filters out bad {@link NounPhrase}s based on pos information.
*
* @author Cristian Petroaca
*
*/
/*
* TODO - create a NounPhraseFilterer interface with multiple implementations to separate languages with
* appositional definite article from the others.
*/
public class NounPhraseFilterer {
    /** Key of the property holding the comma separated list of accepted determiners. */
    private static final String WITHIN_TEXT_DET_PROP = "within.text.referencing.determiners";

    /** Minimum number of noun or adjective tokens a {@link NounPhrase} must contain to be kept. */
    private static final short MIN_POS_NUMBER = 2;

    /**
     * Set of determiners based on language which make a {@link NounPhrase} valid for being a coref mention.
     * The determiners are stored trimmed and lower-cased.
     */
    private final Map<String,Set<String>> withinTextRefDeterminers;

    /**
     * Loads, for every given language, the determiners configured in the
     * <code>&lt;language&gt;.properties</code> resource located under
     * {@link Constants#POS_CONFIG_FOLDER}.
     *
     * @param languages
     *            the languages for which the determiner configuration is loaded
     * @throws ConfigurationException
     *             if the properties resource of a language is missing, cannot be read or does not define
     *             the determiners property
     */
    public NounPhraseFilterer(String[] languages) throws ConfigurationException {
        withinTextRefDeterminers = new HashMap<String,Set<String>>();

        for (String language : languages) {
            withinTextRefDeterminers.put(language, loadDeterminers(language));
        }
    }

    /**
     * Reads the determiners configured for the given language from its properties resource.
     *
     * @param language
     *            the language whose configuration is read
     * @return the trimmed, lower-cased determiners listed in the configuration
     * @throws ConfigurationException
     *             if the resource is missing, unreadable or lacks the determiners property
     */
    private static Set<String> loadDeterminers(String language) throws ConfigurationException {
        String propertiesFile = Constants.POS_CONFIG_FOLDER + "/" + language + ".properties";
        Properties props = new Properties();

        // getResourceAsStream() returns null (it does not throw) when the resource does not exist.
        InputStream in = NounPhraseFilterer.class.getResourceAsStream(propertiesFile);
        if (in == null) {
            throw new ConfigurationException("", "Could not read " + propertiesFile);
        }

        try {
            props.load(in);
        } catch (IOException e) {
            // Keep the original failure as cause so it is not lost for diagnostics.
            throw new ConfigurationException("", "Could not read " + propertiesFile, e);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Closing is best effort; a failure here must not mask the outcome of the load.
            }
        }

        String determinersProperty = props.getProperty(WITHIN_TEXT_DET_PROP);
        if (determinersProperty == null) {
            throw new ConfigurationException(WITHIN_TEXT_DET_PROP, "Missing property in "
                                                                   + propertiesFile);
        }

        Set<String> langDeterminerSet = new HashSet<String>();
        for (String determiner : determinersProperty.split(",")) {
            // Normalize the same way tokens are normalized in filter() so that values such as
            // "the, this" or "The" still match.
            String normalized = determiner.trim().toLowerCase(Locale.ROOT);
            if (normalized.length() > 0) {
                langDeterminerSet.add(normalized);
            }
        }

        return langDeterminerSet;
    }

    /**
     * Removes from the given list every noun phrase which does not contain a determiner from the config
     * of the given language or which contains fewer than 2 noun/adjective tokens - TODO : should this be
     * configurable to be able to also include 1 word noun phrases?
     * <p>
     * The list is modified in place through its iterator. If no determiners were loaded for the given
     * language, no noun phrase can have a valid determiner and therefore all of them are removed.
     *
     * @param nounPhrases
     *            the noun phrases to filter; the list's iterator must support removal
     * @param language
     *            the language whose determiners are used
     */
    public void filter(List<NounPhrase> nounPhrases, String language) {
        // May be null when the language was not passed to the constructor.
        Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
        Iterator<NounPhrase> it = nounPhrases.iterator();

        while (it.hasNext()) {
            NounPhrase nounPhrase = it.next();
            boolean hasGoodDeterminer = false;
            short nounNo = 0;

            for (Span token : nounPhrase.getTokens()) {
                Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                if (pos == null) {
                    // Tokens without POS annotation contribute nothing to the decision.
                    continue;
                }

                PosTag posTag = pos.value();

                if (posTag.hasCategory(LexicalCategory.Noun)
                    || posTag.hasCategory(LexicalCategory.Adjective)) {
                    nounNo++;
                }

                // Locale.ROOT keeps the comparison independent from the JVM default locale.
                if (!hasGoodDeterminer && langDeterminerSet != null && posTag.hasPos(Pos.Determiner)
                    && langDeterminerSet.contains(token.getSpan().toLowerCase(Locale.ROOT))) {
                    hasGoodDeterminer = true;
                }
            }

            if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) {
                it.remove();
            }
        }
    }

    /**
     * Tells whether determiners were loaded for the given language.
     *
     * @param language
     *            the language to check
     * @return <code>true</code> if the language was among those passed to the constructor
     */
    public boolean supportsLanguage(String language) {
        return withinTextRefDeterminers.containsKey(language);
    }
}