entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.entitycoreference.impl;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;

 import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
 import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
 import org.apache.stanbol.enhancer.nlp.model.Span;
 import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 import org.osgi.service.cm.ConfigurationException;

 /**
  * Filters out bad {@link NounPhrase}s based on pos information.
  *
  * @author Cristian Petroaca
  *
  */
 /*
  * TODO - create a NounPhraseFilterer interface with multiple implementations to separate languages with
  * appositional definite article from the others.
  */
 public class NounPhraseFilterer {
     /** Key of the per-language property holding the comma separated list of accepted determiners. */
     private final static String WITHIN_TEXT_DET_PROP = "within.text.referencing.determiners";

     /** Minimum number of noun/adjective tokens a noun phrase must contain in order to be kept. */
     private final static short MIN_POS_NUMBER = 2;

     /**
      * Set of determiners based on language which make a {@link NounPhrase} valid for being a coref mention.
      */
     private final Map<String,Set<String>> withinTextRefDeterminers;

     /**
      * Loads, for every given language, the <code>&lt;language&gt;.properties</code> resource located under
      * {@link Constants#POS_CONFIG_FOLDER} and reads its list of within-text referencing determiners.
      *
      * @param languages
      *            the languages for which a configuration must be loaded
      * @throws ConfigurationException
      *             if a language resource cannot be found or read, or if it does not define the
      *             determiners property
      */
     public NounPhraseFilterer(String[] languages) throws ConfigurationException {
         withinTextRefDeterminers = new HashMap<String,Set<String>>();

         for (String language : languages) {
             Properties props = new Properties();
             String propertiesFile = Constants.POS_CONFIG_FOLDER + "/" + language + ".properties";
             InputStream in = NounPhraseFilterer.class.getResourceAsStream(propertiesFile);

             // getResourceAsStream signals a missing resource with null; report it as a configuration
             // problem instead of letting Properties.load(null) fail with a NullPointerException.
             if (in == null) {
                 throw new ConfigurationException("", "Could not read " + propertiesFile
                                                      + " (resource not found)");
             }

             try {
                 props.load(in);
             } catch (IOException e) {
                 // Keep the original exception as cause so the root problem is not lost.
                 throw new ConfigurationException("", "Could not read " + propertiesFile, e);
             } finally {
                 try {
                     in.close();
                 } catch (IOException ignored) {
                     // Nothing useful can be done if closing a read-only resource stream fails.
                 }
             }

             String determinersProperty = props.getProperty(WITHIN_TEXT_DET_PROP);

             if (determinersProperty == null) {
                 throw new ConfigurationException(WITHIN_TEXT_DET_PROP, "Missing property in "
                                                                        + propertiesFile);
             }

             Set<String> langDeterminerSet = new HashSet<String>();
             for (String determiner : determinersProperty.split(",")) {
                 // Tolerate whitespace around the separators ("the, this") and skip empty entries.
                 String trimmed = determiner.trim();
                 if (trimmed.length() > 0) {
                     langDeterminerSet.add(trimmed);
                 }
             }

             withinTextRefDeterminers.put(language, langDeterminerSet);
         }
     }

     /**
      * Removes from the given list every noun phrase which does not contain a determiner from the
      * configuration of the given language or which contains fewer than {@value #MIN_POS_NUMBER} tokens
      * tagged as noun or adjective. The list is modified in place through its iterator. TODO : should the
      * minimum be configurable to be able to also include 1 word noun phrases?
      * <p>
      * If no determiners were loaded for the language (see {@link #supportsLanguage(String)}) no phrase
      * can have a valid determiner, so all phrases are removed.
      *
      * @param nounPhrases
      *            the noun phrases to filter; must support {@link Iterator#remove()}
      * @param language
      *            the language whose determiner configuration is used
      */
     public void filter(List<NounPhrase> nounPhrases, String language) {
         Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
         Iterator<NounPhrase> it = nounPhrases.iterator();

         while (it.hasNext()) {
             NounPhrase nounPhrase = it.next();
             boolean hasGoodDeterminer = false;
             short nounNo = 0;

             for (Span token : nounPhrase.getTokens()) {
                 Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);

                 // Tokens without a POS annotation contribute nothing to either criterion.
                 if (pos != null) {
                     PosTag posTag = pos.value();

                     if (posTag.hasCategory(LexicalCategory.Noun)
                         || posTag.hasCategory(LexicalCategory.Adjective)) {
                         nounNo++;
                     }

                     // The null check guards against languages which were never configured.
                     if (!hasGoodDeterminer && langDeterminerSet != null
                         && posTag.hasPos(Pos.Determiner)
                         && langDeterminerSet.contains(token.getSpan().toLowerCase())) {
                         hasGoodDeterminer = true;
                     }
                 }
             }

             if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) {
                 it.remove();
             }
         }
     }

     /**
      * Tells whether a determiner configuration was loaded for the given language.
      *
      * @param language
      *            the language to check
      * @return <code>true</code> if the language was among those passed to the constructor
      */
     public boolean supportsLanguage(String language) {
         return withinTextRefDeterminers.containsKey(language);
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.entitycoreference.impl;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;
	import java.util.Properties;
	import java.util.Set;

	import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
	import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
	import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
	import org.apache.stanbol.enhancer.nlp.model.Span;
	import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
	import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
	import org.apache.stanbol.enhancer.nlp.pos.Pos;
	import org.apache.stanbol.enhancer.nlp.pos.PosTag;
	import org.osgi.service.cm.ConfigurationException;

	/**
	* Filters out bad {@link NounPhrase}s based on pos information.
	*
	* @author Cristian Petroaca
	*
	*/
	/*
	* TODO - create a NounPhraseFilterer interface with multiple implementations to separate languages with
	* appositional definite article from the others.
	*/
	public class NounPhraseFilterer {
	private final static String WITHIN_TEXT_DET_PROP = "within.text.referencing.determiners";
	private final static short MIN_POS_NUMBER = 2;

	/**
	* Set of determiners based on language which make a {@link NounPhrase} valid for being a coref mention.
	*/
	private Map<String,Set<String>> withinTextRefDeterminers;

	public NounPhraseFilterer(String[] languages) throws ConfigurationException {
	withinTextRefDeterminers = new HashMap<String,Set<String>>();

	for (String language : languages) {
	Properties props = new Properties();
	String propertiesFile = Constants.POS_CONFIG_FOLDER + "/" + language + ".properties";
	InputStream in = null;

	try {
	in = NounPhraseFilterer.class.getResourceAsStream(propertiesFile);
	props.load(in);
	} catch (IOException e) {
	throw new ConfigurationException("", "Could not read " + propertiesFile);
	} finally {
	if (in != null) {
	try {
	in.close();
	} catch (IOException e) {}
	}
	}

	String determinersProperty = props.getProperty(WITHIN_TEXT_DET_PROP);

	if (determinersProperty == null) {
	throw new ConfigurationException(WITHIN_TEXT_DET_PROP, "Missing property in "
	+ propertiesFile);
	}

	Set<String> langDeterminerSet = new HashSet<String>();
	for (String determiner : determinersProperty.split(",")) {
	langDeterminerSet.add(determiner);
	}

	withinTextRefDeterminers.put(language, langDeterminerSet);
	}
	}

	/**
	* Filters out noun phrases which do not contain a determiner from the given config and do not a token
	* count bigger than 2 - TODO : should this be configurable to be able to also include 1 word noun
	* phrases?
	*
	* @param nounPhrases
	* @param language
	*/
	public void filter(List<NounPhrase> nounPhrases, String language) {
	Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
	Iterator<NounPhrase> it = nounPhrases.iterator();

	while (it.hasNext()) {
	NounPhrase nounPhrase = it.next();
	boolean hasGoodDeterminer = false;
	short nounNo = 0;

	for (Span token : nounPhrase.getTokens()) {
	Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);

	if (pos != null) {
	PosTag posTag = pos.value();

	if (posTag.hasCategory(LexicalCategory.Noun)
	\|\| posTag.hasCategory(LexicalCategory.Adjective)) {
	nounNo++;
	}

	if (!hasGoodDeterminer && posTag.hasPos(Pos.Determiner)
	&& langDeterminerSet.contains(token.getSpan().toLowerCase())) {
	hasGoodDeterminer = true;
	}
	}
	}

	if (!hasGoodDeterminer \|\| nounNo < MIN_POS_NUMBER) {
	it.remove();
	}
	}
	}

	public boolean supportsLanguage(String language) {
	return withinTextRefDeterminers.containsKey(language);
	}
	}