geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java - opennlp-addons - Git at Google

 /*
  * Copyright 2013 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.addons.geoentitylinker.scoring;

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.regex.Pattern;
 import opennlp.addons.geoentitylinker.AdminBoundaryContext;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.BaseLink;
 import opennlp.tools.entitylinker.LinkedSpan;
 import opennlp.tools.util.Span;

 /**
  * Scores toponyms based on their proximity to a country mention. Based on the
  * heuristic that toponym mentions are more likely close to their parent
  * country mentions. For instance, if the toponym Berlin is mentioned near an
  * indicator of Germany, it is more likely to be Berlin Germany than Berlin
  * Connecticut (if Connecticut is mentioned further down in the article).
  * <p>
  * The scorer stores per-document state in instance fields while it runs, so a
  * single instance must not be used by several threads at the same time.
  */
 public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {

   /** Key under which the computed score is stored in each link's score map. */
   private static final String SCORE_KEY = "countrycontext";
   /** Added to the score when the linked name is itself a country indicator. */
   private static final double COUNTRY_NAME_BOOST = 0.75;
   /** Added on top when that country is also the most mentioned one. */
   private static final double DOMINANT_COUNTRY_BOOST = 0.25;
   /** Distance passed along by the interface entry point (currently unused). */
   private static final int DEFAULT_MAX_ALLOWED_DISTANCE = 1000;

   private Map<String, Set<String>> nameCodesMap;
   String dominantCode = "";
   private Map<String, String> regexMap = new HashMap<>();
   /** Compiled country regexes keyed by their source text, to avoid recompiling per link. */
   private final Map<String, Pattern> compiledPatterns = new HashMap<>();

   /**
    * Scores every link of every linked span using the country context found in
    * {@code additionalContext}. The score is written into each link's score
    * map under the key {@code "countrycontext"}.
    *
    * @param linkedSpans the linked spans whose links are scored in place
    * @param docText the full document text (not used by this scorer)
    * @param sentenceSpans the sentence spans of the document
    * @param properties the linker properties (not used by this scorer)
    * @param additionalContext supplies country mentions, the name-to-codes map
    * and the per-country regexes
    */
   @Override
   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
     regexMap = additionalContext.getCountryRegexMap();
     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, DEFAULT_MAX_ALLOWED_DISTANCE);
   }

   /**
    * Assigns a score to each BaseLink in each linkedSpan's set of N best
    * matches. Currently, the scoring indicates the probability that the toponym
    * is correct based on the country context in the document
    *
    * @param linkedData the linked spans, holds the Namefinder results, and the
    * list of BaseLink for each
    * @param countryHits all the country mentions in the document
    * @param nameCodesMap maps a country indicator name to a country code. Used
    * to determine if the namefinder found the same exact toponym the country
    * context did. If so the score is boosted due to the high probability that
    * the NameFinder actually "rediscovered" a country
    * @param docText the full text of the document...not used in this default
    * implementation
    * @param sentences the sentences that correspond to the doc text.
    * @param maxAllowedDist a constant that is meant to determine which country
    * mentions, based on proximity within the text, should be used to score the
    * Named Entity. The distance filter is currently disabled, so the value is
    * passed along but has no effect.
    * @return the same {@code linkedData} list, with its links scored in place
    */
   public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
     this.nameCodesMap = nameCodesMap;
     setDominantCode(countryHits);
     for (LinkedSpan<BaseLink> linkedspan : linkedData) {
       // links are updated in place, the returned span is the same object
       simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
     }
     return linkedData;
   }

   /**
    * sets class level variable to the code with the highest number of mentions.
    * The value is reset first so that a result from a previously scored
    * document never leaks into the current one.
    *
    * @param countryHits mention offsets keyed by country code
    */
   private void setDominantCode(Map<String, Set<Integer>> countryHits) {
     dominantCode = "";
     int hits = -1;
     for (Map.Entry<String, Set<Integer>> hitsForCode : countryHits.entrySet()) {
       int mentionCount = hitsForCode.getValue().size();
       if (mentionCount > hits) {
         hits = mentionCount;
         dominantCode = hitsForCode.getKey();
       }
     }
   }

   /**
    * Generates distances from each country mention to the span's location in the
    * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
    * are resolved to the correct country and coordinate.
    *
    * @param sentences the sentence spans of the document
    * @param countryHits mention offsets keyed by country code
    * @param span the linked span whose links are scored
    * @param maxAllowedDistance currently unused, see the TODO in the body
    * @return the {@code span} that was passed in, with its links scored
    */
   private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
     /*
      * get the index of the actual span, beginning of sentence //should generate
      * tokens from sentence and create a char offset... //could have large
      * sentences due to poor sentence detection or wonky doc text
      */
     int sentenceIdx = span.getSentenceid();
     int sentIndexInDoc = sentences[sentenceIdx].getStart();
     /*
      * create a map of all the span's proximal country mentions in the document
      * Map< countrycode, set of <distances from this NamedEntity>>
      */
     Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>();
     for (Map.Entry<String, Set<Integer>> hitsForCode : countryHits.entrySet()) {
       String cCode = hitsForCode.getKey();
       // iterate over all the regex start values and calculate an offset
       for (Integer cHit : hitsForCode.getValue()) {
         Integer absDist = Math.abs(sentIndexInDoc - cHit);
         //TODO make maxAllowedDistance a property and only keep mentions
         //with absDist < maxAllowedDistance
         Set<Integer> distances = distancesFromCodeMap.get(cCode);
         if (distances == null) {
           distances = new HashSet<>();
           distancesFromCodeMap.put(cCode, distances);
         }
         distances.add(absDist);
       }
     }
     //we now know how far this named entity is from every country mention in the document

     /*
      * the gaz matches that have a country code that have mentions in the doc
      * that are closest to the Named Entity should return the best score.
      * Analyzemap generates a likelihood score that the toponym from the gaz is
      * referring to one of the countries, i.e, Map<countrycode, prob that this
      * span is referring to the toponym from this code key>
      */
     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
     for (BaseLink link : span.getLinkedEntries()) {
       // start from zero for every link so that a link without any country
       // context does not inherit the score of the previous link
       double score = 0.0;
       //getItemParentId is the country code
       String spanCountryCode = link.getItemParentID();
       Double proximityScore = scoreMap.get(spanCountryCode);
       if (proximityScore != null) {
         score = proximityScore;
         //does the name extracted match a country name for this very code?
         if (isCountryMention(link.getItemName(), spanCountryCode)) {
           //boost the score because it is likely that this is the location in the text, capped at 1
           score = Math.min(1.0, score + COUNTRY_NAME_BOOST);
           if (spanCountryCode.equals(dominantCode)) {
             score = Math.min(1.0, score + DOMINANT_COUNTRY_BOOST);
           }
         }
       }
       link.getScoreMap().put(SCORE_KEY, score);
     }
     return span;
   }

   /**
    * Decides whether a linked place name is itself an indicator of the given
    * country. A name that is present in the name-to-codes map is judged by that
    * map alone; only a name that is absent from the map falls back to the
    * country's regex. Looking the set up once and checking it for null avoids
    * the NullPointerException that occurred when only the regex matched.
    *
    * @param placeName the name of the linked gazetteer entry
    * @param countryCode the country code of the linked gazetteer entry
    * @return true if the name indicates the country with the given code
    */
   private boolean isCountryMention(String placeName, String countryCode) {
     Set<String> codesForName = nameCodesMap.get(placeName.toLowerCase());
     if (codesForName != null) {
       return codesForName.contains(countryCode);
     }
     return regexMatch(placeName, countryCode);
   }

   /**
    * takes a map of distances from the toponym to each country mention and
    * generates a map of scores for each country code. The map is then correlated
    * to the code of the BaseLink parentid for retrieval.
    *
    * @param distanceMap distances from the span to each mention, keyed by
    * country code
    * @param sentences not used by this implementation
    * @param span not used by this implementation
    * @return a score per country code, empty if {@code distanceMap} is empty
    */
   private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {

     Map<String, Double> scoreMap = new HashMap<>();
     if (distanceMap.isEmpty()) {
       return scoreMap;
     }
     TreeSet<Integer> all = new TreeSet<>();
     for (Set<Integer> distances : distanceMap.values()) {
       all.addAll(distances);
     }

     // get min max for normalization, this could be more efficient
     int min = all.first();
     int max = all.last();
     if (min == max) {
       // a single distinct distance is normalized against the range [0, max]
       min = 0;
     }

     for (Map.Entry<String, Set<Integer>> distancesForCode : distanceMap.entrySet()) {
       // sorted and de-duplicated closeness values for this country code
       TreeSet<Double> closeness = new TreeSet<>();
       for (Integer distance : distancesForCode.getValue()) {
         //reverse the normed distance so low numbers (closer) are better
         //this could be improved with a "decaying" function using an increasing negative exponent
         closeness.add(Math.abs(normalize(distance, min, max) - 1));
       }
       List<Double> ordered = new ArrayList<>(closeness);
       scoreMap.put(distancesForCode.getKey(), slidingDistanceAverage(ordered));
     }
     return scoreMap;
   }

   /**
    * Checks whether the whole (trimmed) place name matches the regex that is
    * registered for the country code. Compiled patterns are cached because the
    * same few regexes are evaluated for every link.
    *
    * @param placeName the name to test
    * @param countryCode the code whose regex is used
    * @return true on a full, case insensitive match; false if there is no
    * regex for the code
    */
   private boolean regexMatch(String placeName, String countryCode) {
     if (regexMap == null) {
       return false;
     }
     String regexForCountry = regexMap.get(countryCode);
     if (regexForCountry == null) {
       return false;
     }
     Pattern p = compiledPatterns.get(regexForCountry);
     if (p == null) {
       p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
       compiledPatterns.put(regexForCountry, p);
     }
     return p.matcher(placeName.trim()).matches();
   }

   /**
    * this method is an attempt to make closer clusters of mentions group
    * together to smooth out the average, so one distant outlier does not kill
    * the score for an obviously good hit. More elegant solution is possible
    * using Math.pow, and making the score decay with distance by using an
    * increasing negative exponent (I think)
    *
    * @param normDis the normalized and sorted set of distances as a list
    * @return the plain average for fewer than three values, otherwise the
    * average of the means of all neighbouring pairs; 0 for an empty list
    */
   private Double slidingDistanceAverage(List<Double> normDis) {
     int size = normDis.size();
     if (size == 0) {
       // nothing to average, avoid 0/0 producing NaN
       return 0d;
     }
     double sum = 0d;
     int windowCount;
     if (size < 3) {
       for (double value : normDis) {
         sum += value;
       }
       windowCount = size;
     } else {
       for (int i = 0; i < size - 1; i++) {
         double a = normDis.get(i);
         double b = normDis.get(i + 1);
         sum += (a + b) / 2;
       }
       windowCount = size - 1;
     }
     //TODO: ++ prob when large amounts of mentions for a code
     return sum / windowCount;
   }

   /**
    * transposes a value within one range to a relative value in the range
    * 0..1. Used to normalize distances in this class.
    *
    * @param valueToNormalize the value to place within the new range
    * @param minimum the min of the set to be transposed
    * @param maximum the max of the set to be transposed
    * @return the relative position of the value between minimum and maximum,
    * or 0 when minimum and maximum are equal (the division would otherwise
    * yield NaN)
    */
   private Double normalize(int valueToNormalize, int minimum, int maximum) {
     int range = maximum - minimum;
     if (range == 0) {
       return 0d;
     }
     return (double) (valueToNormalize - minimum) / range;
   }
 }
	/*
	* Copyright 2013 The Apache Software Foundation.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.addons.geoentitylinker.scoring;

	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.TreeSet;
	import java.util.regex.Pattern;
	import opennlp.addons.geoentitylinker.AdminBoundaryContext;
	import opennlp.tools.entitylinker.EntityLinkerProperties;
	import opennlp.tools.entitylinker.BaseLink;
	import opennlp.tools.entitylinker.LinkedSpan;
	import opennlp.tools.util.Span;

	/**
	* Scores toponyms based on their proximity to a country mention. Based on the
	* heuristic that toponymn mentions are more likely close to their parent
	* country mentions. For instance, if the toponym Berlin is mentioned near an
	* indicator of Germany, it is more likely to be Berlin Germany than Berlin
	* Connecticut (if Connecticut is mentioned further down in the article).
	*/
	public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {

	private Map<String, Set<String>> nameCodesMap;
	String dominantCode = "";
	private Map<String, String> regexMap = new HashMap<>();

	@Override
	public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

	regexMap = additionalContext.getCountryRegexMap();
	score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

	}

	/**
	* Assigns a score to each BaseLink in each linkedSpan's set of N best
	* matches. Currently, the scoring indicates the probability that the toponym
	* is correct based on the country context in the document
	*
	* @param linkedData the linked spans, holds the Namefinder results, and the
	* list of BaseLink for each
	* @param countryHits all the country mentions in the document
	* @param nameCodesMap maps a country indicator name to a country code. Used
	* to determine if the namefinder found the same exact toponym the country
	* context did. If so the score is boosted due to the high probability that
	* the NameFinder actually "rediscovered" a country
	* @param docText the full text of the document...not used in this default
	* implementation
	* @param sentences the sentences that correspond to the doc text.
	* @param maxAllowedDist a constant that is used to determine which country
	* mentions, based on proximity within the text, should be used to score the
	* Named Entity.
	* @return
	*/
	public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
	this.nameCodesMap = nameCodesMap;
	setDominantCode(countryHits);
	for (LinkedSpan<BaseLink> linkedspan : linkedData) {

	linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
	}
	return linkedData;
	}

	/**
	* sets class level variable to a code based on the number of mentions
	*
	* @param countryHits
	*/
	private void setDominantCode(Map<String, Set<Integer>> countryHits) {
	int hits = -1;
	for (String code : countryHits.keySet()) {
	if (countryHits.get(code).size() > hits) {
	hits = countryHits.get(code).size();
	dominantCode = code;
	}
	}
	}

	/**
	* Generates distances from each country mention to the span's location in the
	* doc text. Ultimately an attempt to ensure that ambiguously named toponyms
	* are resolved to the correct country and coordinate.
	*
	* @param sentences
	* @param countryHits
	* @param span
	* @return
	*/
	private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
	Double score = 0.0;
	/*
	* get the index of the actual span, beginning of sentence //should generate
	* tokens from sentence and create a char offset... //could have large
	* sentences due to poor sentence detection or wonky doc text
	*/
	int sentenceIdx = span.getSentenceid();
	int sentIndexInDoc = sentences[sentenceIdx].getStart();
	/*
	* create a map of all the span's proximal country mentions in the document
	* Map< countrycode, set of <distances from this NamedEntity>>
	*/
	Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>();
	//map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
	for (String cCode : countryHits.keySet()) {
	// iterate over all the regex start values and calculate an offset
	for (Integer cHit : countryHits.get(cCode)) {
	Integer absDist = Math.abs(sentIndexInDoc - cHit);
	//only include near mentions based on a heuristic
	//TODO make this a property
	// if (absDist < maxAllowedDistance) {
	if (distancesFromCodeMap.containsKey(cCode)) {
	distancesFromCodeMap.get(cCode).add(absDist);
	} else {
	HashSet<Integer> newset = new HashSet<>();
	newset.add(absDist);
	distancesFromCodeMap.put(cCode, newset);
	}
	}
	}
	//we now know how far this named entity is from every country mention in the document

	/*
	* the gaz matches that have a country code that have mentions in the doc
	* that are closest to the Named Entity should return the best score.
	* Analyzemap generates a likelihood score that the toponym from the gaz is
	* referring to one of the countries, i.e, Map<countrycode, prob that this
	* span is referring to the toponym form this code key>
	*/
	Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
	for (BaseLink link : span.getLinkedEntries()) {
	//getItemParentId is the country code
	String spanCountryCode = link.getItemParentID();
	if (scoreMap.containsKey(spanCountryCode)) {

	score = scoreMap.get(spanCountryCode);
	///does the name extracted match a country name?
	if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) \|\| regexMatch(link.getItemName(), link.getItemParentID())) {
	//if so, is it the correct country code for that name?
	if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
	//boost the score because it is likely that this is the location in the text, so add 50% to the score or set to 1
	score = (score + .75) > 1.0 ? 1d : (score + .75);

	if (link.getItemParentID().equals(dominantCode)) {
	score = (score + .25) > 1.0 ? 1d : (score + .25);
	}
	}
	}
	}

	link.getScoreMap().put("countrycontext", score);
	}
	return span;
	}

	/**
	* takes a map of distances from the toponym to each country mention and
	* generates a map of scores for each country code. The map is then correlated
	* to the code of the BaseLink parentid for retrieval. Then the score is added
	* to the overall list.
	*
	* @param distanceMap
	* @param sentences
	* @param span
	* @return
	*/
	private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {

	Map<String, Double> scoreMap = new HashMap<>();
	if (distanceMap.isEmpty()) {
	return scoreMap;
	}
	TreeSet<Integer> all = new TreeSet<>();
	for (String key : distanceMap.keySet()) {
	all.addAll(distanceMap.get(key));
	}

	// get min max for normalization, this could be more efficient
	int min = all.first();
	int max = all.last();
	if (min == max) {
	min = 0;
	}

	for (String key : distanceMap.keySet()) {
	TreeSet<Double> normalizedDistances = new TreeSet<>();
	for (Integer i : distanceMap.get(key)) {
	Double norm = normalize(i, min, max);
	//reverse the normed distance so low numbers (closer) are better
	//this could be improved with a "decaying " function using an imcreaseing negative exponent
	Double reverse = Math.abs(norm - 1);
	normalizedDistances.add(reverse);
	}

	List<Double> doubles = new ArrayList<>(normalizedDistances);
	scoreMap.put(key, slidingDistanceAverage(doubles));
	}
	return scoreMap;
	}

	private boolean regexMatch(String placeName, String countryCode) {
	if (regexMap.containsKey(countryCode)) {
	String regexForCountry = regexMap.get(countryCode);

	Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL\|Pattern.CASE_INSENSITIVE);
	return p.matcher(placeName.trim()).matches();
	}
	return false;
	}

	/**
	* this method is an attempt to make closer clusters of mentions group
	* together to smooth out the average, so one distant outlier does not kill
	* the score for an obviously good hit. More elegant solution is possible
	* using Math.pow, and making the score decay with distance by using an
	* increasing negative exponent (I think)
	*
	* @param normDis the normalized and sorted set of distances as a list
	* @return
	*/
	private Double slidingDistanceAverage(List<Double> normDis) {
	List<Double> windowOfAverages = new ArrayList<>();

	if (normDis.size() < 3) {
	windowOfAverages.addAll(normDis);
	} else {

	for (int i = 0; i < normDis.size() - 1; i++) {
	double a = normDis.get(i);
	double b = normDis.get(i + 1);
	windowOfAverages.add((a + b) / 2);

	}
	}
	double sum = 0d;
	for (double d : windowOfAverages) {
	sum += d;
	}
	double result = sum / windowOfAverages.size();
	//TODO: ++ prob when large amounts of mentions for a code
	//System.out.println("avg of window:" + result);
	return result;
	}

	/**
	* transposes a value within one range to a relative value in a different
	* range. Used to normalize distances in this class.
	*
	* @param valueToNormalize the value to place within the new range
	* @param minimum the min of the set to be transposed
	* @param maximum the max of the set to be transposed
	* @return
	*/
	private Double normalize(int valueToNormalize, int minimum, int maximum) {
	Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
	d = d == null ? 0d : d;
	return d;
	}
	}