| /* |
| * Copyright 2013 The Apache Software Foundation. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package opennlp.addons.geoentitylinker.scoring; |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeSet; |
| import java.util.regex.Pattern; |
| import opennlp.addons.geoentitylinker.AdminBoundaryContext; |
| import opennlp.tools.entitylinker.EntityLinkerProperties; |
| import opennlp.tools.entitylinker.BaseLink; |
| import opennlp.tools.entitylinker.LinkedSpan; |
| import opennlp.tools.util.Span; |
| |
| /** |
| * Scores toponyms based on their proximity to a country mention. Based on the |
| * heuristic that toponymn mentions are more likely close to their parent |
| * country mentions. For instance, if the toponym Berlin is mentioned near an |
| * indicator of Germany, it is more likely to be Berlin Germany than Berlin |
| * Connecticut (if Connecticut is mentioned further down in the article). |
| */ |
| public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> { |
| |
| private Map<String, Set<String>> nameCodesMap; |
| String dominantCode = ""; |
| private Map<String, String> regexMap = new HashMap<>(); |
| |
| @Override |
| public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) { |
| |
| regexMap = additionalContext.getCountryRegexMap(); |
| score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000); |
| |
| } |
| |
| /** |
| * Assigns a score to each BaseLink in each linkedSpan's set of N best |
| * matches. Currently, the scoring indicates the probability that the toponym |
| * is correct based on the country context in the document |
| * |
| * @param linkedData the linked spans, holds the Namefinder results, and the |
| * list of BaseLink for each |
| * @param countryHits all the country mentions in the document |
| * @param nameCodesMap maps a country indicator name to a country code. Used |
| * to determine if the namefinder found the same exact toponym the country |
| * context did. If so the score is boosted due to the high probability that |
| * the NameFinder actually "rediscovered" a country |
| * @param docText the full text of the document...not used in this default |
| * implementation |
| * @param sentences the sentences that correspond to the doc text. |
| * @param maxAllowedDist a constant that is used to determine which country |
| * mentions, based on proximity within the text, should be used to score the |
| * Named Entity. |
| * @return |
| */ |
| public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) { |
| this.nameCodesMap = nameCodesMap; |
| setDominantCode(countryHits); |
| for (LinkedSpan<BaseLink> linkedspan : linkedData) { |
| |
| linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist); |
| } |
| return linkedData; |
| } |
| |
| /** |
| * sets class level variable to a code based on the number of mentions |
| * |
| * @param countryHits |
| */ |
| private void setDominantCode(Map<String, Set<Integer>> countryHits) { |
| int hits = -1; |
| for (String code : countryHits.keySet()) { |
| if (countryHits.get(code).size() > hits) { |
| hits = countryHits.get(code).size(); |
| dominantCode = code; |
| } |
| } |
| } |
| |
| /** |
| * Generates distances from each country mention to the span's location in the |
| * doc text. Ultimately an attempt to ensure that ambiguously named toponyms |
| * are resolved to the correct country and coordinate. |
| * |
| * @param sentences |
| * @param countryHits |
| * @param span |
| * @return |
| */ |
| private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) { |
| Double score = 0.0; |
| /* |
| * get the index of the actual span, beginning of sentence //should generate |
| * tokens from sentence and create a char offset... //could have large |
| * sentences due to poor sentence detection or wonky doc text |
| */ |
| int sentenceIdx = span.getSentenceid(); |
| int sentIndexInDoc = sentences[sentenceIdx].getStart(); |
| /* |
| * create a map of all the span's proximal country mentions in the document |
| * Map< countrycode, set of <distances from this NamedEntity>> |
| */ |
| Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<>(); |
| //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>> |
| for (String cCode : countryHits.keySet()) { |
| // iterate over all the regex start values and calculate an offset |
| for (Integer cHit : countryHits.get(cCode)) { |
| Integer absDist = Math.abs(sentIndexInDoc - cHit); |
| //only include near mentions based on a heuristic |
| //TODO make this a property |
| // if (absDist < maxAllowedDistance) { |
| if (distancesFromCodeMap.containsKey(cCode)) { |
| distancesFromCodeMap.get(cCode).add(absDist); |
| } else { |
| HashSet<Integer> newset = new HashSet<>(); |
| newset.add(absDist); |
| distancesFromCodeMap.put(cCode, newset); |
| } |
| } |
| } |
| //we now know how far this named entity is from every country mention in the document |
| |
| /* |
| * the gaz matches that have a country code that have mentions in the doc |
| * that are closest to the Named Entity should return the best score. |
| * Analyzemap generates a likelihood score that the toponym from the gaz is |
| * referring to one of the countries, i.e, Map<countrycode, prob that this |
| * span is referring to the toponym form this code key> |
| */ |
| Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span); |
| for (BaseLink link : span.getLinkedEntries()) { |
| //getItemParentId is the country code |
| String spanCountryCode = link.getItemParentID(); |
| if (scoreMap.containsKey(spanCountryCode)) { |
| |
| score = scoreMap.get(spanCountryCode); |
| ///does the name extracted match a country name? |
| if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) { |
| //if so, is it the correct country code for that name? |
| if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) { |
| //boost the score because it is likely that this is the location in the text, so add 50% to the score or set to 1 |
| score = (score + .75) > 1.0 ? 1d : (score + .75); |
| |
| if (link.getItemParentID().equals(dominantCode)) { |
| score = (score + .25) > 1.0 ? 1d : (score + .25); |
| } |
| } |
| } |
| } |
| |
| link.getScoreMap().put("countrycontext", score); |
| } |
| return span; |
| } |
| |
| /** |
| * takes a map of distances from the toponym to each country mention and |
| * generates a map of scores for each country code. The map is then correlated |
| * to the code of the BaseLink parentid for retrieval. Then the score is added |
| * to the overall list. |
| * |
| * @param distanceMap |
| * @param sentences |
| * @param span |
| * @return |
| */ |
| private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) { |
| |
| Map<String, Double> scoreMap = new HashMap<>(); |
| if (distanceMap.isEmpty()) { |
| return scoreMap; |
| } |
| TreeSet<Integer> all = new TreeSet<>(); |
| for (String key : distanceMap.keySet()) { |
| all.addAll(distanceMap.get(key)); |
| } |
| |
| // get min max for normalization, this could be more efficient |
| int min = all.first(); |
| int max = all.last(); |
| if (min == max) { |
| min = 0; |
| } |
| |
| for (String key : distanceMap.keySet()) { |
| TreeSet<Double> normalizedDistances = new TreeSet<>(); |
| for (Integer i : distanceMap.get(key)) { |
| Double norm = normalize(i, min, max); |
| //reverse the normed distance so low numbers (closer) are better |
| //this could be improved with a "decaying " function using an imcreaseing negative exponent |
| Double reverse = Math.abs(norm - 1); |
| normalizedDistances.add(reverse); |
| } |
| |
| List<Double> doubles = new ArrayList<>(normalizedDistances); |
| scoreMap.put(key, slidingDistanceAverage(doubles)); |
| } |
| return scoreMap; |
| } |
| |
| private boolean regexMatch(String placeName, String countryCode) { |
| if (regexMap.containsKey(countryCode)) { |
| String regexForCountry = regexMap.get(countryCode); |
| |
| Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); |
| return p.matcher(placeName.trim()).matches(); |
| } |
| return false; |
| } |
| |
| /** |
| * this method is an attempt to make closer clusters of mentions group |
| * together to smooth out the average, so one distant outlier does not kill |
| * the score for an obviously good hit. More elegant solution is possible |
| * using Math.pow, and making the score decay with distance by using an |
| * increasing negative exponent (I think) |
| * |
| * @param normDis the normalized and sorted set of distances as a list |
| * @return |
| */ |
| private Double slidingDistanceAverage(List<Double> normDis) { |
| List<Double> windowOfAverages = new ArrayList<>(); |
| |
| if (normDis.size() < 3) { |
| windowOfAverages.addAll(normDis); |
| } else { |
| |
| for (int i = 0; i < normDis.size() - 1; i++) { |
| double a = normDis.get(i); |
| double b = normDis.get(i + 1); |
| windowOfAverages.add((a + b) / 2); |
| |
| } |
| } |
| double sum = 0d; |
| for (double d : windowOfAverages) { |
| sum += d; |
| } |
| double result = sum / windowOfAverages.size(); |
| //TODO: ++ prob when large amounts of mentions for a code |
| //System.out.println("avg of window:" + result); |
| return result; |
| } |
| |
| /** |
| * transposes a value within one range to a relative value in a different |
| * range. Used to normalize distances in this class. |
| * |
| * @param valueToNormalize the value to place within the new range |
| * @param minimum the min of the set to be transposed |
| * @param maximum the max of the set to be transposed |
| * @return |
| */ |
| private Double normalize(int valueToNormalize, int minimum, int maximum) { |
| Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; |
| d = d == null ? 0d : d; |
| return d; |
| } |
| } |