OPENNLP-626
renamed packages for consistency in addons; also made small efficiency improvements
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
deleted file mode 100644
index 05fe749..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-
-/**
- * Finds instances of country mentions in a String, typically a document text.
- * Used to boost or degrade scoring of linked geo entities
- *
- */
-public class CountryContext {
-
-
- private List<CountryContextEntry> countrydata;
- private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
- private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
- private Set<CountryContextEntry> countryHits = new HashSet<>();
-
- public CountryContext() {
- }
-
- public Map<String, Set<Integer>> getCountryMentions() {
- return countryMentions;
- }
-
- public Set<CountryContextEntry> getCountryHits() {
- return countryHits;
- }
-
- public Map<String, Set<String>> getNameCodesMap() {
- return nameCodesMap;
- }
-
- public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
- this.nameCodesMap = nameCodesMap;
- }
-
- /**
- * Finds mentions of countries based on a list from MySQL stored procedure
- * called getCountryList. This method finds country mentions in documents,
- * which is an essential element of the scoring that is done for geo
- * linkedspans. Lazily loads the list from the database.
- *
- * @param docText the full text of the document
- * @param properties EntityLinkerProperties for getting database connection
- * @return
- */
- public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
- countryMentions = new HashMap<>();
- nameCodesMap.clear();
- try {
-
- if (countrydata == null) {
- countrydata = getCountryContextFromFile(properties);
- // countrydata = getCountryData(properties);
- }
- for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
- Matcher rs = regex.matcher(docText);
- String code = entry.getCc1().toLowerCase();
-
- boolean found = false;
- while (rs.find()) {
- found = true;
- Integer start = rs.start();
- String hit = rs.group().toLowerCase();
- if (countryMentions.containsKey(code)) {
- countryMentions.get(code).add(start);
- } else {
- Set<Integer> newset = new HashSet<Integer>();
- newset.add(start);
- countryMentions.put(code, newset);
- }
- if (!hit.equals("")) {
- if (this.nameCodesMap.containsKey(hit)) {
- nameCodesMap.get(hit).add(code);
- } else {
- HashSet<String> newset = new HashSet<String>();
- newset.add(code);
- nameCodesMap.put(hit, newset);
- }
- }
- }
- if (found) {
- countryHits.add(entry);
- }
-
- }
-
- } catch (Exception ex) {
- Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
- }
-
-
- return countryMentions;
- }
-
- private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
- List<CountryContextEntry> entries = new ArrayList<>();
- String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
- BufferedReader reader;
-
- try {
- path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
- reader = new BufferedReader(new FileReader(path));
-
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split("\t");
- if (values.length != 4) {
- throw new IOException("improperly formatted country context file");
- }
- CountryContextEntry entry = new CountryContextEntry();
- // rc,cc1, full_name_nd_ro,dsg
- entry.setRc(values[0].toLowerCase());
- entry.setCc1(values[1].toLowerCase());
- entry.setFull_name_nd_ro(values[2].toLowerCase());
- entry.setDsg(values[3].toLowerCase());
- entries.add(entry);
- }
- reader.close();
- } catch (IOException e) {
- System.err.println(e);
- }
- return entries;
-
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
deleted file mode 100644
index a32642b..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.Objects;
-
-/**
- *Stores a tuple from mysql that is used to find country mentions in document text.
- *
- */
-public class CountryContextEntry {
- /*
- * rc,cc1, full_name_nd_ro,dsg
- */
-
- private String rc;
- private String cc1;
- private String full_name_nd_ro;
- private String dsg;
- private String provCode;
- public CountryContextEntry() {
- }
-
- public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {
- this.rc = rc;
- this.cc1 = cc1;
- this.full_name_nd_ro = full_name_nd_ro;
- this.dsg = dsg;
- }
-
- public String getProvCode() {
- return provCode;
- }
-
- public void setProvCode(String provCode) {
- this.provCode = provCode;
- }
-
- public String getRc() {
- return rc;
- }
-
- public void setRc(String rc) {
- this.rc = rc;
- }
-
- public String getCc1() {
- return cc1;
- }
-
- public void setCc1(String cc1) {
- this.cc1 = cc1;
- }
-
- public String getFull_name_nd_ro() {
- return full_name_nd_ro;
- }
-
- public void setFull_name_nd_ro(String full_name_nd_ro) {
- this.full_name_nd_ro = full_name_nd_ro;
- }
-
- public String getDsg() {
- return dsg;
- }
-
- public void setDsg(String dsg) {
- this.dsg = dsg;
- }
-
- @Override
- public int hashCode() {
- int hash = 7;
- hash = 17 * hash + Objects.hashCode(this.rc);
- hash = 17 * hash + Objects.hashCode(this.cc1);
- hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro);
- hash = 17 * hash + Objects.hashCode(this.dsg);
- return hash;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null) {
- return false;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- final CountryContextEntry other = (CountryContextEntry) obj;
- if (!Objects.equals(this.rc, other.rc)) {
- return false;
- }
- if (!Objects.equals(this.cc1, other.cc1)) {
- return false;
- }
- if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) {
- return false;
- }
- if (!Objects.equals(this.dsg, other.dsg)) {
- return false;
- }
- return true;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
deleted file mode 100644
index 694cec6..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-/**
- *Stores a "hit" on a country and the start and end of the hit
-
- */
-public class CountryContextHit {
-
- private String countryCode;
- private int start;
- private int end;
-
- public CountryContextHit() {
- }
-
- public CountryContextHit(String countryCode, int start, int end) {
- this.countryCode = countryCode;
- this.start = start;
- this.end = end;
- }
-
- public String getCountryCode() {
- return countryCode;
- }
-
- public void setCountryCode(String countryCode) {
- this.countryCode = countryCode;
- }
-
- public int getStart() {
- return start;
- }
-
- public void setStart(int start) {
- this.start = start;
- }
-
- public int getEnd() {
- return end;
- }
-
- public void setEnd(int end) {
- this.end = end;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
deleted file mode 100644
index 36bfb86..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponyms based on country context as well as fuzzy string matching
- */
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
-
- private Map<String, Set<String>> nameCodesMap;
- String dominantCode = "";
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
-
- score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
-
- }
-
- /**
- * Assigns a score to each BaseLink in each linkedSpan's set of N best
- * matches. Currently the scoring indicates the probability that the toponym
- * is correct based on the country context in the document and fuzzy string
- * matching
- *
- * @param linkedData the linked spans, holds the Namefinder results, and
- * the list of BaseLink for each
- * @param countryHits all the country mentions in the document
- * @param nameCodesMap maps a country indicator name to a country code. Used
- * to determine if the namefinder found the same exact
- * toponym the country context did. If so the score is
- * boosted due to the high probability that the
- * NameFinder actually "rediscovered" a country
- * @param docText the full text of the document...not used in this
- * default implementation
- * @param sentences the sentences that correspond to the doc text.
- * @param maxAllowedDist a constant that is used to determine which country
- * mentions, based on proximity within the text, should
- * be used to score the Named Entity.
- * @return
- */
- public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
- this.nameCodesMap = nameCodesMap;
- setDominantCode(countryHits);
- for (LinkedSpan<BaseLink> linkedspan : linkedData) {
-
- linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
- }
- return linkedData;
- }
-
- /**
- * sets class level variable to a code based on the number of mentions
- *
- * @param countryHits
- */
- private void setDominantCode(Map<String, Set<Integer>> countryHits) {
- int hits = -1;
- for (String code : countryHits.keySet()) {
- if (countryHits.get(code).size() > hits) {
- hits = countryHits.get(code).size();
- dominantCode = code;
- }
- }
- }
-
- /**
- * Generates distances from each country mention to the span's location in the
- * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
- * are resolved to the correct country and coordinate.
- *
- * @param sentences
- * @param countryHits
- * @param span
- * @return
- */
- private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
- Double score = 0.0;
- //get the index of the actual span, begining of sentence
- //should generate tokens from sentence and create a char offset...
- //could have large sentences due to poor sentence detection or wonky doc text
- int sentenceIdx = span.getSentenceid();
- int sentIndexInDoc = sentences[sentenceIdx].getStart();
- /**
- * create a map of all the span's proximal country mentions in the document
- * Map< countrycode, set of <distances from this NamedEntity>>
- */
- Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
- //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
- for (String cCode : countryHits.keySet()) {
-//iterate over all the regex start values and calculate an offset
- for (Integer cHit : countryHits.get(cCode)) {
- Integer absDist = Math.abs(sentIndexInDoc - cHit);
- //only include near mentions based on a heuristic
- //TODO make this a property
- // if (absDist < maxAllowedDistance) {
- if (distancesFromCodeMap.containsKey(cCode)) {
- distancesFromCodeMap.get(cCode).add(absDist);
- } else {
- HashSet<Integer> newset = new HashSet<Integer>();
- newset.add(absDist);
- distancesFromCodeMap.put(cCode, newset);
- }
- }
-
- //}
- }
- //we now know how far this named entity is from every country mention in the document
-
- /**
- * the gaz matches that have a country code that have mentions in the doc
- * that are closest to the Named Entity should return the best score.
- * Analyzemap generates a likelihood score that the toponym from the gaz is
- * referring to one of the countries, i.e, Map<countrycode, prob that this
- * span is referring to the toponym form this code key>
- */
- Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
- for (BaseLink link : span.getLinkedEntries()) {
- //getItemParentId is the country code
- String spanCountryCode = link.getItemParentID();
- if (scoreMap.containsKey(spanCountryCode)) {
-
- score = scoreMap.get(spanCountryCode);
- ///does the name extracted match a country name?
- if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
- //if so, is it the correct country code for that name?
- if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
- //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
- //TODO: make this multiplier configurable
- score = (score + .75) > 1.0 ? 1d : (score + .75);
-
- if (link.getItemParentID().equals(dominantCode)) {
- score = (score + .25) > 1.0 ? 1d : (score + .25);
- }
- }
- }
- }
- link.getScoreMap().put("countrycontext", score);
- }
- return span;
- }
-
- /**
- * takes a map of distances from the NE to each country mention and generates
- * a map of scores for each country code. The map is then correlated to teh
- * correlated to the code of the BaseLink parentid for retrieval. Then the
- * score is added to the overall.
- *
- * @param distanceMap
- * @param sentences
- * @param span
- * @return
- */
- private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
-
- Map<String, Double> scoreMap = new HashMap<String, Double>();
- if (distanceMap.isEmpty()) {
- return scoreMap;
- }
- TreeSet<Integer> all = new TreeSet<Integer>();
- for (String key : distanceMap.keySet()) {
- all.addAll(distanceMap.get(key));
- }
- //get min max for normalization, this could be more efficient
-
- Integer min = all.first();
- Integer max = all.last();
- if (min == max) {
- min = 0;
- }
- for (String key : distanceMap.keySet()) {
-
- TreeSet<Double> normalizedDistances = new TreeSet<Double>();
- for (Integer i : distanceMap.get(key)) {
- Double norm = normalize(i, min, max);
- //reverse the normed distance so low numbers (closer) are better
- //this could be improved with a "decaying " function using an imcreaseing negative exponent
- Double reverse = Math.abs(norm - 1);
- normalizedDistances.add(reverse);
- }
-
-
- List<Double> doubles = new ArrayList<Double>(normalizedDistances);
- scoreMap.put(key, slidingDistanceAverage(doubles));
- }
- return scoreMap;
- }
-
- /**
- * this method is an attempt to make closer clusters of mentions group
- * together to smooth out the average, so one distant outlier does not kill
- * the score for an obviously good hit. More elegant solution is possible
- * using Math.pow, and making the score decay with distance by using an
- * increasing negative exponent
- *
- * @param normDis the normalized and sorted set of distances as a list
- * @return
- */
- private Double slidingDistanceAverage(List<Double> normDis) {
- List<Double> windowOfAverages = new ArrayList<Double>();
-
- if (normDis.size() < 3) {
- windowOfAverages.addAll(normDis);
- } else {
-
- for (int i = 0; i < normDis.size() - 1; i++) {
- double a = normDis.get(i);
- double b = normDis.get(i + 1);
- windowOfAverages.add((a + b) / 2);
-
- }
- }
- double sum = 0d;
- for (double d : windowOfAverages) {
- sum += d;
- }
- double result = sum / windowOfAverages.size();
- //TODO: ++ prob when large amounts of mentions for a code
- //System.out.println("avg of window:" + result);
- return result;
- }
-
- /**
- * transposes a value within one range to a relative value in a different
- * range. Used to normalize distances in this class.
- *
- * @param valueToNormalize the value to place within the new range
- * @param minimum the min of the set to be transposed
- * @param maximum the max of the set to be transposed
- * @return
- */
- private Double normalize(int valueToNormalize, int minimum, int maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
deleted file mode 100644
index af1aa1c..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.ngram.NGramGenerator;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Generates scores for string comparisons.
- */
-public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
- for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
- for (BaseLink link : linkedSpan.getLinkedEntries()) {
- Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
- link.getScoreMap().put("dice", dice);
- Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
- link.getScoreMap().put("levenshtein", ld);
- }
- }
-
-
- }
-
- /**
- * Generates a score based on an overlap of nGrams between two strings using
- * the DiceCoefficient technique.
- *
- * @param s1 first string
- * @param s2 second string
- * @param nGrams number of chars in each gram
- * @return
- */
- public double getDiceCoefficient(String s1, String s2, int nGrams) {
- if (s1.equals("") || s1.equals("")) {
- return 0d;
- }
- List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
- List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
-
- Set<String> overlap = new HashSet<String>(s1Grams);
- overlap.retainAll(s2Grams);
- double totcombigrams = overlap.size();
-
- return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
- }
-
- private int minimum(int a, int b, int c) {
- return Math.min(Math.min(a, b), c);
- }
-
- public int getLevenshteinDistance(CharSequence str1,
- CharSequence str2) {
- int[][] distance = new int[str1.length() + 1][str2.length() + 1];
-
- for (int i = 0; i <= str1.length(); i++) {
- distance[i][0] = i;
- }
- for (int j = 1; j <= str2.length(); j++) {
- distance[0][j] = j;
- }
-
- for (int i = 1; i <= str1.length(); i++) {
- for (int j = 1; j <= str2.length(); j++) {
- distance[i][j] = minimum(
- distance[i - 1][j] + 1,
- distance[i][j - 1] + 1,
- distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
- }
- }
-
- return distance[str1.length()][str2.length()];
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
deleted file mode 100644
index 2371333..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.HashMap;
-import java.util.Map;
-import opennlp.tools.entitylinker.domain.BaseLink;
-
-/**
- *
- * Stores a record from a geographic placenames gazateer
- */
-public class GazateerEntry extends BaseLink {
-
- private Double latitude;
- private Double longitude;
- private String source;
- private String indexID;
- private Map<String, String> indexData=new HashMap<>();
-
- public String getIndexID() {
- return indexID;
- }
-
- public void setIndexID(String indexID) {
- this.indexID = indexID;
- }
-
- public Double getLatitude() {
- return latitude;
- }
-
- public void setLatitude(Double latitude) {
- this.latitude = latitude;
- }
-
- public Double getLongitude() {
- return longitude;
- }
-
- public void setLongitude(Double longitude) {
- this.longitude = longitude;
- }
-
- public String getSource() {
- return source;
- }
-
- public void setSource(String source) {
- this.source = source;
- }
-
- public Map<String, String> getIndexData() {
- return indexData;
- }
-
- public void setIndexData(Map<String, String> indexData) {
- this.indexData = indexData;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
deleted file mode 100644
index d8be425..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.ru.RussianAnalyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.th.ThaiAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-
-/**
- *
- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
- */
-public class GazateerIndexer {
-
- public GazateerIndexer() {
- loadAnalyzerMap();
- }
- Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
-
- public static interface Separable {
-
- String getSeparator();
- }
-
- public enum GazType implements Separable {
-
- GEONAMES {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_geonames_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\t";
- }
- },
- USGS {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_usgsgaz_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\\|";
- }
- }
- }
-
- public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
- if (!outputIndexDir.isDirectory()) {
- throw new IllegalArgumentException("outputIndexDir must be a directory.");
- }
-
- String indexloc = outputIndexDir + type.toString();
- Directory index = new MMapDirectory(new File(indexloc));
-
- Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
-
- IndexWriter w = new IndexWriter(index, config);
-
- readFile(gazateerInputData, w, type);
- w.commit();
- w.close();
-
- }
-
- public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<String>();
- int counter = 0;
- int langCodeIndex = 0;
- System.out.println("reading gazateer data from file...........");
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- // build fields
- for (int i = 0; i < values.length; i++) {
- String columnName = values[i];
- fields.add(columnName.replace("»¿", "").trim());
- if (columnName.toLowerCase().equals("lc")) {
- langCodeIndex = i;
- }
- }
-
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
- }
- if (type == GazType.GEONAMES) {
- /**
- * see if the map contains a language specific analyzer
- */
- if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
- /*
- * if so retrieve it from the map
- */
- Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
- /**
- * index the doc using the specified analyzer
- */
- w.addDocument(doc, analyzer);
- } else {
- w.addDocument(doc);
- }
- } else {
- w.addDocument(doc);
- }
- }
- counter++;
- if (counter % 10000 == 0) {
- w.commit();
- System.out.println(counter + " .........committed to index..............");
- }
-
- }
- w.commit();
- System.out.println("Completed indexing gaz! index name is: " + type.toString());
- }
-/**
- * TODO: make these analyzers configurable
- */
- private void loadAnalyzerMap() {
- languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java
deleted file mode 100644
index d4470d9..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- *
- * Caches gazateer query results statically
- */
-public class GazateerSearchCache {
-
- private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
-
-
- public static synchronized ArrayList<GazateerEntry> get(String searchString) {
- return gazCache.get(searchString);
- }
-
- public static synchronized void put(String searchString, ArrayList<GazateerEntry> hits) {
- if (gazCache.size() > 10000) {
- gazCache.clear();
- }
- if (!gazCache.containsKey(searchString)) {
- gazCache.put(searchString, hits);
- }
- }
-
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
deleted file mode 100644
index ca9b93f..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.queryparser.classic.ParseException;
-
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-
-/**
- *
- * Searches Gazateers stored in a MMapDirectory lucene index
- */
-public class GazateerSearcher {
-
- private double scoreCutoff = .75;
- private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
- private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
- private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer geonamesAnalyzer;
- //usgs US gazateer
- private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
- private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
- private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer usgsAnalyzer;
-
- public GazateerSearcher() {
- }
-
- /**
- *
- * @param searchString the nameed entity to look up in the lucene index
- * @param rowsReturned how many rows to allow lucene to return
- * @param code the country code
- * @param properties properties file that states where the lucene indexes
- * are
- * @return
- */
- public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
- try {
- /**
- * build the search string
- */
- String luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
- : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
- /**
- * check the cache and go no further if the records already exist
- */
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
- if (get != null) {
- return get;
- }
- if (geonamesIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
- scoreCutoff = Double.valueOf(cutoff);
- geonamesIndex = new MMapDirectory(new File(indexloc));
- geonamesReader = DirectoryReader.open(geonamesIndex);
- geonamesSearcher = new IndexSearcher(geonamesReader);
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
-
- }
-
-
-
- QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
- Query q = parser.parse(luceneQueryString);
-
-
- TopDocs search = geonamesSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
-
- for (int i = 0; i < search.scoreDocs.length; ++i) {
- GazateerEntry entry = new GazateerEntry();
- int docId = search.scoreDocs[i].doc;
- double sc = search.scoreDocs[i].score;
-
- entry.getScoreMap().put("lucene", sc);
-
- entry.getScoreMap().put("rawlucene", sc);
- entry.setIndexID(docId + "");
- entry.setSource("geonames");
-
- Document d = geonamesSearcher.doc(docId);
- List<IndexableField> fields = d.getFields();
- for (int idx = 0; idx < fields.size(); idx++) {
- String value = d.get(fields.get(idx).name());
- value = value.toLowerCase();
- switch (idx) {
- case 1:
- entry.setItemID(value);
- break;
- case 3:
- entry.setLatitude(Double.valueOf(value));
- break;
- case 4:
- entry.setLongitude(Double.valueOf(value));
- break;
- case 10:
- entry.setItemType(value);
- break;
- case 12:
- entry.setItemParentID(value);
- break;
- case 23:
- entry.setItemName(value);
- break;
- }
- entry.getIndexData().put(fields.get(idx).name(), value);
- }
- //only keep it if the country code is a match
- if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
- linkedData.add(entry);
- }
- }
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- } catch (IOException | ParseException ex) {
- System.err.println(ex);
- }
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(searchString, linkedData);
- return linkedData;
- }
-
- /**
- * Looks up the name in the USGS gazateer, after checking the cache
- *
- * @param searchString the nameed entity to look up in the lucene index
- * @param rowsReturned how many rows to allow lucene to return
- *
- * @param properties properties file that states where the lucene indexes
- * @return
- */
- public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
- try {
-
- String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
- /**
- * hit the cache
- */
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
- if (get != null) {
- //if the name is already there, return the list of cavhed results
- return get;
- }
- if (usgsIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
- scoreCutoff = Double.valueOf(cutoff);
- usgsIndex = new MMapDirectory(new File(indexloc));
- usgsReader = DirectoryReader.open(usgsIndex);
- usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
- }
-
-
- QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
- Query q = parser.parse(luceneQueryString);
-
-
- TopDocs search = usgsSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
-
-
- for (int i = 0; i < search.scoreDocs.length; ++i) {
- GazateerEntry entry = new GazateerEntry();
- int docId = search.scoreDocs[i].doc;
- double sc = search.scoreDocs[i].score;
- //keep track of the min score for normalization
-
- entry.getScoreMap().put("lucene", sc);
- entry.getScoreMap().put("rawlucene", sc);
- entry.setIndexID(docId + "");
- entry.setSource("usgs");
- entry.setItemParentID("us");
-
-
- Document d = usgsSearcher.doc(docId);
- List<IndexableField> fields = d.getFields();
- for (int idx = 0; idx < fields.size(); idx++) {
- String value = d.get(fields.get(idx).name());
- value = value.toLowerCase();
- switch (idx) {
- case 0:
- entry.setItemID(value);
- break;
- case 1:
- entry.setItemName(value);
- break;
- case 2:
- entry.setItemType(value);
- break;
- case 9:
- entry.setLatitude(Double.valueOf(value));
- break;
- case 10:
- entry.setLongitude(Double.valueOf(value));
- break;
- }
- entry.getIndexData().put(fields.get(idx).name(), value);
- }
- linkedData.add(entry);
-
-
- }
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- } catch (IOException | ParseException ex) {
- System.err.println(ex);
- }
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(searchString, linkedData);
- return linkedData;
- }
-
- private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
- for (GazateerEntry gazateerEntry : linkedData) {
-
- double luceneScore = gazateerEntry.getScoreMap().get("lucene");
- luceneScore = normalize(luceneScore, minScore, maxScore);
- luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
- luceneScore = (luceneScore == Double.NaN) ? 0.001 : luceneScore;
- gazateerEntry.getScoreMap().put("lucene", luceneScore);
- }
- }
-
- private void prune(ArrayList<GazateerEntry> linkedData) {
- for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
- GazateerEntry ge = itr.next();
- if (ge.getScoreMap().get("lucene") < scoreCutoff) {
- itr.remove();
- }
- }
- }
-
- private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
deleted file mode 100644
index 05c63d7..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.EntityLinker;
-
-/**
- * Links location entities to gazatteers. Currently supports gazateers in a
- * MySql database (NGA and USGS)
- *
- *
- */
-public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
-
- private CountryContext countryContext;
- private Map<String, Set<Integer>> countryMentions;
- private EntityLinkerProperties linkerProperties;
- private GazateerSearcher gazateerSearcher = new GazateerSearcher();
- private List<LinkedEntityScorer> scorers = new ArrayList<>();
- /**
- * Flag for deciding whether to search gaz only for toponyms within countries
- * that are mentioned in the document
- */
- private Boolean filterCountryContext = true;
-
- public GeoEntityLinker() {
- countryContext = new CountryContext();
- }
-
- @Override
- public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
- ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
-
- if (linkerProperties == null) {
- throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
- }
- countryMentions = countryContext.regexfind(doctext, linkerProperties);
-
- for (int s = 0; s < sentences.length; s++) {
- Span[] names = namesBySentence[s];
- String[] tokens = tokensBySentence[s];
- String[] matches = Span.spansToStrings(names, tokens);
-
- for (int i = 0; i < matches.length; i++) {
-
-//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
- ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
- if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
- // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
- if (!countryMentions.keySet().isEmpty()) {
- for (String code : countryMentions.keySet()) {
- if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
- }
- }
- } else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
-
- }
-
- }
- ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
- if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
- //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
- usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
- }
- LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
-
- if (!usgsEntries.isEmpty()) {
- geoSpan.getLinkedEntries().addAll(usgsEntries);
- geoSpan.setSearchTerm(matches[i]);
- }
-
- if (!geoSpan.getLinkedEntries().isEmpty()) {
- geoSpan.setSearchTerm(matches[i]);
- geoSpan.setSentenceid(s);
- spans.add(geoSpan);
- }
- }
- }
-
- if (scorers.isEmpty()) {
- scorers.add(new FuzzyStringMatchScorer());
- scorers.add(new GeoHashBinningScorer());
- scorers.add(new CountryProximityScorer());
- scorers.add(new ModelBasedScorer());
- }
- for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
- }
- return spans;
- }
-
- @Override
- public void setEntityLinkerProperties(EntityLinkerProperties properties) {
- this.linkerProperties = properties;
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
deleted file mode 100644
index b1b9d11..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-import static org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.ModelBasedScorer.RADIUS;
-
-
-/**
- *
- * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
- */
-public class GeoEntityLinkerSetupUtils {
- public static ModelBasedScorer scorer;
-
- static {
- scorer = new ModelBasedScorer();
- }
- public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
- GazateerIndexer indexer = new GazateerIndexer();
- try {
- indexer.index(outputIndexDir, gazateerInputData, type);
- } catch (Exception ex) {
- ex.printStackTrace();
- }
- }
- /**
- *
- * @param documents A list of document texts, for best results try to
- * ensure each country you care about will be
- * represented in the collection
- * @param annotationOutFile the location where the annotated doccat text file
- * will be stored
- * @param modelOutFile the location where the doccat model will be stored
- * @param properties the properties where the country context object
- * will find it's country data from this property:
- * opennlp.geoentitylinker.countrycontext.filepath
- * @throws IOException
- */
- public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
- CountryContext context = new CountryContext();
- FileWriter writer = new FileWriter(annotationOutFile, true);
- System.out.println("processing " + documents.size() + " documents");
- for (String docText : documents) {
- System.out.append(".");
- Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
- Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
- for (String key : modelCountryContext.keySet()) {
- for (String wordbag : modelCountryContext.get(key)) {
- writer.write(key + " " + wordbag + "\n");
- }
- }
- }
- System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
- writer.close();
- System.out.println("Building Doccat model...");
- DoccatModel model = null;
-
- InputStream dataIn = new FileInputStream(annotationOutFile);
- try {
-
- ObjectStream<String> lineStream =
- new PlainTextByLineStream(dataIn, "UTF-8");
- ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
-
- model = DocumentCategorizerME.train("en", sampleStream);
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
- model.serialize(modelOut);
- System.out.println("Model complete!");
- } catch (IOException e) {
- // Failed to read or parse training data, training failed
- e.printStackTrace();
- }
-
- }
-
- /**
- * generates proximal wordbags within the radius of a country mention within
- * the doctext based on the country context object
- *
- *
- * @param docText
- * @param additionalContext
- * @param radius
- * @return
- */
- private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
- Map<String, ArrayList< String>> featureBags = new HashMap<>();
- Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (String code : countryMentions.keySet()) {
- /**
- * for each mention, collect features from around each mention, then
- * consolidate the features into another map
- */
- for (int mentionIdx : countryMentions.get(code)) {
- String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
- // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
- if (featureBags.containsKey(code)) {
- featureBags.get(code).add(chunk);
- } else {
- ArrayList<String> newlist = new ArrayList<>();
- newlist.add(chunk);
- featureBags.put(code, newlist);
- }
- }
- }
- return featureBags;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
deleted file mode 100644
index 26b69c1..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *Scores toponymns based on geographic point binning (clustering). This classes output is highly dependant on the quality
- * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
- * Lucene search is set to an appropriate level so this class if not fed poor data.
- */
-public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
- score( linkedSpans);
- }
-
- private void score(List<LinkedSpan> geospans) {
- Map<Double, Double> latLongs = new HashMap<Double, Double>();
-
- /**
- * collect all the lat longs
- */
- for (LinkedSpan<BaseLink> ls : geospans) {
- for (BaseLink bl : ls.getLinkedEntries()) {
- if (bl instanceof GazateerEntry) {
- GazateerEntry entry = (GazateerEntry) bl;
- latLongs.put(entry.getLatitude(), entry.getLongitude());
-
- }
- }
- }
-
- /**
- * convert to geohash and add to sortedset
- */
- TreeSet<Long> geoHashes = new TreeSet<Long>();
- for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {
- geoHashes.add(geoHash(entry.getKey(), entry.getValue()));
- }
- /**
- * bin the points and generate a scoremap
- */
- Map<Long, Set<Long>> bins = bin(geoHashes);
- Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);
- /**
- * iterate over the data again and assign the score based on the bins
- */
- for (LinkedSpan<BaseLink> ls : geospans) {
- for (BaseLink bl : ls.getLinkedEntries()) {
- Long geohash = -1L;
- Double score = 0d;
- if (bl instanceof GazateerEntry) {
- GazateerEntry entry = (GazateerEntry) bl;
- geohash = geoHash(entry.getLatitude(), entry.getLongitude());
-
- }
- if (scores.containsKey(geohash)) {
- score = scores.get(geohash);
-
- } else {
- for (Long bin : bins.keySet()) {
- if (bin == geohash || bins.get(bin).contains(geohash)) {
- score = scores.get(bin);
- break;
- }
- }
- }
- bl.getScoreMap().put("geohashbin", score);
- }
- }
-
-
- }
-
- private Long normalize(Double coordpart, Boolean isLat) {
- Integer add = isLat ? 90 : 180;
- coordpart = Math.abs(coordpart + add);
- coordpart = coordpart * 1000000;
-
- Long l = Math.round(coordpart);
- String coord = String.valueOf(l);
- if (coord.length() < 8) {
- while (coord.length() < 8) {
- coord += "0";
- }
- }
- coord = coord.substring(0, 8);
- l = Long.valueOf(coord);
- return l;
- }
-
- /**
- * interleaves a lat and a long to place the coordinate in linear sortable
- * space for binning simplicity
- *
- * @param lat
- * @param lon
- * @return
- */
- private Long geoHash(double lat, double lon) {
- Long normLat = normalize(lat, Boolean.TRUE);
- Long normLon = normalize(lon, Boolean.FALSE);
- String sLat = String.valueOf(normLat);
- String sLon = String.valueOf(normLon);
- char[] latInts = sLat.toCharArray();
- char[] lonInts = sLon.toCharArray();
- String geoHash = "";
- int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;
- for (int i = 0; i < len - 1; i++) {
- String a = String.valueOf(latInts[i]);
- String b = String.valueOf(lonInts[i]);
- geoHash += a + b;
- }
-
- return Long.valueOf(geoHash);
- }
-
- private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {
- ArrayList<Long> list = new ArrayList<Long>(sets);
- ArrayList<Long> diffs = new ArrayList<Long>();
- /**
- * create a set of differences between the points
- */
- for (int i = 0; i < list.size() - 1; i++) {
- Long n = list.get(i + 1);
- Long v = list.get(i);
- diffs.add(Math.abs(n - v));
- }
- /**
- * generate an average "distance" between the normed points
- */
- Long sum = 0L;
- for (Long l : diffs) {
- sum += l;
- }
- Long avg=sum;
- if(!diffs.isEmpty()){
- avg = sum / diffs.size();
- }
-
-
- /**
- * generate break values where the disparity is greater than the average
- */
- TreeSet<Long> breaks = new TreeSet<Long>();
- for (int i = 0; i < list.size() - 1; i++) {
- Long n = list.get(i + 1);
- Long v = list.get(i);
- //Long percent = 100 - (v / n * 100);
- Long diff = n - v;
- if (diff > avg) {
- breaks.add(v);
- }
- }
- /**
- * based on the break values, place subsets of close points into bins
- */
- TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();
- Long lastBreak = -1L;
- for (Long br : breaks) {
- if (lastBreak == -1L) {
- binToAmount.put(br, sets.subSet(0L, true, br, true));
- } else {
- binToAmount.put(br, sets.subSet(lastBreak, false, br, true));
- }
- lastBreak = br;
- }
- lastBreak = sets.higher(lastBreak);
- if (lastBreak != null) {
- binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));
- if (binToAmount.get(lastBreak).isEmpty()) {
- binToAmount.get(lastBreak).add(lastBreak);
- }
- }
- /**
- * "binToAmount" is a map of the break value to all the points behind it
- * (it's sorted), so the key is the max value of its set of values
- */
- return binToAmount;
- }
-
- /**
- * returns a map of geohashes and their score
- *
- * @param binToAmount
- * @return Map< Geohash, score>
- */
- private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {
- TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();
- TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();
- /**
- * if there is only one bin return 1 as the rank for each item in the value
- */
- if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {
- for (Long bin : binToAmount.keySet()) {
- for (Long hash : binToAmount.get(bin)) {
- ranks.put(bin, 1d);
- }
- }
- return ranks;
- }
- int total = 0;
- /**
- * generate a total number of points
- */
- for (Set<Long> geohashes : binToAmount.values()) {
- total += geohashes.size();
- }
- /**
- * divide total by bin size, largest bin size gets best score, everything in
- * that bin gets that score because it is part of that primary cluster
- * TODO... do an extra iteration of clustering within the predominant
- * cluster to refine the scoring or make the basis of the binning more
- * granular than > avg
- */
- TreeSet<Double> rankSet = new TreeSet<Double>();
- for (Long key : binToAmount.keySet()) {
- int size = binToAmount.get(key).size();
- Double rank = (double) total / size;
- rankSet.add(rank);
- ranks.put(key, rank);
- }
- /**
- * load the final score map with normalized values
- */
- for (Map.Entry<Long, Double> rank : ranks.entrySet()) {
- double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);
- double reverse = Math.abs(norm - 1);
- double score = reverse > 1d ? 1.0 : reverse;
- normRanks.put(rank.getKey(), score);
- }
-
- return normRanks;
- }
-
- /**
- * transposes a number in a range to a double between 0 and 1
- *
- * @param valueToNormalize the value to be normalized (placed within a new
- * range of 0-1)
- * @param minimum the min of the current range
- * @param maximum the max of the current range
- * @return
- */
- private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
-
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
deleted file mode 100644
index 3f7d5fa..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.List;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Structure for scoring linked entities. The Map logically represents a pair :
- * "Score type" to the "actual Score."
- */
-public interface LinkedEntityScorer<T> {
-
-/**
- * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
- * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
- * @param docText the full text of the document.
- * @param sentenceSpans the sentence spans the correspond to the document text
- * @param additionalContext any additional data required to perform the scoring operation
- * @return void
- */
- void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
deleted file mode 100644
index e25ba07..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Utilizes a doccat model to score toponyms based on surrounding context
- */
-public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
-
-
- DocumentCategorizerME documentCategorizerME;
- DoccatModel doccatModel;
- public static final int RADIUS = 100;
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
- try {
- if (doccatModel == null) {
- String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
- if (path.equals("")) {
- return;
- }
- doccatModel = new DoccatModel(new File(path));
- documentCategorizerME = new DocumentCategorizerME(doccatModel);
- }
- Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
- for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
- Map<String, Double> scores = this.getScore(entry.getValue());
- for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
- double score = 0d;
- if (scores.containsKey(link.getItemParentID())) {
- score = scores.get(link.getItemParentID());
- }
- link.getScoreMap().put("countrymodel", score);
- }
- }
-
- } catch (FileNotFoundException ex) {
- System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
- } catch (IOException ex) {
- System.err.println(ex);
- } catch (Exception ex) {
- System.err.println(ex);
- }
- }
-
- /**
- * generates features using a BagOfWordsfeatureGenerator that are within the
- * radius of a mention within the doctext
- *
- * @param linkedSpans
- * @param docText
- * @param additionalContext
- * @param radius
- * @return a map of the index of the linked span to the string of surrounding
- * text: Map<indexofspan,surrounding text>
- */
- public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
- Map<Integer, String> featureBags = new HashMap<>();
- Map<Integer, Integer> nameMentionMap = new HashMap<>();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (int i = 0; i < linkedSpans.size(); i++) {
- LinkedSpan span = linkedSpans.get(i);
- if (span.getLinkedEntries().isEmpty()) {
- //don't care about spans that did not get linked to anything at all; nothing to work with
- continue;
- }
- /**
- * get the sentence the name span was found in, the beginning of the
- * sentence will suffice as a centroid for feature generation around the
- * named entity
- */
- Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
- nameMentionMap.put(i, mentionIdx);
- }
- /**
- * now associate each span to a string that will be used for categorization
- * against the model.
- */
- for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
- featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
- }
-
-
- return featureBags;
- }
-
- public String getTextChunk(int mentionIdx, String docText, int radius) {
- int docSize = docText.length();
- int left = 0, right = 0;
- left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
- right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
- String chunk = "";
- if (right <= left) {
- chunk = "";
- } else {
- /**
- * don't want to chop any words in half, so take fron the first space to
- * the last space in the chunk string
- */
- chunk = docText.substring(left, right);
- if (left != 0) {
- left = chunk.indexOf(" ");
- }
- right = chunk.lastIndexOf(" ");
- /**
- * now get the substring again with only whole words
- */
- if (left < right) {
- chunk = chunk.substring(left, right);
- }
- }
-
- return chunk;
- }
-
- private Map<String, Double> getScore(String text) throws Exception {
- Map<String, Double> scoreMap = new HashMap<>();
- double[] categorize = documentCategorizerME.categorize(text);
- int catSize = documentCategorizerME.getNumberOfCategories();
- for (int i = 0; i < catSize; i++) {
- String category = documentCategorizerME.getCategory(i);
- scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
- }
- return scoreMap;
- }
-
-
-}