OPENNLP-626
renamed packages for consistency in addons; also made small efficiency improvements
diff --git a/apache-opennlp-addons/pom.xml b/apache-opennlp-addons/pom.xml
index 062afa4..6fd5059 100644
--- a/apache-opennlp-addons/pom.xml
+++ b/apache-opennlp-addons/pom.xml
@@ -1,28 +1,33 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ <relativePath>../opennlp/pom.xml</relativePath>
+ </parent>
- <groupId>apache-opennlp-addons</groupId>
- <artifactId>apache-opennlp-addons</artifactId>
+ <artifactId>geoentitylinker-addon</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
-<name>Apache OpenNLP Addons</name>
+ <name>geoentitylinker-addon</name>
<url>http://maven.apache.org</url>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>2.3.2</version>
- <configuration>
- <source>1.7</source>
- <target>1.7</target>
- </configuration>
- </plugin>
- </plugins>
- </build>
- <properties>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
@@ -33,7 +38,7 @@
<version>3.8.1</version>
<scope>test</scope>
</dependency>
- <dependency>
+ <dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.5.0</version>
@@ -51,7 +56,7 @@
<version>4.5.0</version>
<optional>true</optional>
</dependency>
- <dependency>
+ <dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.6.0-SNAPSHOT</version>
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
new file mode 100644
index 0000000..bc6d787
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ * Finds instances of country mentions in a String, typically a document text.
+ * Used to boost or degrade scoring of linked geo entities
+ *
+ */
+public class CountryContext {
+
+
+ private List<CountryContextEntry> countrydata;
+ private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
+ private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
+ private Set<CountryContextEntry> countryHits = new HashSet<>();
+
+ public CountryContext() {
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ public Set<CountryContextEntry> getCountryHits() {
+ return countryHits;
+ }
+
+ public Map<String, Set<String>> getNameCodesMap() {
+ return nameCodesMap;
+ }
+
+ public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
+ this.nameCodesMap = nameCodesMap;
+ }
+
+ /**
+ * Finds mentions of countries based on a list from MySQL stored procedure
+ * called getCountryList. This method finds country mentions in documents,
+ * which is an essential element of the scoring that is done for geo
+ * linkedspans. Lazily loads the list from the database.
+ *
+ * @param docText the full text of the document
+ * @param properties EntityLinkerProperties for getting database connection
+ * @return
+ */
+ public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
+ countryMentions = new HashMap<>();
+ nameCodesMap.clear();
+ try {
+
+ if (countrydata == null) {
+ countrydata = getCountryContextFromFile(properties);
+ // countrydata = getCountryData(properties);
+ }
+ for (CountryContextEntry entry : countrydata) {
+ Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Matcher rs = regex.matcher(docText);
+ String code = entry.getCc1().toLowerCase();
+
+ boolean found = false;
+ while (rs.find()) {
+ found = true;
+ Integer start = rs.start();
+ String hit = rs.group().toLowerCase();
+ if (countryMentions.containsKey(code)) {
+ countryMentions.get(code).add(start);
+ } else {
+ Set<Integer> newset = new HashSet<Integer>();
+ newset.add(start);
+ countryMentions.put(code, newset);
+ }
+ if (!hit.equals("")) {
+ if (this.nameCodesMap.containsKey(hit)) {
+ nameCodesMap.get(hit).add(code);
+ } else {
+ HashSet<String> newset = new HashSet<String>();
+ newset.add(code);
+ nameCodesMap.put(hit, newset);
+ }
+ }
+ }
+ if (found) {
+ countryHits.add(entry);
+ }
+
+ }
+
+ } catch (Exception ex) {
+ Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
+ }
+
+
+ return countryMentions;
+ }
+
+ private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
+ List<CountryContextEntry> entries = new ArrayList<>();
+ String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
+ BufferedReader reader;
+
+ try {
+ path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+
+ reader = new BufferedReader(new FileReader(path));
+
+ while (reader.read() != -1) {
+ String line = reader.readLine();
+ String[] values = line.split("\t");
+ if (values.length != 4) {
+ throw new IOException("improperly formatted country context file");
+ }
+ CountryContextEntry entry = new CountryContextEntry();
+ // rc,cc1, full_name_nd_ro,dsg
+ entry.setRc(values[0].toLowerCase());
+ entry.setCc1(values[1].toLowerCase());
+ entry.setFull_name_nd_ro(values[2].toLowerCase());
+ entry.setDsg(values[3].toLowerCase());
+ entries.add(entry);
+ }
+ reader.close();
+ } catch (IOException e) {
+ System.err.println(e);
+ }
+ return entries;
+
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
new file mode 100644
index 0000000..61cfcbb
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ *Stores a tuple from mysql that is used to find country mentions in document text.
+ *
+ */
+public class CountryContextEntry {
+ /*
+ * rc,cc1, full_name_nd_ro,dsg
+ */
+
+ private String rc;
+ private String cc1;
+ private String full_name_nd_ro;
+ private String dsg;
+ private String provCode;
+ public CountryContextEntry() {
+ }
+
+ public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {
+ this.rc = rc;
+ this.cc1 = cc1;
+ this.full_name_nd_ro = full_name_nd_ro;
+ this.dsg = dsg;
+ }
+
+ public String getProvCode() {
+ return provCode;
+ }
+
+ public void setProvCode(String provCode) {
+ this.provCode = provCode;
+ }
+
+ public String getRc() {
+ return rc;
+ }
+
+ public void setRc(String rc) {
+ this.rc = rc;
+ }
+
+ public String getCc1() {
+ return cc1;
+ }
+
+ public void setCc1(String cc1) {
+ this.cc1 = cc1;
+ }
+
+ public String getFull_name_nd_ro() {
+ return full_name_nd_ro;
+ }
+
+ public void setFull_name_nd_ro(String full_name_nd_ro) {
+ this.full_name_nd_ro = full_name_nd_ro;
+ }
+
+ public String getDsg() {
+ return dsg;
+ }
+
+ public void setDsg(String dsg) {
+ this.dsg = dsg;
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 7;
+ hash = 17 * hash + Objects.hashCode(this.rc);
+ hash = 17 * hash + Objects.hashCode(this.cc1);
+ hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro);
+ hash = 17 * hash + Objects.hashCode(this.dsg);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final CountryContextEntry other = (CountryContextEntry) obj;
+ if (!Objects.equals(this.rc, other.rc)) {
+ return false;
+ }
+ if (!Objects.equals(this.cc1, other.cc1)) {
+ return false;
+ }
+ if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) {
+ return false;
+ }
+ if (!Objects.equals(this.dsg, other.dsg)) {
+ return false;
+ }
+ return true;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
new file mode 100644
index 0000000..3df2392
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+/**
+ *Stores a "hit" on a country and the start and end of the hit
+
+ */
+public class CountryContextHit {
+
+ private String countryCode;
+ private int start;
+ private int end;
+
+ public CountryContextHit() {
+ }
+
+ public CountryContextHit(String countryCode, int start, int end) {
+ this.countryCode = countryCode;
+ this.start = start;
+ this.end = end;
+ }
+
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public void setCountryCode(String countryCode) {
+ this.countryCode = countryCode;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ public void setStart(int start) {
+ this.start = start;
+ }
+
+ public int getEnd() {
+ return end;
+ }
+
+ public void setEnd(int end) {
+ this.end = end;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
new file mode 100644
index 0000000..48ebccf
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on country context as well as fuzzy string matching
+ */
+public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+
+ private Map<String, Set<String>> nameCodesMap;
+ String dominantCode = "";
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+
+ score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+
+ }
+
+ /**
+ * Assigns a score to each BaseLink in each linkedSpan's set of N best
+ * matches. Currently the scoring indicates the probability that the toponym
+ * is correct based on the country context in the document and fuzzy string
+ * matching
+ *
+ * @param linkedData the linked spans, holds the Namefinder results, and
+ * the list of BaseLink for each
+ * @param countryHits all the country mentions in the document
+ * @param nameCodesMap maps a country indicator name to a country code. Used
+ * to determine if the namefinder found the same exact
+ * toponym the country context did. If so the score is
+ * boosted due to the high probability that the
+ * NameFinder actually "rediscovered" a country
+ * @param docText the full text of the document...not used in this
+ * default implementation
+ * @param sentences the sentences that correspond to the doc text.
+ * @param maxAllowedDist a constant that is used to determine which country
+ * mentions, based on proximity within the text, should
+ * be used to score the Named Entity.
+ * @return
+ */
+ public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
+ this.nameCodesMap = nameCodesMap;
+ setDominantCode(countryHits);
+ for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+
+ linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
+ }
+ return linkedData;
+ }
+
+ /**
+ * sets class level variable to a code based on the number of mentions
+ *
+ * @param countryHits
+ */
+ private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+ int hits = -1;
+ for (String code : countryHits.keySet()) {
+ if (countryHits.get(code).size() > hits) {
+ hits = countryHits.get(code).size();
+ dominantCode = code;
+ }
+ }
+ }
+
+ /**
+ * Generates distances from each country mention to the span's location in the
+ * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
+ * are resolved to the correct country and coordinate.
+ *
+ * @param sentences
+ * @param countryHits
+ * @param span
+ * @return
+ */
+ private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
+ Double score = 0.0;
+ //get the index of the actual span, begining of sentence
+ //should generate tokens from sentence and create a char offset...
+ //could have large sentences due to poor sentence detection or wonky doc text
+ int sentenceIdx = span.getSentenceid();
+ int sentIndexInDoc = sentences[sentenceIdx].getStart();
+ /**
+ * create a map of all the span's proximal country mentions in the document
+ * Map< countrycode, set of <distances from this NamedEntity>>
+ */
+ Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
+ //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
+ for (String cCode : countryHits.keySet()) {
+//iterate over all the regex start values and calculate an offset
+ for (Integer cHit : countryHits.get(cCode)) {
+ Integer absDist = Math.abs(sentIndexInDoc - cHit);
+ //only include near mentions based on a heuristic
+ //TODO make this a property
+ // if (absDist < maxAllowedDistance) {
+ if (distancesFromCodeMap.containsKey(cCode)) {
+ distancesFromCodeMap.get(cCode).add(absDist);
+ } else {
+ HashSet<Integer> newset = new HashSet<Integer>();
+ newset.add(absDist);
+ distancesFromCodeMap.put(cCode, newset);
+ }
+ }
+
+ //}
+ }
+ //we now know how far this named entity is from every country mention in the document
+
+ /**
+ * the gaz matches that have a country code that have mentions in the doc
+ * that are closest to the Named Entity should return the best score.
+ * Analyzemap generates a likelihood score that the toponym from the gaz is
+ * referring to one of the countries, i.e, Map<countrycode, prob that this
+ * span is referring to the toponym form this code key>
+ */
+ Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+ for (BaseLink link : span.getLinkedEntries()) {
+ //getItemParentId is the country code
+ String spanCountryCode = link.getItemParentID();
+ if (scoreMap.containsKey(spanCountryCode)) {
+
+ score = scoreMap.get(spanCountryCode);
+ ///does the name extracted match a country name?
+ if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+ //if so, is it the correct country code for that name?
+ if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
+ //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
+ //TODO: make this multiplier configurable
+ score = (score + .75) > 1.0 ? 1d : (score + .75);
+
+ if (link.getItemParentID().equals(dominantCode)) {
+ score = (score + .25) > 1.0 ? 1d : (score + .25);
+ }
+ }
+ }
+ }
+ link.getScoreMap().put("countrycontext", score);
+ }
+ return span;
+ }
+
+ /**
+ * takes a map of distances from the NE to each country mention and generates
+ * a map of scores for each country code. The map is then correlated to teh
+ * correlated to the code of the BaseLink parentid for retrieval. Then the
+ * score is added to the overall.
+ *
+ * @param distanceMap
+ * @param sentences
+ * @param span
+ * @return
+ */
+ private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
+
+ Map<String, Double> scoreMap = new HashMap<String, Double>();
+ if (distanceMap.isEmpty()) {
+ return scoreMap;
+ }
+ TreeSet<Integer> all = new TreeSet<Integer>();
+ for (String key : distanceMap.keySet()) {
+ all.addAll(distanceMap.get(key));
+ }
+ //get min max for normalization, this could be more efficient
+
+ Integer min = all.first();
+ Integer max = all.last();
+ if (min == max) {
+ min = 0;
+ }
+ for (String key : distanceMap.keySet()) {
+
+ TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+ for (Integer i : distanceMap.get(key)) {
+ Double norm = normalize(i, min, max);
+ //reverse the normed distance so low numbers (closer) are better
+ //this could be improved with a "decaying " function using an imcreaseing negative exponent
+ Double reverse = Math.abs(norm - 1);
+ normalizedDistances.add(reverse);
+ }
+
+
+ List<Double> doubles = new ArrayList<Double>(normalizedDistances);
+ scoreMap.put(key, slidingDistanceAverage(doubles));
+ }
+ return scoreMap;
+ }
+
+ /**
+ * this method is an attempt to make closer clusters of mentions group
+ * together to smooth out the average, so one distant outlier does not kill
+ * the score for an obviously good hit. More elegant solution is possible
+ * using Math.pow, and making the score decay with distance by using an
+ * increasing negative exponent
+ *
+ * @param normDis the normalized and sorted set of distances as a list
+ * @return
+ */
+ private Double slidingDistanceAverage(List<Double> normDis) {
+ List<Double> windowOfAverages = new ArrayList<Double>();
+
+ if (normDis.size() < 3) {
+ windowOfAverages.addAll(normDis);
+ } else {
+
+ for (int i = 0; i < normDis.size() - 1; i++) {
+ double a = normDis.get(i);
+ double b = normDis.get(i + 1);
+ windowOfAverages.add((a + b) / 2);
+
+ }
+ }
+ double sum = 0d;
+ for (double d : windowOfAverages) {
+ sum += d;
+ }
+ double result = sum / windowOfAverages.size();
+ //TODO: ++ prob when large amounts of mentions for a code
+ //System.out.println("avg of window:" + result);
+ return result;
+ }
+
+ /**
+ * transposes a value within one range to a relative value in a different
+ * range. Used to normalize distances in this class.
+ *
+ * @param valueToNormalize the value to place within the new range
+ * @param minimum the min of the set to be transposed
+ * @param maximum the max of the set to be transposed
+ * @return
+ */
+ private Double normalize(int valueToNormalize, int minimum, int maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
new file mode 100644
index 0000000..29cf58b
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.ngram.NGramGenerator;
+import opennlp.tools.util.Span;
+
+/**
+ *
+ * Generates scores for string comparisons.
+ */
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
+ for (BaseLink link : linkedSpan.getLinkedEntries()) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
+ link.getScoreMap().put("dice", dice);
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
+ link.getScoreMap().put("levenshtein", ld);
+ }
+ }
+
+
+ }
+
+ /**
+ * Generates a score based on an overlap of nGrams between two strings using
+ * the DiceCoefficient technique.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param nGrams number of chars in each gram
+ * @return
+ */
+ public double getDiceCoefficient(String s1, String s2, int nGrams) {
+ if (s1.equals("") || s1.equals("")) {
+ return 0d;
+ }
+ List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
+ List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
+
+ Set<String> overlap = new HashSet<String>(s1Grams);
+ overlap.retainAll(s2Grams);
+ double totcombigrams = overlap.size();
+
+ return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
+ }
+
+ private int minimum(int a, int b, int c) {
+ return Math.min(Math.min(a, b), c);
+ }
+
+ public int getLevenshteinDistance(CharSequence str1,
+ CharSequence str2) {
+ int[][] distance = new int[str1.length() + 1][str2.length() + 1];
+
+ for (int i = 0; i <= str1.length(); i++) {
+ distance[i][0] = i;
+ }
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[0][j] = j;
+ }
+
+ for (int i = 1; i <= str1.length(); i++) {
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[i][j] = minimum(
+ distance[i - 1][j] + 1,
+ distance[i][j - 1] + 1,
+ distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
+ }
+ }
+
+ return distance[str1.length()][str2.length()];
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
new file mode 100644
index 0000000..f375dcf
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.tools.entitylinker.domain.BaseLink;
+
+/**
+ *
+ * Stores a record from a geographic placenames gazateer
+ */
+public class GazateerEntry extends BaseLink {
+
+ private Double latitude;
+ private Double longitude;
+ private String source;
+ private String indexID;
+ private Map<String, String> indexData=new HashMap<>();
+
+ public String getIndexID() {
+ return indexID;
+ }
+
+ public void setIndexID(String indexID) {
+ this.indexID = indexID;
+ }
+
+ public Double getLatitude() {
+ return latitude;
+ }
+
+ public void setLatitude(Double latitude) {
+ this.latitude = latitude;
+ }
+
+ public Double getLongitude() {
+ return longitude;
+ }
+
+ public void setLongitude(Double longitude) {
+ this.longitude = longitude;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public Map<String, String> getIndexData() {
+ return indexData;
+ }
+
+ public void setIndexData(Map<String, String> indexData) {
+ this.indexData = indexData;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
new file mode 100644
index 0000000..5e72a9f
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.th.ThaiAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ *
+ * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
+ */
+public class GazateerIndexer {
+
+ public GazateerIndexer() {
+ loadAnalyzerMap();
+ }
+ Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
+
+ public static interface Separable {
+
+ String getSeparator();
+ }
+
+ public enum GazType implements Separable {
+
+ GEONAMES {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_geonames_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\t";
+ }
+ },
+ USGS {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_usgsgaz_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\\|";
+ }
+ }
+ }
+
+ public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+ if (!outputIndexDir.isDirectory()) {
+ throw new IllegalArgumentException("outputIndexDir must be a directory.");
+ }
+
+ String indexloc = outputIndexDir + type.toString();
+ Directory index = new MMapDirectory(new File(indexloc));
+
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
+
+ IndexWriter w = new IndexWriter(index, config);
+
+ readFile(gazateerInputData, w, type);
+ w.commit();
+ w.close();
+
+ }
+
+ public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<String>();
+ int counter = 0;
+ int langCodeIndex = 0;
+ System.out.println("reading gazateer data from file...........");
+ while (reader.read() != -1) {
+ String line = reader.readLine();
+ String[] values = line.split(type.getSeparator());
+ if (counter == 0) {
+ // build fields
+ for (int i = 0; i < values.length; i++) {
+ String columnName = values[i];
+ fields.add(columnName.replace("»¿", "").trim());
+ if (columnName.toLowerCase().equals("lc")) {
+ langCodeIndex = i;
+ }
+ }
+
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
+ }
+ if (type == GazType.GEONAMES) {
+ /**
+ * see if the map contains a language specific analyzer
+ */
+ if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
+ /*
+ * if so retrieve it from the map
+ */
+ Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
+ /**
+ * index the doc using the specified analyzer
+ */
+ w.addDocument(doc, analyzer);
+ } else {
+ w.addDocument(doc);
+ }
+ } else {
+ w.addDocument(doc);
+ }
+ }
+ counter++;
+ if (counter % 10000 == 0) {
+ w.commit();
+ System.out.println(counter + " .........committed to index..............");
+ }
+
+ }
+ w.commit();
+ System.out.println("Completed indexing gaz! index name is: " + type.toString());
+ }
+/**
+ * TODO: make these analyzers configurable
+ */
+ private void loadAnalyzerMap() {
+ languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
+
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
new file mode 100644
index 0000000..437227e
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ *
+ * Caches gazateer query results statically
+ */
+public class GazateerSearchCache {
+
+ private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
+
+
+ public static synchronized ArrayList<GazateerEntry> get(String searchString) {
+ return gazCache.get(searchString);
+ }
+
+ public static synchronized void put(String searchString, ArrayList<GazateerEntry> hits) {
+ if (gazCache.size() > 10000) {
+ gazCache.clear();
+ }
+ if (!gazCache.containsKey(searchString)) {
+ gazCache.put(searchString, hits);
+ }
+ }
+
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
new file mode 100644
index 0000000..ed220e8
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queryparser.classic.ParseException;
+
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ *
+ * Searches Gazateers stored in a MMapDirectory lucene index
+ */
+public class GazateerSearcher {
+
+ private double scoreCutoff = .75;
+ private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer geonamesAnalyzer;
+ //usgs US gazateer
+ private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer usgsAnalyzer;
+
+ public GazateerSearcher() {
+ }
+
+ /**
+ *
+ * @param searchString the named entity to look up in the lucene index
+ * @param rowsReturned how many rows to allow lucene to return
+ * @param code the country code
+ * @param properties properties file that states where the lucene indexes
+ * are
+ * @return
+ */
+ public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+ /**
+ * build the search string
+ */
+ String luceneQueryString = !code.equals("")
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
+ : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
+ /**
+ * check the cache and go no further if the records already exist
+ */
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ if (get != null) {
+ return get;
+ }
+ if (geonamesIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
+ scoreCutoff = Double.valueOf(cutoff);
+ geonamesIndex = new MMapDirectory(new File(indexloc));
+ geonamesReader = DirectoryReader.open(geonamesIndex);
+ geonamesSearcher = new IndexSearcher(geonamesReader);
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+
+ }
+
+
+
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = geonamesSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+
+ entry.getScoreMap().put("lucene", sc);
+
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("geonames");
+
+ Document d = geonamesSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 1:
+ entry.setItemID(value);
+ break;
+ case 3:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 4:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setItemType(value);
+ break;
+ case 12:
+ entry.setItemParentID(value);
+ break;
+ case 23:
+ entry.setItemName(value);
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ //only keep it if the country code is a match
+ if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
+ linkedData.add(entry);
+ }
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(searchString, linkedData);
+ return linkedData;
+ }
+
+ /**
+ * Looks up the name in the USGS gazateer, after checking the cache
+ *
+ * @param searchString the named entity to look up in the lucene index
+ * @param rowsReturned how many rows to allow lucene to return
+ *
+ * @param properties properties file that states where the lucene indexes
+ * @return
+ */
+ public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+
+ String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
+ /**
+ * hit the cache
+ */
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ if (get != null) {
+ //if the name is already there, return the list of cached results
+ return get;
+ }
+ if (usgsIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ scoreCutoff = Double.valueOf(cutoff);
+ usgsIndex = new MMapDirectory(new File(indexloc));
+ usgsReader = DirectoryReader.open(usgsIndex);
+ usgsSearcher = new IndexSearcher(usgsReader);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ }
+
+
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = usgsSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+ //keep track of the min score for normalization
+
+ entry.getScoreMap().put("lucene", sc);
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("usgs");
+ entry.setItemParentID("us");
+
+
+ Document d = usgsSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 0:
+ entry.setItemID(value);
+ break;
+ case 1:
+ entry.setItemName(value);
+ break;
+ case 2:
+ entry.setItemType(value);
+ break;
+ case 9:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ linkedData.add(entry);
+
+
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(searchString, linkedData);
+ return linkedData;
+ }
+
+ private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
+ for (GazateerEntry gazateerEntry : linkedData) {
+
+ double luceneScore = gazateerEntry.getScoreMap().get("lucene");
+ luceneScore = normalize(luceneScore, minScore, maxScore);
+ luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
+ luceneScore = Double.isNaN(luceneScore) ? 0.001 : luceneScore;
+ gazateerEntry.getScoreMap().put("lucene", luceneScore);
+ }
+ }
+
+ private void prune(ArrayList<GazateerEntry> linkedData) {
+ for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
+ GazateerEntry ge = itr.next();
+ if (ge.getScoreMap().get("lucene") < scoreCutoff) {
+ itr.remove();
+ }
+ }
+ }
+
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
new file mode 100644
index 0000000..3dc8c81
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.EntityLinker;
+
+/**
+ * Links location entities to gazateers. Currently supports gazateers stored in a
+ * Lucene index (GeoNames/NGA and USGS)
+ *
+ *
+ */
+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+
+ private CountryContext countryContext;
+ private Map<String, Set<Integer>> countryMentions;
+ private EntityLinkerProperties linkerProperties;
+ private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+ private List<LinkedEntityScorer> scorers = new ArrayList<>();
+ /**
+ * Flag for deciding whether to search gaz only for toponyms within countries
+ * that are mentioned in the document
+ */
+ private Boolean filterCountryContext = true;
+
+ public GeoEntityLinker() {
+ countryContext = new CountryContext();
+ }
+
+ @Override
+ public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
+ ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
+
+ if (linkerProperties == null) {
+ throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
+ }
+ countryMentions = countryContext.regexfind(doctext, linkerProperties);
+
+ for (int s = 0; s < sentences.length; s++) {
+ Span[] names = namesBySentence[s];
+ String[] tokens = tokensBySentence[s];
+ String[] matches = Span.spansToStrings(names, tokens);
+
+ for (int i = 0; i < matches.length; i++) {
+
+//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
+ ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+ if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
+ // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
+ if (!countryMentions.keySet().isEmpty()) {
+ for (String code : countryMentions.keySet()) {
+ if (!code.equals("us")) {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+ }
+ }
+ } else {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+
+ }
+
+ }
+ ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+ if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
+ //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
+ usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
+ }
+ LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+ if (!usgsEntries.isEmpty()) {
+ geoSpan.getLinkedEntries().addAll(usgsEntries);
+ geoSpan.setSearchTerm(matches[i]);
+ }
+
+ if (!geoSpan.getLinkedEntries().isEmpty()) {
+ geoSpan.setSearchTerm(matches[i]);
+ geoSpan.setSentenceid(s);
+ spans.add(geoSpan);
+ }
+ }
+ }
+
+ if (scorers.isEmpty()) {
+ scorers.add(new FuzzyStringMatchScorer());
+ scorers.add(new GeoHashBinningScorer());
+ scorers.add(new CountryProximityScorer());
+ scorers.add(new ModelBasedScorer());
+ }
+ for (LinkedEntityScorer scorer : scorers) {
+ scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+ }
+ return spans;
+ }
+
+ @Override
+ public void setEntityLinkerProperties(EntityLinkerProperties properties) {
+ this.linkerProperties = properties;
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
new file mode 100644
index 0000000..00c3f9e
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;
+
+
+/**
+ *
+ * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
+ */
+public class GeoEntityLinkerSetupUtils {
+ public static ModelBasedScorer scorer;
+
+ static {
+ scorer = new ModelBasedScorer();
+ }
+ public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
+ GazateerIndexer indexer = new GazateerIndexer();
+ try {
+ indexer.index(outputIndexDir, gazateerInputData, type);
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+ /**
+ *
+ * @param documents A list of document texts, for best results try to
+ * ensure each country you care about will be
+ * represented in the collection
+ * @param annotationOutFile the location where the annotated doccat text file
+ * will be stored
+ * @param modelOutFile the location where the doccat model will be stored
+ * @param properties the properties where the country context object
+ * will find its country data from this property:
+ * opennlp.geoentitylinker.countrycontext.filepath
+ * @throws IOException
+ */
+ public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+ CountryContext context = new CountryContext();
+ FileWriter writer = new FileWriter(annotationOutFile, true);
+ System.out.println("processing " + documents.size() + " documents");
+ for (String docText : documents) {
+ System.out.append(".");
+ Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+ Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+ for (String key : modelCountryContext.keySet()) {
+ for (String wordbag : modelCountryContext.get(key)) {
+ writer.write(key + " " + wordbag + "\n");
+ }
+ }
+ }
+ System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
+ writer.close();
+ System.out.println("Building Doccat model...");
+ DoccatModel model = null;
+
+ InputStream dataIn = new FileInputStream(annotationOutFile);
+ try {
+
+ ObjectStream<String> lineStream =
+ new PlainTextByLineStream(dataIn, "UTF-8");
+ ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+
+ model = DocumentCategorizerME.train("en", sampleStream);
+ OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
+ model.serialize(modelOut);
+ System.out.println("Model complete!");
+ } catch (IOException e) {
+ // Failed to read or parse training data, training failed
+ e.printStackTrace();
+ }
+
+ }
+
+ /**
+ * generates proximal wordbags within the radius of a country mention within
+ * the doctext based on the country context object
+ *
+ *
+ * @param docText
+ * @param additionalContext
+ * @param radius
+ * @return
+ */
+ private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ Map<String, ArrayList< String>> featureBags = new HashMap<>();
+ Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (String code : countryMentions.keySet()) {
+ /**
+ * for each mention, collect features from around each mention, then
+ * consolidate the features into another map
+ */
+ for (int mentionIdx : countryMentions.get(code)) {
+ String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+ // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
+ if (featureBags.containsKey(code)) {
+ featureBags.get(code).add(chunk);
+ } else {
+ ArrayList<String> newlist = new ArrayList<>();
+ newlist.add(chunk);
+ featureBags.put(code, newlist);
+ }
+ }
+ }
+ return featureBags;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
new file mode 100644
index 0000000..4d7467f
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on geographic point binning (clustering). This class's output is highly dependent on the quality
+ * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
+ * Lucene search is set to an appropriate level so this class is not fed poor data.
+ */
+public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
+ score( linkedSpans);
+ }
+
+ private void score(List<LinkedSpan> geospans) {
+ Map<Double, Double> latLongs = new HashMap<Double, Double>();
+
+ /**
+ * collect all the lat longs
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ latLongs.put(entry.getLatitude(), entry.getLongitude());
+
+ }
+ }
+ }
+
+ /**
+ * convert to geohash and add to sortedset
+ */
+ TreeSet<Long> geoHashes = new TreeSet<Long>();
+ for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {
+ geoHashes.add(geoHash(entry.getKey(), entry.getValue()));
+ }
+ /**
+ * bin the points and generate a scoremap
+ */
+ Map<Long, Set<Long>> bins = bin(geoHashes);
+ Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);
+ /**
+ * iterate over the data again and assign the score based on the bins
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ Long geohash = -1L;
+ Double score = 0d;
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ geohash = geoHash(entry.getLatitude(), entry.getLongitude());
+
+ }
+ if (scores.containsKey(geohash)) {
+ score = scores.get(geohash);
+
+ } else {
+ for (Long bin : bins.keySet()) {
+ if (bin.equals(geohash) || bins.get(bin).contains(geohash)) {
+ score = scores.get(bin);
+ break;
+ }
+ }
+ }
+ bl.getScoreMap().put("geohashbin", score);
+ }
+ }
+
+
+ }
+
+ private Long normalize(Double coordpart, Boolean isLat) {
+ Integer add = isLat ? 90 : 180;
+ coordpart = Math.abs(coordpart + add);
+ coordpart = coordpart * 1000000;
+
+ Long l = Math.round(coordpart);
+ String coord = String.valueOf(l);
+ if (coord.length() < 8) {
+ while (coord.length() < 8) {
+ coord += "0";
+ }
+ }
+ coord = coord.substring(0, 8);
+ l = Long.valueOf(coord);
+ return l;
+ }
+
+ /**
+ * interleaves a lat and a long to place the coordinate in linear sortable
+ * space for binning simplicity
+ *
+ * @param lat
+ * @param lon
+ * @return
+ */
+ private Long geoHash(double lat, double lon) {
+ Long normLat = normalize(lat, Boolean.TRUE);
+ Long normLon = normalize(lon, Boolean.FALSE);
+ String sLat = String.valueOf(normLat);
+ String sLon = String.valueOf(normLon);
+ char[] latInts = sLat.toCharArray();
+ char[] lonInts = sLon.toCharArray();
+ String geoHash = "";
+ int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;
+ for (int i = 0; i < len - 1; i++) {
+ String a = String.valueOf(latInts[i]);
+ String b = String.valueOf(lonInts[i]);
+ geoHash += a + b;
+ }
+
+ return Long.valueOf(geoHash);
+ }
+
+ private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {
+ ArrayList<Long> list = new ArrayList<Long>(sets);
+ ArrayList<Long> diffs = new ArrayList<Long>();
+ /**
+ * create a set of differences between the points
+ */
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ diffs.add(Math.abs(n - v));
+ }
+ /**
+ * generate an average "distance" between the normed points
+ */
+ Long sum = 0L;
+ for (Long l : diffs) {
+ sum += l;
+ }
+ Long avg=sum;
+ if(!diffs.isEmpty()){
+ avg = sum / diffs.size();
+ }
+
+
+ /**
+ * generate break values where the disparity is greater than the average
+ */
+ TreeSet<Long> breaks = new TreeSet<Long>();
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ //Long percent = 100 - (v / n * 100);
+ Long diff = n - v;
+ if (diff > avg) {
+ breaks.add(v);
+ }
+ }
+ /**
+ * based on the break values, place subsets of close points into bins
+ */
+ TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();
+ Long lastBreak = -1L;
+ for (Long br : breaks) {
+ if (lastBreak == -1L) {
+ binToAmount.put(br, sets.subSet(0L, true, br, true));
+ } else {
+ binToAmount.put(br, sets.subSet(lastBreak, false, br, true));
+ }
+ lastBreak = br;
+ }
+ lastBreak = sets.higher(lastBreak);
+ if (lastBreak != null) {
+ binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));
+ if (binToAmount.get(lastBreak).isEmpty()) {
+ binToAmount.get(lastBreak).add(lastBreak);
+ }
+ }
+ /**
+ * "binToAmount" is a map of the break value to all the points behind it
+ * (it's sorted), so the key is the max value of its set of values
+ */
+ return binToAmount;
+ }
+
+ /**
+ * returns a map of geohashes and their score
+ *
+ * @param binToAmount
+ * @return Map< Geohash, score>
+ */
+ private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {
+ TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();
+ TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();
+ /**
+ * if there is only one bin return 1 as the rank for each item in the value
+ */
+ if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {
+ for (Long bin : binToAmount.keySet()) {
+ for (Long hash : binToAmount.get(bin)) {
+ ranks.put(bin, 1d);
+ }
+ }
+ return ranks;
+ }
+ int total = 0;
+ /**
+ * generate a total number of points
+ */
+ for (Set<Long> geohashes : binToAmount.values()) {
+ total += geohashes.size();
+ }
+ /**
+ * divide total by bin size, largest bin size gets best score, everything in
+ * that bin gets that score because it is part of that primary cluster
+ * TODO... do an extra iteration of clustering within the predominant
+ * cluster to refine the scoring or make the basis of the binning more
+ * granular than > avg
+ */
+ TreeSet<Double> rankSet = new TreeSet<Double>();
+ for (Long key : binToAmount.keySet()) {
+ int size = binToAmount.get(key).size();
+ Double rank = (double) total / size;
+ rankSet.add(rank);
+ ranks.put(key, rank);
+ }
+ /**
+ * load the final score map with normalized values
+ */
+ for (Map.Entry<Long, Double> rank : ranks.entrySet()) {
+ double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);
+ double reverse = Math.abs(norm - 1);
+ double score = reverse > 1d ? 1.0 : reverse;
+ normRanks.put(rank.getKey(), score);
+ }
+
+ return normRanks;
+ }
+
+ /**
+ * transposes a number in a range to a double between 0 and 1
+ *
+ * @param valueToNormalize the value to be normalized (placed within a new
+ * range of 0-1)
+ * @param minimum the min of the current range
+ * @param maximum the max of the current range
+ * @return
+ */
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
+
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
new file mode 100644
index 0000000..1acca46
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Structure for scoring linked entities. The Map logically represents a pair :
+ * "Score type" to the "actual Score."
+ */
+public interface LinkedEntityScorer<T> {
+
+/**
+ * Scores a collection of linked entities. Implementations should populate the
+ * scoreMap in the list of BaseLink for each LinkedSpan; this method returns
+ * nothing and communicates its results solely through that map.
+ *
+ * @param linkedSpans the spans that have been linked to some external source
+ *          and have all the data they need to be scored
+ * @param docText the full text of the document
+ * @param sentenceSpans the sentence spans that correspond to the document text
+ * @param properties the EntityLinkerProperties used to configure the scorer
+ * @param additionalContext any additional data required to perform the scoring
+ *          operation
+ */
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
new file mode 100644
index 0000000..d370ec8
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ *
+ * Utilizes a doccat model to score toponyms based on surrounding context
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+
+  // lazily initialized on the first call to score(); the model path comes
+  // from the EntityLinkerProperties passed to score()
+  DocumentCategorizerME documentCategorizerME;
+  DoccatModel doccatModel;
+  /** Number of characters on each side of a mention used as doccat context. */
+  public static final int RADIUS = 100;
+
+  /**
+   * Scores each linked entry by categorizing the text surrounding its mention
+   * against a doccat model whose categories are expected to be country codes.
+   * The resulting probability is stored in each BaseLink's score map under
+   * the key "countrymodel". Returns silently (scoring nothing) if the
+   * "opennlp.geoentitylinker.modelbasedscorer.modelpath" property is unset.
+   *
+   * @param linkedSpans the spans that have been linked and are to be scored
+   * @param docText the full text of the document
+   * @param sentenceSpans the sentence spans that correspond to the document text
+   * @param properties supplies the doccat model path property
+   * @param additionalContext unused by this scorer
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+    try {
+      if (doccatModel == null) {
+        String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+        if (path.equals("")) {
+          // no model configured: scoring is optional, so do nothing
+          return;
+        }
+        doccatModel = new DoccatModel(new File(path));
+        documentCategorizerME = new DocumentCategorizerME(doccatModel);
+      }
+      Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+      for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+        Map<String, Double> scores = this.getScore(entry.getValue());
+        for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+          // default to 0 when the model has no category for this parent id
+          double score = 0d;
+          if (scores.containsKey(link.getItemParentID())) {
+            score = scores.get(link.getItemParentID());
+          }
+          link.getScoreMap().put("countrymodel", score);
+        }
+      }
+
+    } catch (FileNotFoundException ex) {
+      System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
+    } catch (IOException ex) {
+      System.err.println(ex);
+    } catch (Exception ex) {
+      System.err.println(ex);
+    }
+  }
+
+  /**
+   * generates features using a BagOfWordsfeatureGenerator that are within the
+   * radius of a mention within the doctext
+   *
+   * @param linkedSpans the spans that have been linked to an external source
+   * @param sentenceSpans the sentence spans that correspond to the document text
+   * @param docText the full text of the document
+   * @param radius the number of characters on each side of the mention to collect
+   * @return a map of the index of the linked span to the string of surrounding
+   * text: Map&lt;indexofspan,surrounding text&gt;
+   */
+  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+    Map<Integer, String> featureBags = new HashMap<>();
+    Map<Integer, Integer> nameMentionMap = new HashMap<>();
+    /**
+     * collect a representative character offset for each span that actually
+     * linked to something
+     */
+    for (int i = 0; i < linkedSpans.size(); i++) {
+      LinkedSpan span = linkedSpans.get(i);
+      if (span.getLinkedEntries().isEmpty()) {
+        //don't care about spans that did not get linked to anything at all; nothing to work with
+        continue;
+      }
+      /**
+       * get the sentence the name span was found in, the beginning of the
+       * sentence will suffice as a centroid for feature generation around the
+       * named entity
+       */
+      Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+      nameMentionMap.put(i, mentionIdx);
+    }
+    /**
+     * now associate each span to a string that will be used for categorization
+     * against the model.
+     */
+    for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+      featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+    }
+
+    return featureBags;
+  }
+
+  /**
+   * Extracts up to radius characters on each side of mentionIdx from docText,
+   * clipped to the document bounds and trimmed to whole words.
+   *
+   * @param mentionIdx the character offset the chunk is centered on
+   * @param docText the full text of the document
+   * @param radius the number of characters to take on each side of mentionIdx
+   * @return the surrounding text, or an empty string for a degenerate window
+   */
+  public String getTextChunk(int mentionIdx, String docText, int radius) {
+    int docSize = docText.length();
+    int left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+    int right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+    if (right <= left) {
+      return "";
+    }
+    /**
+     * don't want to chop any words in half, so take from the first space to
+     * the last space in the chunk string
+     */
+    String chunk = docText.substring(left, right);
+    int start = 0;
+    if (left != 0) {
+      start = chunk.indexOf(" ");
+      if (start < 0) {
+        // no space in the chunk: substring(-1, ...) would throw; keep whole chunk
+        start = 0;
+      }
+    }
+    int end = chunk.lastIndexOf(" ");
+    /**
+     * now get the substring again with only whole words
+     */
+    if (start < end) {
+      chunk = chunk.substring(start, end);
+    }
+
+    return chunk;
+  }
+
+  /**
+   * Categorizes the text and returns the probability of each category.
+   *
+   * @param text the context text to categorize
+   * @return a map of category name (country code) to its probability
+   * @throws Exception propagated from the underlying categorizer
+   */
+  private Map<String, Double> getScore(String text) throws Exception {
+    Map<String, Double> scoreMap = new HashMap<>();
+    double[] categorize = documentCategorizerME.categorize(text);
+    int catSize = documentCategorizerME.getNumberOfCategories();
+    for (int i = 0; i < catSize; i++) {
+      String category = documentCategorizerME.getCategory(i);
+      scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+    }
+    return scoreMap;
+  }
+
+}