OPENNLP-614
Moved all GeoEntityLinker implementation classes to the sandbox. The new module is named addons and is intended as a place to consolidate
useful add-ons to the base OpenNLP modules.
diff --git a/apache-opennlp-addons/pom.xml b/apache-opennlp-addons/pom.xml
new file mode 100644
index 0000000..062afa4
--- /dev/null
+++ b/apache-opennlp-addons/pom.xml
@@ -0,0 +1,61 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

+  <modelVersion>4.0.0</modelVersion>

+

+  <groupId>apache-opennlp-addons</groupId>

+  <artifactId>apache-opennlp-addons</artifactId>

+  <version>1.0-SNAPSHOT</version>

+  <packaging>jar</packaging>

+  <name>Apache OpenNLP Addons</name>

+

+  <url>http://maven.apache.org</url>

+    <build>

+        <plugins>

+            <plugin>

+                <groupId>org.apache.maven.plugins</groupId>

+                <artifactId>maven-compiler-plugin</artifactId>

+                <version>2.3.2</version>

+                <configuration>

+                    <source>1.7</source>

+                    <target>1.7</target>

+                </configuration>

+            </plugin>

+        </plugins>

+    </build>

+    <properties>

+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

+  </properties>

+

+  <dependencies>

+    <dependency>

+      <groupId>junit</groupId>

+      <artifactId>junit</artifactId>

+      <version>3.8.1</version>

+      <scope>test</scope>

+    </dependency>

+    <dependency>

+      <groupId>org.apache.lucene</groupId>

+      <artifactId>lucene-core</artifactId>

+      <version>4.5.0</version>

+      <optional>true</optional>

+    </dependency>

+    <dependency>

+      <groupId>org.apache.lucene</groupId>

+      <artifactId>lucene-analyzers-common</artifactId>

+      <version>4.5.0</version>

+      <optional>true</optional>

+    </dependency>

+    <dependency>

+      <groupId>org.apache.lucene</groupId>

+      <artifactId>lucene-queryparser</artifactId>

+      <version>4.5.0</version>

+      <optional>true</optional>

+    </dependency>

+    <dependency>

+      <groupId>org.apache.opennlp</groupId>

+      <artifactId>opennlp-tools</artifactId>

+      <version>1.6.0-SNAPSHOT</version>

+      <optional>true</optional>

+    </dependency>

+  </dependencies>

+</project>

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
new file mode 100644
index 0000000..1702f85
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
@@ -0,0 +1,245 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.io.BufferedReader;

+import java.io.FileReader;

+import java.io.IOException;

+import java.sql.CallableStatement;

+import java.sql.Connection;

+import java.sql.DriverManager;

+import java.sql.ResultSet;

+import java.sql.SQLException;

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import java.util.regex.Matcher;

+import java.util.regex.Pattern;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+

+/**

+ * Finds instances of country mentions in a String, typically a document text.

+ * Used to boost or degrade scoring of linked geo entities

+ *

+ */

+public class CountryContext {

+

+  private Connection con;

+  private List<CountryContextEntry> countrydata;

+  private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();

+  private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();

+  private Set<CountryContextEntry> countryHits = new HashSet<>();

+

+  public Map<String, Set<String>> getNameCodesMap() {

+    return nameCodesMap;

+  }

+

+  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {

+    this.nameCodesMap = nameCodesMap;

+  }

+

+  public CountryContext() {

+  }

+

+

+  /**

+   * Finds mentions of countries in the document text. Country mentions are an
+   * essential element of the scoring that is done for geo linked spans.
+   * Lazily loads the country list from the tab-delimited country context file
+   * configured in the EntityLinkerProperties (the MySQL stored procedure path
+   * is currently commented out).
+   *
+   * @param docText    the full text of the document
+   * @param properties EntityLinkerProperties used to locate the country
+   *                   context file
+   * @return a map of lowercased country code to the set of character offsets
+   *         where that country is mentioned in the document

+   */

+  public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {

+    countryMentions = new HashMap<String, Set<Integer>>();

+    nameCodesMap.clear();

+    try {

+//      if (con == null) {

+//        con = getMySqlConnection(properties);

+//      }

+      if (countrydata == null) {

+         countrydata = getCountryContextFromFile(properties);

+     //   countrydata = getCountryData(properties);

+      }

+      for (CountryContextEntry entry : countrydata) {

+        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

+        Matcher rs = regex.matcher(docText);

+        String code = entry.getCc1().toLowerCase();

+

+        boolean found = false;

+        while (rs.find()) {

+          found = true;

+          Integer start = rs.start();

+          String hit = rs.group().toLowerCase();

+          if (countryMentions.containsKey(code)) {

+            countryMentions.get(code).add(start);

+          } else {

+            Set<Integer> newset = new HashSet<Integer>();

+            newset.add(start);

+            countryMentions.put(code, newset);

+          }

+          if (!hit.equals("")) {

+            if (this.nameCodesMap.containsKey(hit)) {

+              nameCodesMap.get(hit).add(code);

+            } else {

+              HashSet<String> newset = new HashSet<String>();

+              newset.add(code);

+              nameCodesMap.put(hit, newset);

+            }

+          }

+        }

+        if (found) {

+          countryHits.add(entry);

+        }

+

+      }

+

+    } catch (Exception ex) {

+      Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);

+    }

+

+

+    return countryMentions;

+  }

+

+  /**

+   * Returns the unique set of country codes from a list of country context hits.
+   *
+   * @param hits the CountryContextHits discovered
+   * @return the set of lowercased country codes

+   */

+  public static Set<String> getCountryCodes(List<CountryContextHit> hits) {

+    Set<String> ccs = new HashSet<String>();

+    for (CountryContextHit hit : hits) {

+      ccs.add(hit.getCountryCode().toLowerCase());

+    }

+    return ccs;

+  }

+

+  public static String getCountryCodeCSV(Set<String> hits) {

+    String csv = "";

+    if (hits.isEmpty()) {

+      return csv;

+    }

+

+    for (String code : hits) {

+      csv += "," + code;

+    }

+    return csv.substring(1);

+  }

+

+  private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {

+

+    String driver = properties.getProperty("db.driver", "org.gjt.mm.mysql.Driver");

+    String url = properties.getProperty("db.url", "jdbc:mysql://localhost:3306/world");

+    String username = properties.getProperty("db.username", "root");

+    String password = properties.getProperty("db.password", "?");

+

+    Class.forName(driver);

+    Connection conn = DriverManager.getConnection(url, username, password);

+    return conn;

+  }

+

+  /**

+   * reads the list from the database by calling a stored procedure

+   * getCountryList

+   *

+   * @param properties

+   * @return

+   * @throws SQLException

+   */

+  private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {

+    List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();

+    try {

+      if (con == null) {

+        con = getMySqlConnection(properties);

+      }

+      CallableStatement cs;

+      cs = con.prepareCall("CALL `getCountryList`()");

+      ResultSet rs;

+      rs = cs.executeQuery();

+      if (rs == null) {

+        return entries;

+      }

+      while (rs.next()) {

+        CountryContextEntry s = new CountryContextEntry();

+        //rc,cc1, full_name_nd_ro,dsg

+        s.setRc(rs.getString(1));

+        s.setCc1(rs.getString(2));

+//a.district, 

+        s.setFull_name_nd_ro(rs.getString(3));

+//b.name as countryname, 

+        s.setDsg(rs.getString(4));

+        entries.add(s);

+      }

+

+    } catch (SQLException ex) {

+      System.err.println(ex);

+    } catch (Exception e) {

+      System.err.println(e);

+    } finally {

+      if (con != null) {
+        con.close();
+      }

+    }

+    return entries;

+  }

+

+  public Map<String, Set<Integer>> getCountryMentions() {

+    return countryMentions;

+  }

+

+  public Set<CountryContextEntry> getCountryHits() {

+    return countryHits;

+  }

+

+  private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {

+    List<CountryContextEntry> entries = new ArrayList<>();

+    String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");

+    BufferedReader reader;

+

+    try {

+      path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

+

+      reader = new BufferedReader(new FileReader(path));

+

+      String line;
+      while ((line = reader.readLine()) != null) {

+        String[] values = line.split("\t");

+        if (values.length != 4) {

+          throw new IOException("improperly formatted country context file");

+        }

+        CountryContextEntry entry = new CountryContextEntry();

+        // rc,cc1, full_name_nd_ro,dsg

+        entry.setRc(values[0].toLowerCase());

+        entry.setCc1(values[1].toLowerCase());

+        entry.setFull_name_nd_ro(values[2].toLowerCase());

+        entry.setDsg(values[3].toLowerCase());

+        entries.add(entry);

+      }

+      reader.close();

+    } catch (IOException e) {

+      System.err.println(e);

+    }

+    return entries;

+

+  }

+}
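
For context, a minimal sketch of how CountryContext could be exercised on its own. It assumes EntityLinkerProperties can be built from a properties file (as in opennlp-tools 1.6.0-SNAPSHOT) whose opennlp.geoentitylinker.countrycontext.filepath entry points at a tab-delimited file with the four columns rc, cc1, full_name_nd_ro, dsg; the file name used here is hypothetical.

package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

import java.io.File;
import java.util.Map;
import java.util.Set;
import opennlp.tools.entitylinker.EntityLinkerProperties;

public class CountryContextExample {

  public static void main(String[] args) throws Exception {
    // hypothetical properties file; it must define
    // opennlp.geoentitylinker.countrycontext.filepath
    EntityLinkerProperties props = new EntityLinkerProperties(new File("linker.properties"));

    CountryContext context = new CountryContext();
    Map<String, Set<Integer>> mentions =
        context.regexfind("The shipment left the United States for Canada.", props);

    // regexfind returns lowercased country codes mapped to the character
    // offsets of each mention in the document text
    for (Map.Entry<String, Set<Integer>> e : mentions.entrySet()) {
      System.out.println(e.getKey() + " -> " + e.getValue());
    }
  }
}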

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
new file mode 100644
index 0000000..827ec77
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
@@ -0,0 +1,110 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.Objects;

+

+/**

+ * Stores a tuple (rc, cc1, full_name_nd_ro, dsg) from the country context source that is used to find country mentions in document text.

+ *

+ */

+public class CountryContextEntry {

+  /*

+   * rc,cc1, full_name_nd_ro,dsg

+   */

+

+  private String rc;

+  private String cc1;

+  private String full_name_nd_ro;

+  private String dsg;

+

+  public CountryContextEntry() {

+  }

+

+  public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {

+    this.rc = rc;

+    this.cc1 = cc1;

+    this.full_name_nd_ro = full_name_nd_ro;

+    this.dsg = dsg;

+  }

+

+  public String getRc() {

+    return rc;

+  }

+

+  public void setRc(String rc) {

+    this.rc = rc;

+  }

+

+  public String getCc1() {

+    return cc1;

+  }

+

+  public void setCc1(String cc1) {

+    this.cc1 = cc1;

+  }

+

+  public String getFull_name_nd_ro() {

+    return full_name_nd_ro;

+  }

+

+  public void setFull_name_nd_ro(String full_name_nd_ro) {

+    this.full_name_nd_ro = full_name_nd_ro;

+  }

+

+  public String getDsg() {

+    return dsg;

+  }

+

+  public void setDsg(String dsg) {

+    this.dsg = dsg;

+  }

+

+  @Override

+  public int hashCode() {

+    int hash = 7;

+    hash = 17 * hash + Objects.hashCode(this.rc);

+    hash = 17 * hash + Objects.hashCode(this.cc1);

+    hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro);

+    hash = 17 * hash + Objects.hashCode(this.dsg);

+    return hash;

+  }

+

+  @Override

+  public boolean equals(Object obj) {

+    if (obj == null) {

+      return false;

+    }

+    if (getClass() != obj.getClass()) {

+      return false;

+    }

+    final CountryContextEntry other = (CountryContextEntry) obj;

+    if (!Objects.equals(this.rc, other.rc)) {

+      return false;

+    }

+    if (!Objects.equals(this.cc1, other.cc1)) {

+      return false;

+    }

+    if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) {

+      return false;

+    }

+    if (!Objects.equals(this.dsg, other.dsg)) {

+      return false;

+    }

+    return true;

+  }

+  

+}

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
new file mode 100644
index 0000000..694cec6
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
@@ -0,0 +1,60 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+/**

+ *Stores a "hit" on a country and the start and end of the hit

+

+ */

+public class CountryContextHit {

+

+  private String countryCode;

+  private int start;

+  private int end;

+

+  public CountryContextHit() {

+  }

+

+  public CountryContextHit(String countryCode, int start, int end) {

+    this.countryCode = countryCode;

+    this.start = start;

+    this.end = end;

+  }

+

+  public String getCountryCode() {

+    return countryCode;

+  }

+

+  public void setCountryCode(String countryCode) {

+    this.countryCode = countryCode;

+  }

+

+  public int getStart() {

+    return start;

+  }

+

+  public void setStart(int start) {

+    this.start = start;

+  }

+

+  public int getEnd() {

+    return end;

+  }

+

+  public void setEnd(int end) {

+    this.end = end;

+  }

+}

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
new file mode 100644
index 0000000..4b24b11
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
@@ -0,0 +1,262 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.TreeSet;

+import opennlp.tools.entitylinker.domain.BaseLink;

+import opennlp.tools.entitylinker.domain.LinkedSpan;

+import opennlp.tools.util.Span;

+

+/**

+ * Scores toponyms based on their proximity to country context mentions in the document.

+ */

+public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {

+

+  private Map<String, Set<String>> nameCodesMap;

+  String dominantCode = "";

+

+  @Override

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {

+

+    score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

+

+  }

+

+  /**

+   * Assigns a score to each BaseLink in each linkedSpan's set of N best

+   * matches. Currently the scoring indicates the probability that the toponym

+   * is correct based on the country context (proximity of country mentions)
+   * in the document.

+   *

+   * @param linkedData     the linked spans, holds the Namefinder results, and

+   *                       the list of BaseLink for each

+   * @param countryHits    all the country mentions in the document

+   * @param nameCodesMap   maps a country indicator name to a country code. Used

+   *                       to determine if the namefinder found the same exact

+   *                       toponym the country context did. If so the score is

+   *                       boosted due to the high probability that the

+   *                       NameFinder actually "rediscovered" a country

+   * @param docText        the full text of the document...not used in this

+   *                       default implementation

+   * @param sentences      the sentences that correspond to the doc text.

+   * @param maxAllowedDist a constant that is used to determine which country

+   *                       mentions, based on proximity within the text, should

+   *                       be used to score the Named Entity.

+   * @return

+   */

+  public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {

+    this.nameCodesMap = nameCodesMap;

+    setDominantCode(countryHits);

+    for (LinkedSpan<BaseLink> linkedspan : linkedData) {

+

+      linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);

+    }

+    return linkedData;

+  }

+

+  /**

+   * sets class level variable to a code based on the number of mentions

+   *

+   * @param countryHits

+   */

+  private void setDominantCode(Map<String, Set<Integer>> countryHits) {

+    int hits = -1;

+    for (String code : countryHits.keySet()) {

+      if (countryHits.get(code).size() > hits) {

+        hits = countryHits.get(code).size();

+        dominantCode = code;

+      }

+    }

+  }

+

+  /**

+   * Generates distances from each country mention to the span's location in the

+   * doc text. Ultimately an attempt to ensure that ambiguously named toponyms

+   * are resolved to the correct country and coordinate.

+   *

+   * @param sentences

+   * @param countryHits

+   * @param span

+   * @return

+   */

+  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {

+    Double score = 0.0;

+    //get the index of the actual span, beginning of sentence

+    //should generate tokens from sentence and create a char offset...

+    //could have large sentences due to poor sentence detection or wonky doc text

+    int sentenceIdx = span.getSentenceid();

+    int sentIndexInDoc = sentences[sentenceIdx].getStart();

+    /**

+     * create a map of all the span's proximal country mentions in the document

+     * Map< countrycode, set of <distances from this NamedEntity>>

+     */

+    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();

+    //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>

+    for (String cCode : countryHits.keySet()) {

+//iterate over all the regex start values and calculate an offset

+      for (Integer cHit : countryHits.get(cCode)) {

+        Integer absDist = Math.abs(sentIndexInDoc - cHit);

+        //only include near mentions based on a heuristic

+        //TODO make this a property

+        //  if (absDist < maxAllowedDistance) {

+        if (distancesFromCodeMap.containsKey(cCode)) {

+          distancesFromCodeMap.get(cCode).add(absDist);

+        } else {

+          HashSet<Integer> newset = new HashSet<Integer>();

+          newset.add(absDist);

+          distancesFromCodeMap.put(cCode, newset);

+        }

+      }

+

+      //}

+    }

+    //we now know how far this named entity is from every country mention in the document

+

+    /**

+     * the gaz matches that have a country code that have mentions in the doc

+     * that are closest to the Named Entity should return the best score Analyze

+     * map generates a likelihood score that the toponym from the gaz is

+     * referring to one of the countries Map<countrycode, prob that this span is

+     * referring to the toponym form this code key>

+     */

+    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);

+    for (BaseLink link : span.getLinkedEntries()) {

+      //getItemParentId is the country code

+      String spanCountryCode = link.getItemParentID();

+      if (scoreMap.containsKey(spanCountryCode)) {

+

+        score = scoreMap.get(spanCountryCode);

+        ///does the name extracted match a country name?

+        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {

+          //if so, is it the correct country code for that name

+          if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {

+            //boost the score because it is likely that this is the location in the text, so add .75 to the score, capped at 1

+            //TODO: make this multiplier configurable

+            //TODO: improve this with a geographic/geometry based clustering (linear binning to be more precise) of points returned from the gaz

+            score = (score + .75) > 1.0 ? 1d : (score + .75);

+            //boost the score if the hit is from the dominant country context

+

+            if (link.getItemParentID().equals(dominantCode)) {

+              score = (score + .25) > 1.0 ? 1d : (score + .25);

+            }

+

+

+          }

+

+        }

+      }

+      link.getScoreMap().put("countrycontext", score);

+    }

+    return span;

+  }

+

+  /**

+   * takes a map of distances from the NE to each country mention and generates

+   * a map of scores for each country code. The map is then correlated to the
+   * code of the BaseLink parentid for retrieval, and the score is added to the
+   * overall score map.

+   *

+   * @param distanceMap

+   * @param sentences

+   * @param span

+   * @return

+   */

+  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {

+

+    Map<String, Double> scoreMap = new HashMap<String, Double>();

+    if(distanceMap.isEmpty()){

+      return scoreMap;

+    }

+    TreeSet<Integer> all = new TreeSet<Integer>();

+    for (String key : distanceMap.keySet()) {

+      all.addAll(distanceMap.get(key));

+    }

+    //get min max for normalization, this could be more efficient

+    Integer min = all.first();

+    Integer max = all.last();

+    for (String key : distanceMap.keySet()) {

+

+      TreeSet<Double> normalizedDistances = new TreeSet<Double>();

+      for (Integer i : distanceMap.get(key)) {

+        Double norm = normalize(i, min, max);

+        //reverse the normed distance so low numbers (closer) are better

+        //this could be improved with a "decaying " function using an imcreaseing negative exponent

+        Double reverse = Math.abs(norm - 1);

+        normalizedDistances.add(reverse);

+      }

+

+

+      List<Double> doubles = new ArrayList<Double>(normalizedDistances);

+      scoreMap.put(key, slidingDistanceAverage(doubles));

+    }

+    return scoreMap;

+  }

+

+  /**

+   * this method is an attempt to make closer clusters of mentions group

+   * together to smooth out the average, so one distant outlier does not kill

+   * the score for an obviously good hit. A more elegant solution is possible

+   * using Math.pow, and making the score decay with distance by using an

+   * increasing negative exponent

+   *

+   * @param normDis the normalized and sorted set of distances as a list

+   * @return

+   */

+  private Double slidingDistanceAverage(List<Double> normDis) {

+    List<Double> windowOfAverages = new ArrayList<Double>();

+

+    if (normDis.size() < 3) {

+      windowOfAverages.addAll(normDis);

+    } else {

+

+      for (int i = 0; i < normDis.size() - 1; i++) {

+        double a = normDis.get(i);

+        double b = normDis.get(i + 1);

+        windowOfAverages.add((a + b) / 2);

+

+      }

+    }

+    double sum = 0d;

+    for (double d : windowOfAverages) {

+      sum += d;

+    }

+    double result = sum / windowOfAverages.size();

+    //TODO: ++ prob when large amounts of mentions for a code

+    //System.out.println("avg of window:" + result);

+    return result;

+  }

+

+  /**

+   * transposes a value within one range to a relative value in a different

+   * range. Used to normalize distances in this class.

+   *

+   * @param valueToNormalize the value to place within the new range

+   * @param minimum          the min of the set to be transposed

+   * @param maximum          the max of the set to be transposed

+   * @return

+   */

+  private Double normalize(int valueToNormalize, int minimum, int maximum) {

+    Double d = (double) (valueToNormalize - minimum) / (maximum - minimum);
+    d = (d.isNaN() || d.isInfinite()) ? 0d : d;

+    return d;

+  }

+}
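
The proximity scoring above is easier to follow with a small worked example. The sketch below only mirrors the arithmetic of the private normalize and slidingDistanceAverage helpers on made-up character distances; it does not call the scorer, and the numbers are illustrative only.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ProximityMathSketch {

  public static void main(String[] args) {
    // made-up distances (in characters) from one named entity to the
    // mentions of a single country code
    int[] distances = {50, 120, 900};
    int min = 50;   // smallest distance over all codes in the document
    int max = 900;  // largest distance over all codes in the document

    // normalize into [0..1] and reverse so that closer mentions score higher
    List<Double> reversed = new ArrayList<Double>();
    for (int d : distances) {
      double norm = (double) (d - min) / (max - min);
      reversed.add(Math.abs(norm - 1));
    }
    Collections.sort(reversed);

    // average adjacent pairs so one distant outlier does not drag down
    // an otherwise tight cluster of mentions
    List<Double> window = new ArrayList<Double>();
    for (int i = 0; i < reversed.size() - 1; i++) {
      window.add((reversed.get(i) + reversed.get(i + 1)) / 2);
    }

    double sum = 0d;
    for (double d : window) {
      sum += d;
    }
    System.out.println("countrycontext score ~ " + sum / window.size());
  }
}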

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
new file mode 100644
index 0000000..c21f5e2
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
@@ -0,0 +1,95 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.HashSet;

+import java.util.List;

+import java.util.Set;

+import opennlp.tools.entitylinker.domain.BaseLink;

+import opennlp.tools.entitylinker.domain.LinkedSpan;

+import opennlp.tools.ngram.NGramGenerator;

+import opennlp.tools.util.Span;

+

+/**

+ *

+ * Generates scores for string comparisons.

+ */

+public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {

+

+  @Override

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {

+    for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {

+      for (BaseLink link : linkedSpan.getLinkedEntries()) {

+        Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);

+        link.getScoreMap().put("dice", dice);

+        Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));

+        link.getScoreMap().put("levenshtein", ld);

+      }

+    }

+

+  

+  }

+

+  /**

+   * Generates a score based on an overlap of nGrams between two strings using

+   * the DiceCoefficient technique.

+   *

+   * @param s1     first string

+   * @param s2     second string

+   * @param nGrams number of chars in each gram

+   * @return

+   */

+  public double getDiceCoefficient(String s1, String s2, int nGrams) {

+    if (s1.equals("") || s1.equals("")) {

+      return 0d;

+    }

+    List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");

+    List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");

+

+    Set<String> overlap = new HashSet<String>(s1Grams);

+    overlap.retainAll(s2Grams);

+    double totcombigrams = overlap.size();

+

+    return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());

+  }

+

+  private int minimum(int a, int b, int c) {

+    return Math.min(Math.min(a, b), c);

+  }

+

+  public int getLevenshteinDistance(CharSequence str1,

+          CharSequence str2) {

+    int[][] distance = new int[str1.length() + 1][str2.length() + 1];

+

+    for (int i = 0; i <= str1.length(); i++) {

+      distance[i][0] = i;

+    }

+    for (int j = 1; j <= str2.length(); j++) {

+      distance[0][j] = j;

+    }

+

+    for (int i = 1; i <= str1.length(); i++) {

+      for (int j = 1; j <= str2.length(); j++) {

+        distance[i][j] = minimum(

+                distance[i - 1][j] + 1,

+                distance[i][j - 1] + 1,

+                distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));

+      }

+    }

+

+    return distance[str1.length()][str2.length()];

+  }

+}
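
A minimal sketch exercising the two public string-similarity helpers in isolation; the strings are arbitrary, and in normal use the class is driven through score(...) by GeoEntityLinker.

package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

public class FuzzyMatchExample {

  public static void main(String[] args) {
    FuzzyStringMatchScorer scorer = new FuzzyStringMatchScorer();

    // bigram (nGrams = 2) overlap between a search term and a gazetteer name
    double dice = scorer.getDiceCoefficient("springfield", "springfeld", 2);

    // raw edit distance between the same two strings
    int edits = scorer.getLevenshteinDistance("springfield", "springfeld");

    System.out.println("dice=" + dice + " levenshtein=" + edits);
  }
}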

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
new file mode 100644
index 0000000..2371333
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
@@ -0,0 +1,74 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.HashMap;

+import java.util.Map;

+import opennlp.tools.entitylinker.domain.BaseLink;

+

+/**

+ *

+ * Stores a record from a geographic placenames gazetteer.

+ */

+public class GazateerEntry extends BaseLink {

+

+  private Double latitude;

+  private Double longitude;

+  private String source;

+  private String indexID;

+  private Map<String, String> indexData=new HashMap<>();

+

+  public String getIndexID() {

+    return indexID;

+  }

+

+  public void setIndexID(String indexID) {

+    this.indexID = indexID;

+  }

+

+  public Double getLatitude() {

+    return latitude;

+  }

+

+  public void setLatitude(Double latitude) {

+    this.latitude = latitude;

+  }

+

+  public Double getLongitude() {

+    return longitude;

+  }

+

+  public void setLongitude(Double longitude) {

+    this.longitude = longitude;

+  }

+

+  public String getSource() {

+    return source;

+  }

+

+  public void setSource(String source) {

+    this.source = source;

+  }

+

+  public Map<String, String> getIndexData() {

+    return indexData;

+  }

+

+  public void setIndexData(Map<String, String> indexData) {

+    this.indexData = indexData;

+  }

+  

+}

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
new file mode 100644
index 0000000..cbe8a0d
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.util.ArrayList;

+import java.util.List;

+

+import org.apache.lucene.analysis.Analyzer;

+import org.apache.lucene.analysis.standard.StandardAnalyzer;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.document.Field;

+import org.apache.lucene.document.TextField;

+import org.apache.lucene.index.IndexWriter;

+import org.apache.lucene.index.IndexWriterConfig;

+import org.apache.lucene.store.Directory;

+import org.apache.lucene.store.MMapDirectory;

+import org.apache.lucene.util.Version;

+

+/**

+ * Indexes a pipe-delimited gazetteer export (first line holds the column
+ * names) into a Lucene index for use by GazateerSearcher.

+ */

+public class GazateerIndexer {

+

+  public enum GazType {

+

+    GEONAMES {

+      @Override

+      public String toString() {

+        return "/opennlp_geoentitylinker_usgsgaz_idx";

+      }

+    },

+    USGS {

+      @Override

+      public String toString() {

+        return "/opennlp_geoentitylinker_usgsgaz_idx";

+      }

+    }

+  }

+

+  public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {

+    if (!outputIndexDir.isDirectory()) {

+      throw new IllegalArgumentException("outputIndexDir must be a directory.");

+    }

+

+    String indexloc = outputIndexDir + type.toString();

+    Directory index = new MMapDirectory(new File(indexloc));

+

+    Analyzer a = new StandardAnalyzer(Version.LUCENE_45);

+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);

+

+    IndexWriter w = new IndexWriter(index, config);

+

+    readFile(gazateerInputData, w);

+    w.commit();

+    w.close();

+

+  }

+

+  public void readFile(File gazateerInputData, IndexWriter w) throws Exception {

+    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

+    List<String> fields = new ArrayList<String>();

+    int counter = 0;

+    System.out.println("reading gazateer data from file...........");

+    String line;
+    while ((line = reader.readLine()) != null) {

+      String[] values = line.split("\\|");//nga format

+      if (counter == 0) {

+        // build fields

+        for (String columnName : values) {

+          fields.add(columnName.replace("»¿", ""));

+        }

+

+

+      } else {

+        Document doc = new Document();

+        for (int i = 0; i < fields.size() - 1; i++) {

+          doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));

+        }

+        w.addDocument(doc);

+      }

+      counter++;

+      if (counter % 10000 == 0) {

+        w.commit();

+        System.out.println(counter + " .........committed to index..............");

+      }

+

+    }

+    reader.close();

+  }

+}
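
A sketch of building an index with the class above; the input and output paths are hypothetical, and the input is expected to be a pipe-delimited gazetteer export whose first line carries the column names.

package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

import java.io.File;

public class GazateerIndexerExample {

  public static void main(String[] args) throws Exception {
    // hypothetical locations: a pipe-delimited USGS gazetteer export and an
    // existing directory that will hold the Lucene index
    File gazateerData = new File("/data/usgs_gaz.txt");
    File outputIndexDir = new File("/data/geo_indexes");

    new GazateerIndexer().index(outputIndexDir, gazateerData, GazateerIndexer.GazType.USGS);
  }
}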

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
new file mode 100644
index 0000000..e89e8a5
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
@@ -0,0 +1,235 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.io.File;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.Iterator;

+import java.util.List;

+import org.apache.lucene.analysis.Analyzer;

+import org.apache.lucene.analysis.standard.StandardAnalyzer;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.index.DirectoryReader;

+import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.index.IndexableField;

+import org.apache.lucene.queryparser.classic.ParseException;

+

+import org.apache.lucene.queryparser.classic.QueryParser;

+import org.apache.lucene.search.IndexSearcher;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.TopDocs;

+import org.apache.lucene.store.Directory;

+import org.apache.lucene.store.MMapDirectory;

+import org.apache.lucene.util.Version;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+/**

+ *

+ * Searches gazetteer data stored in MMapDirectory Lucene indexes.

+ */

+public class GazateerSearcher {

+

+  private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();

+  private double scoreCutoff = .75;

+  private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));

+  private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);

+  private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);

+  private Analyzer geonamesAnalyzer;

+  //usgs US gazateer

+  private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));

+  private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);

+  private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);

+  private Analyzer usgsAnalyzer;

+

+  public GazateerSearcher() {

+  }

+

+  public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {

+    ArrayList<GazateerEntry> linkedData = new ArrayList<>();

+    try {

+

+

+      if (geonamesIndex == null) {

+        String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");

+        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");

+        scoreCutoff = Double.valueOf(cutoff);

+        geonamesIndex = new MMapDirectory(new File(indexloc));

+        geonamesReader = DirectoryReader.open(geonamesIndex);

+        geonamesSearcher = new IndexSearcher(geonamesReader);

+        geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);

+      }

+

+      String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " & CC1:" + code.toUpperCase();// + "~1.0";

+      QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);

+      Query q = parser.parse(luceneQueryString);

+

+

+      TopDocs search = geonamesSearcher.search(q, rowsReturned);

+      double maxScore = (double) search.getMaxScore();

+

+      for (int i = 0; i < search.scoreDocs.length; ++i) {

+        GazateerEntry entry = new GazateerEntry();

+        int docId = search.scoreDocs[i].doc;

+        double sc = search.scoreDocs[i].score;

+

+        entry.getScoreMap().put("lucene", sc);

+       

+        entry.getScoreMap().put("rawlucene", sc);

+        entry.setIndexID(docId + "");

+        entry.setSource("geonames");

+

+        Document d = geonamesSearcher.doc(docId);

+        List<IndexableField> fields = d.getFields();

+        for (int idx = 0; idx < fields.size(); idx++) {

+          String value = d.get(fields.get(idx).name());

+          value = value.toLowerCase();

+          switch (idx) {

+            case 1:

+              entry.setItemID(value);

+              break;

+            case 3:

+              entry.setLatitude(Double.valueOf(value));

+              break;

+            case 4:

+              entry.setLongitude(Double.valueOf(value));

+              break;

+            case 10:

+              entry.setItemType(value);

+              break;

+            case 12:

+              entry.setItemParentID(value);

+              break;

+            case 23:

+              entry.setItemName(value);

+              break;

+          }

+          entry.getIndexData().put(fields.get(idx).name(), value);

+        }

+        //only keep it if the country code is a match

+        if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {

+          linkedData.add(entry);

+        }

+      }

+

+      normalize(linkedData, 0d, maxScore);

+      prune(linkedData);

+    } catch (IOException | ParseException ex) {

+      System.err.println(ex);

+    }

+    return linkedData;

+  }

+

+  public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {

+    ArrayList<GazateerEntry> linkedData = new ArrayList<>();

+    try {

+

+

+      if (usgsIndex == null) {

+        String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

+        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");

+        scoreCutoff = Double.valueOf(cutoff);

+        usgsIndex = new MMapDirectory(new File(indexloc));

+        usgsReader = DirectoryReader.open(usgsIndex);

+        usgsSearcher = new IndexSearcher(usgsReader);

+        usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);

+      }

+

+      String luceneQueryString = "FEATURE_NAME:" + searchString + " OR MAP_NAME: " + searchString;

+      QueryParser parser = new QueryParser(Version.LUCENE_45, "FEATURE_NAME", usgsAnalyzer);

+      Query q = parser.parse(luceneQueryString);

+

+

+      TopDocs search = usgsSearcher.search(q, rowsReturned);

+      double maxScore = (double) search.getMaxScore();

+

+

+      for (int i = 0; i < search.scoreDocs.length; ++i) {

+        GazateerEntry entry = new GazateerEntry();

+        int docId = search.scoreDocs[i].doc;

+        double sc = search.scoreDocs[i].score;

+        //keep track of the min score for normalization

+

+        entry.getScoreMap().put("lucene", sc);

+        entry.getScoreMap().put("rawlucene", sc);

+        entry.setIndexID(docId + "");

+        entry.setSource("usgs");

+        entry.setItemParentID("us");

+

+

+        Document d = usgsSearcher.doc(docId);

+        List<IndexableField> fields = d.getFields();

+        for (int idx = 0; idx < fields.size(); idx++) {

+          String value = d.get(fields.get(idx).name());

+          value = value.toLowerCase();

+          switch (idx) {

+            case 0:

+              entry.setItemID(value);

+              break;

+            case 1:

+              entry.setItemName(value);

+              break;

+            case 2:

+              entry.setItemType(value);

+              break;

+            case 9:

+              entry.setLatitude(Double.valueOf(value));

+              break;

+            case 10:

+              entry.setLongitude(Double.valueOf(value));

+              break;

+          }

+          entry.getIndexData().put(fields.get(idx).name(), value);

+        }

+        linkedData.add(entry);

+

+

+      }

+

+      normalize(linkedData, 0d, maxScore);

+      prune(linkedData);

+    } catch (IOException | ParseException ex) {

+      System.err.println(ex);

+    }

+

+    return linkedData;

+  }

+

+  private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {

+    for (GazateerEntry gazateerEntry : linkedData) {

+

+      double luceneScore = gazateerEntry.getScoreMap().get("lucene");

+      luceneScore = normalize(luceneScore, minScore, maxScore);

+      luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;

+      luceneScore = Double.isNaN(luceneScore) ? 0.001 : luceneScore;

+      gazateerEntry.getScoreMap().put("lucene", luceneScore);

+    }

+  }

+

+  private void prune(ArrayList<GazateerEntry> linkedData) {

+    for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {

+      GazateerEntry ge = itr.next();

+      if (ge.getScoreMap().get("lucene") < scoreCutoff) {

+        itr.remove();

+      }

+    }

+  }

+

+  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

+    Double d = (valueToNormalize - minimum) / (maximum - minimum);
+    d = (d.isNaN() || d.isInfinite()) ? 0d : d;

+    return d;

+  }

+}
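
A sketch of querying the USGS index directly, assuming EntityLinkerProperties can be built from a properties file and that the file points opennlp.geoentitylinker.gaz.usgs at an index produced by GazateerIndexer; the properties file path and search term are hypothetical.

package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

import java.io.File;
import opennlp.tools.entitylinker.EntityLinkerProperties;

public class GazateerSearcherExample {

  public static void main(String[] args) throws Exception {
    // hypothetical properties file defining opennlp.geoentitylinker.gaz.usgs
    // (and optionally opennlp.geoentitylinker.gaz.lucenescore.min)
    EntityLinkerProperties props = new EntityLinkerProperties(new File("linker.properties"));

    GazateerSearcher searcher = new GazateerSearcher();
    for (GazateerEntry entry : searcher.usgsFind("denver", 3, props)) {
      System.out.println(entry.getItemName() + " "
          + entry.getLatitude() + "," + entry.getLongitude()
          + " lucene=" + entry.getScoreMap().get("lucene"));
    }
  }
}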

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
new file mode 100644
index 0000000..fe58e0d
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
@@ -0,0 +1,129 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.ArrayList;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import opennlp.tools.entitylinker.domain.BaseLink;

+import opennlp.tools.entitylinker.domain.LinkedSpan;

+import opennlp.tools.util.Span;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+import opennlp.tools.entitylinker.EntityLinker;

+/**

+ * Links location entities to gazetteers. Currently supports the NGA GeoNames
+ * and USGS gazetteers, stored in Lucene indexes built by GazateerIndexer and
+ * queried through GazateerSearcher.

+ *

+ *

+ */

+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

+

+  // CountryProximityScorer scorer = new CountryProximityScorer();

+//  private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();

+//  private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();

+  private CountryContext countryContext;

+  private Map<String, Set<Integer>> countryMentions;

+  private EntityLinkerProperties linkerProperties;

+  private GazateerSearcher gazateerSearcher = new GazateerSearcher();

+  /**

+   * Flag for deciding whether to search gaz only for toponyms within countries

+   * that are mentioned in the document

+   */

+  private Boolean filterCountryContext = true;

+

+  public GeoEntityLinker() {

+    countryContext = new CountryContext();

+  }

+

+  @Override

+  public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {

+    ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();

+

+    if (linkerProperties == null) {

+      throw new IllegalArgumentException("EntityLinkerProperties cannot be null");

+    }

+    countryMentions = countryContext.regexfind(doctext, linkerProperties);

+

+    for (int s = 0; s < sentences.length; s++) {

+      Span[] names = namesBySentence[s];

+      String[] tokens = tokensBySentence[s];

+      String[] matches = Span.spansToStrings(names, tokens);

+

+      for (int i = 0; i < matches.length; i++) {

+

+//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document

+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();

+        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {

+          // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);

+          for (String code : countryMentions.keySet()) {

+            if (!code.equals("us")) {

+              geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code, linkerProperties));

+            }

+          }

+

+        }

+        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();

+        if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {

+          //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);

+          usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));

+        }

+        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());

+

+        if (!usgsEntries.isEmpty()) {

+          geoSpan.getLinkedEntries().addAll(usgsEntries);

+          geoSpan.setSearchTerm(matches[i]);

+        }

+

+        if (!geoSpan.getLinkedEntries().isEmpty()) {

+          geoSpan.setSearchTerm(matches[i]);

+          geoSpan.setSentenceid(s);

+          spans.add(geoSpan);

+        }

+      }

+    }

+

+    List<LinkedEntityScorer<CountryContext>> scorers = new ArrayList<>();

+    scorers.add(new FuzzyStringMatchScorer());

+    scorers.add(new GeoHashBinningScorer());

+    scorers.add(new CountryProximityScorer());

+

+    for (LinkedEntityScorer scorer : scorers) {

+      scorer.score(spans, doctext, sentences, countryContext);

+    }

+    return spans;

+  }

+

+  @Override

+  public void setEntityLinkerProperties(EntityLinkerProperties properties) {

+    this.linkerProperties = properties;

+  }

+

+  @Override

+  public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {

+    throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");

+  }

+

+  @Override

+  public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {

+    throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");

+  }

+

+  @Override

+  public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {

+    throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");

+  }

+}
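
A minimal end-to-end sketch of the linker itself. The sentence, token, and name spans are hard coded to keep the example self contained; in a real pipeline they come from OpenNLP's sentence detector, tokenizer, and name finder, and the properties file path is hypothetical. It also assumes EntityLinkerProperties can be constructed from a properties File, as in opennlp-tools 1.6.0-SNAPSHOT.

package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

import java.io.File;
import java.util.List;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;

public class GeoEntityLinkerExample {

  public static void main(String[] args) throws Exception {
    String docText = "The fire started near Denver.";

    // hard coded stand-ins for sentence detection, tokenization and NER
    Span[] sentences = {new Span(0, docText.length())};
    String[][] tokensBySentence = {{"The", "fire", "started", "near", "Denver", "."}};
    Span[][] namesBySentence = {{new Span(4, 5, "location")}}; // token span of "Denver"

    GeoEntityLinker linker = new GeoEntityLinker();
    linker.setEntityLinkerProperties(new EntityLinkerProperties(new File("linker.properties")));

    List<LinkedSpan> linkedSpans = linker.find(docText, sentences, tokensBySentence, namesBySentence);
    for (LinkedSpan span : linkedSpans) {
      System.out.println(span.getSearchTerm() + " -> " + span.getLinkedEntries());
    }
  }
}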

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
new file mode 100644
index 0000000..7a87ee6
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
@@ -0,0 +1,275 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.TreeMap;

+import java.util.TreeSet;

+import opennlp.tools.entitylinker.domain.BaseLink;

+import opennlp.tools.entitylinker.domain.LinkedSpan;

+import opennlp.tools.util.Span;

+

+/**

+ * Scores toponyms based on geographic point binning (clustering). This class's output is highly dependent on the quality
+ * of points returned from the gazetteer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
+ * Lucene search is set to an appropriate level so this class is not fed poor data.

+ */

+public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {

+

+  @Override

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {

+     score( linkedSpans);

+  }

+

+  private  void score(List<LinkedSpan> geospans) {

+    Map<Double, Double> latLongs = new HashMap<Double, Double>();

+

+    /**

+     * collect all the lat longs

+     */

+    for (LinkedSpan<BaseLink> ls : geospans) {

+      for (BaseLink bl : ls.getLinkedEntries()) {

+        if (bl instanceof GazateerEntry) {

+          GazateerEntry entry = (GazateerEntry) bl;

+          latLongs.put(entry.getLatitude(), entry.getLongitude());

+        

+        }

+      }

+    }

+

+    /**

+     * convert to geohash and add to sortedset

+     */

+    TreeSet<Long> geoHashes = new TreeSet<Long>();

+    for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {

+      geoHashes.add(geoHash(entry.getKey(), entry.getValue()));

+    }

+    /**

+     * bin the points and generate a scoremap

+     */

+    Map<Long, Set<Long>> bins = bin(geoHashes);

+    Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);

+    /**

+     * iterate over the data again and assign the score based on the bins

+     */

+    for (LinkedSpan<BaseLink> ls : geospans) {

+      for (BaseLink bl : ls.getLinkedEntries()) {

+        Long geohash = -1L;

+        Double score = 0d;

+        if (bl instanceof GazateerEntry) {

+          GazateerEntry entry = (GazateerEntry) bl;

+          geohash = geoHash(entry.getLatitude(), entry.getLongitude());

+        

+        }

+        if (scores.containsKey(geohash)) {

+          score = scores.get(geohash);

+

+        } else {

+          for (Long bin : bins.keySet()) {

+            if (bin.equals(geohash) || bins.get(bin).contains(geohash)) {

+              score = scores.get(bin);

+              break;

+            }

+          }

+        }

+        bl.getScoreMap().put("geohashbin", score);

+      }

+    }

+

+

+  }

+
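+  /**

+   * Shifts a latitude or longitude into a positive range (adding 90 or 180), scales it by

+   * 1,000,000, and pads or truncates the result to exactly 8 digits so the two halves can

+   * be digit-interleaved by geoHash below.

+   */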

+  private Long normalize(Double coordpart, Boolean isLat) {

+    Integer add = isLat ? 90 : 180;

+    coordpart = Math.abs(coordpart + add);

+    coordpart = coordpart * 1000000;

+

+    Long l = Math.round(coordpart);

+    String coord = String.valueOf(l);

+    if (coord.length() < 8) {

+      while (coord.length() < 8) {

+        coord += "0";

+      }

+    }

+    coord = coord.substring(0, 8);

+    l = Long.valueOf(coord);

+    return l;

+  }

+

+  /**

+   * Interleaves a latitude and a longitude to place the coordinate in a linearly sortable

+   * space for binning simplicity.

+   *

+   * @param lat the latitude in decimal degrees

+   * @param lon the longitude in decimal degrees

+   * @return the digit-interleaved coordinate

+   */

+  private Long geoHash(double lat, double lon) {

+    Long normLat = normalize(lat, Boolean.TRUE);

+    Long normLon = normalize(lon, Boolean.FALSE);

+    String sLat = String.valueOf(normLat);

+    String sLon = String.valueOf(normLon);

+    char[] latInts = sLat.toCharArray();

+    char[] lonInts = sLon.toCharArray();

+    String geoHash = "";

+    int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;

+    for (int i = 0; i < len - 1; i++) {

+      String a = String.valueOf(latInts[i]);

+      String b = String.valueOf(lonInts[i]);

+      geoHash += a + b;

+    }

+

+    return Long.valueOf(geoHash);

+  }

+
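+  /**

+   * Groups the sorted geohashes into bins: any gap between consecutive values that exceeds

+   * the average gap becomes a break point, and each break value is mapped to the set of

+   * geohashes at or below it (down to the previous break).

+   */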

+  private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {

+    ArrayList<Long> list = new ArrayList<Long>(sets);

+    ArrayList<Long> diffs = new ArrayList<Long>();

+    /**

+     * create a set of differences between the points

+     */

+    for (int i = 0; i < list.size() - 1; i++) {

+      Long n = list.get(i + 1);

+      Long v = list.get(i);

+      diffs.add(Math.abs(n - v));

+    }

+    /**

+     * generate an average "distance" between the normed points

+     */

+    Long sum = 0L;

+    for (Long l : diffs) {

+      sum += l;

+    }

+    Long avg = sum;

+    if (!diffs.isEmpty()) {

+      avg = sum / diffs.size();

+    }

+

+

+    /**

+     * generate break values where the disparity is greater than the average

+     */

+    TreeSet<Long> breaks = new TreeSet<Long>();

+    for (int i = 0; i < list.size() - 1; i++) {

+      Long n = list.get(i + 1);

+      Long v = list.get(i);

+      //Long percent = 100 - (v / n * 100);

+      Long diff = n - v;

+      if (diff > avg) {

+        breaks.add(v);

+      }

+    }

+    /**

+     * based on the break values, place subsets of close points into bins

+     */

+    TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();

+    Long lastBreak = -1L;

+    for (Long br : breaks) {

+      if (lastBreak == -1L) {

+        binToAmount.put(br, sets.subSet(0L, true, br, true));

+      } else {

+        binToAmount.put(br, sets.subSet(lastBreak, false, br, true));

+      }

+      lastBreak = br;

+    }

+    lastBreak = sets.higher(lastBreak);

+    if (lastBreak != null) {

+      binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));

+      if (binToAmount.get(lastBreak).isEmpty()) {

+        binToAmount.get(lastBreak).add(lastBreak);

+      }

+    }

+    /**

+     * "binToAmount" is a map of the break value to all the points behind it

+     * (it's sorted), so the key is the max value of its set of values

+     */

+    return binToAmount;

+  }

+

+  /**

+   * Returns a map of geohashes to their score.

+   *

+   * @param binToAmount map of each break value to the set of geohashes in its bin

+   * @return a map of geohash to score

+   */

+  private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {

+    TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();

+    TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();

+    /**

+     * if there are zero or one bins, everything gets a rank of 1

+     */

+    if (binToAmount.size() <= 1) {

+      for (Long bin : binToAmount.keySet()) {

+        ranks.put(bin, 1d);

+      }

+      return ranks;

+    }

+    int total = 0;

+    /**

+     * generate a total number of points

+     */

+    for (Set<Long> geohashes : binToAmount.values()) {

+      total += geohashes.size();

+    }

+    /**

+     * divide total by bin size, largest bin size gets best score, everything in

+     * that bin gets that score because it is part of that primary cluster

+     * TODO... do an extra iteration of clustering within the predominant

+     * cluster to refine the scoring or make the basis of the binning more

+     * granular than > avg

+     */

+    TreeSet<Double> rankSet = new TreeSet<Double>();

+    for (Long key : binToAmount.keySet()) {

+      int size = binToAmount.get(key).size();

+      Double rank = (double) total / size;

+      rankSet.add(rank);

+      ranks.put(key, rank);

+    }

+    /**

+     * load the final score map with normalized values

+     */

+    for (Map.Entry<Long, Double> rank : ranks.entrySet()) {

+      double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);

+      double reverse = Math.abs(norm - 1);

+      double score = reverse > 1d ? 1.0 : reverse;

+      normRanks.put(rank.getKey(), score);

+    }

+

+    return normRanks;

+  }

+

+  /**

+   * transposes a number in a range to a double between 0 and 1

+   *

+   * @param valueToNormalize the value to be normalized (placed within a new

+   *                         range of 0-1)

+   * @param minimum          the min of the current range

+   * @param maximum          the max of the current range

+   * @return the value transposed into the range 0..1

+   */

+  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

+    return (valueToNormalize - minimum) / (maximum - minimum);

+  }

+}

+
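For reference, the following is a minimal, self-contained sketch (not part of this patch) of the digit-interleaving idea that normalize(...) and geoHash(...) above rely on: after shifting coordinates into a positive range, nearby points share leading digits, so interleaving the latitude and longitude digits produces values that sort close together on the number line. The class and method names (InterleaveDemo, interleave) are illustrative only.

    // Illustrative sketch only; mirrors normalize(...) and geoHash(...) in GeoHashBinningScorer.
    public class InterleaveDemo {

      // shift the coordinate into a positive range, scale it, and pad/truncate to 8 digits
      static long normalize(double coordPart, boolean isLat) {
        int add = isLat ? 90 : 180;
        long scaled = Math.round(Math.abs(coordPart + add) * 1000000);
        String s = String.valueOf(scaled);
        while (s.length() < 8) {
          s += "0";
        }
        return Long.parseLong(s.substring(0, 8));
      }

      // interleave the digits of the normalized latitude and longitude
      static long interleave(double lat, double lon) {
        char[] a = String.valueOf(normalize(lat, true)).toCharArray();
        char[] b = String.valueOf(normalize(lon, false)).toCharArray();
        StringBuilder sb = new StringBuilder();
        int len = Math.min(a.length, b.length);
        for (int i = 0; i < len - 1; i++) {
          sb.append(a[i]).append(b[i]);
        }
        return Long.parseLong(sb.toString());
      }

      public static void main(String[] args) {
        // the first two points (both in Washington, DC) interleave to nearby values,
        // while the third (Sydney) lands far away on the number line
        System.out.println(interleave(38.8951, -77.0364));
        System.out.println(interleave(38.8977, -77.0365));
        System.out.println(interleave(-33.8688, 151.2093));
      }
    }

Because the two nearby points differ by far less than the average gap to the distant one, bin(...) above places a break between the pair and the distant value, groups the pair into the same bin, and that bin membership is what drives their "geohashbin" score.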

diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
new file mode 100644
index 0000000..a70a628
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
@@ -0,0 +1,37 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.util.List;

+import opennlp.tools.entitylinker.domain.LinkedSpan;

+import opennlp.tools.util.Span;

+

+/**

+ * Structure for scoring linked entities. The score map logically represents a pair:

+ * "score type" to "actual score."

+ */

+public interface LinkedEntityScorer<T> {

+

+/**

+ * Scores a collection of linked entities. Implementations should populate the scoreMap of each BaseLink in every linkedSpan.

+ * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored

+ * @param docText the full text of the document

+ * @param sentenceSpans the sentence spans that correspond to the document text

+ * @param additionalContext any additional data required to perform the scoring operation

+ */

+  void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, T additionalContext);

+}
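A minimal sketch of an implementation, using only the accessors this patch already relies on (LinkedSpan.getLinkedEntries() and BaseLink.getScoreMap()); the class name ConstantScorer and the "constant" score key are illustrative, not part of the patch.

    package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

    import java.util.List;
    import opennlp.tools.entitylinker.domain.BaseLink;
    import opennlp.tools.entitylinker.domain.LinkedSpan;
    import opennlp.tools.util.Span;

    // Illustrative sketch only: gives every candidate link the same score.
    public class ConstantScorer implements LinkedEntityScorer<CountryContext> {

      @Override
      public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
        for (LinkedSpan<BaseLink> ls : linkedSpans) {
          for (BaseLink bl : ls.getLinkedEntries()) {
            // record a named score on each link, as GeoHashBinningScorer does with "geohashbin"
            bl.getScoreMap().put("constant", 1d);
          }
        }
      }
    }

A scorer such as GeoHashBinningScorer follows the same pattern: it receives the full list of linked spans for a document, derives whatever evidence it needs (in that case, geographic clustering), and records a named score on each BaseLink so downstream consumers can combine multiple score types.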

diff --git a/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java b/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java
new file mode 100644
index 0000000..60ea0f2
--- /dev/null
+++ b/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java
@@ -0,0 +1,38 @@
+package apache.opennlp.addons;

+

+import junit.framework.Test;

+import junit.framework.TestCase;

+import junit.framework.TestSuite;

+

+/**

+ * Unit test for simple App.

+ */

+public class AppTest 

+    extends TestCase

+{

+    /**

+     * Create the test case

+     *

+     * @param testName name of the test case

+     */

+    public AppTest( String testName )

+    {

+        super( testName );

+    }

+

+    /**

+     * @return the suite of tests being tested

+     */

+    public static Test suite()

+    {

+        return new TestSuite( AppTest.class );

+    }

+

+    /**

+     * Rigorous Test :-)

+     */

+    public void testApp()

+    {

+        assertTrue( true );

+    }

+}