OPENNLP-706 OPENNLP-707 OPENNLP-708 OPENNLP-709 OPENNLP-710 Addressed each ticket. Also adjusted the package structure a bit to separate responsibility better.

commit: 33446b94d0612eadb13492b3ea7e5f9a3d245595 [log] [tgz]
author: Mark Giaconia <markg@apache.org> Fri Jul 11 01:04:58 2014 +0000
committer: Mark Giaconia <markg@apache.org> Fri Jul 11 01:04:58 2014 +0000
tree: 256a55ca02918960313c44360d3a8e331cdc9965
parent: 1229638ad20d70905d8761cb4b2c6b47e5e7c48a [diff]
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
new file mode 100644
index 0000000..638e603
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java

@@ -0,0 +1,125 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker;

+

+import java.util.Objects;

+

+/**

+ * Stores an admin boundary down to the US county level. Only US places from the

+ * USGS Gazetteer will have county level info.

+ *

+ * @author mgiaconia

+ */

+public class AdminBoundary {

+

+  private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND";

+  private final String countryCode;

+  private final String provinceCode;

+  private final String provinceName;

+  private final String countryName;

+  private final String countyName;

+  private final String countyCode;

+

+  public AdminBoundary(String cc, String ac, String pname, String countryName) {

+    this.countryCode = cc;

+    this.provinceCode = ac;

+    this.provinceName = pname;

+    this.countryName = countryName;

+    this.countyCode = NO_DATA_FOUND_VALUE;

+    this.countyName = NO_DATA_FOUND_VALUE;

+  }

+

+  public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName) {

+    this.countryCode = countryCode;

+    this.provinceCode = provinceCode;

+    this.provinceName = provinceName;

+    this.countryName = countryName;

+    this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName;

+    this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode;

+  }

+

+  public String getCountryCode() {

+    return countryCode;

+  }

+

+  public String getProvCode() {

+    return provinceCode;

+  }

+

+  public String getProvinceName() {

+    return provinceName;

+  }

+

+  public String getCountryName() {

+    return countryName;

+  }

+

+  public String getCountyName() {

+    return countyName;

+  }

+

+  public String getCountyCode() {

+    return countyCode;

+  }

+

+  @Override

+  public String toString() {

+    return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}';

+  }

+

+  @Override

+  public int hashCode() {

+    int hash = 7;

+    hash = 11 * hash + Objects.hashCode(this.countryCode);

+    hash = 11 * hash + Objects.hashCode(this.provinceCode);

+    hash = 11 * hash + Objects.hashCode(this.provinceName);

+    hash = 11 * hash + Objects.hashCode(this.countryName);

+    hash = 11 * hash + Objects.hashCode(this.countyName);

+    hash = 11 * hash + Objects.hashCode(this.countyCode);

+    return hash;

+  }

+

+  @Override

+  public boolean equals(Object obj) {

+    if (obj == null) {

+      return false;

+    }

+    if (getClass() != obj.getClass()) {

+      return false;

+    }

+    final AdminBoundary other = (AdminBoundary) obj;

+    if (!Objects.equals(this.countryCode, other.countryCode)) {

+      return false;

+    }

+    if (!Objects.equals(this.provinceCode, other.provinceCode)) {

+      return false;

+    }

+    if (!Objects.equals(this.provinceName, other.provinceName)) {

+      return false;

+    }

+    if (!Objects.equals(this.countryName, other.countryName)) {

+      return false;

+    }

+    if (!Objects.equals(this.countyName, other.countyName)) {

+      return false;

+    }

+    if (!Objects.equals(this.countyCode, other.countyCode)) {

+      return false;

+    }

+    return true;

+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
new file mode 100644
index 0000000..323aabb
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java

@@ -0,0 +1,138 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker;

+

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Map;

+import java.util.Set;

+

+/**

+ *

+ * @author mgiaconia

+ */

+public class AdminBoundaryContext {

+

+  private final Map<String, Set<Integer>> countryMentions;

+  private final Map<String, Set<Integer>> provMentions;

+  private final Map<String, Set<Integer>> countyMentions;

+  private final Set<String> countryHits;

+  private final Set<String> provHits;

+  private final Set<String> countyHits;

+  private final Map<String, String> countryRefMap;

+  private final Map<String, Map<String, String>> provRefMap;

+  private final Map<String, Map<String, String>> countyRefMap;

+  private final Set<String> whereClauses;

+  private final Map<String, Set<String>> nameCodesMap;

+

+  public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,

+          Map<String, Set<Integer>> provMentions,

+          Map<String, Set<Integer>> countyMentions,

+          Set<String> countryHits,

+          Set<String> provHits,

+          Set<String> countyHits,

+          Map<String, String> countryRefMap,

+          Map<String, Map<String, String>> provRefMap,

+          Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap) {

+    this.countryMentions = countryMentions;

+    this.provMentions = provMentions;

+    this.countyMentions = countyMentions;

+    this.countryHits = countryHits;

+    this.provHits = provHits;

+    this.countyHits = countyHits;

+    this.countryRefMap = countryRefMap;

+    this.provRefMap = provRefMap;

+    this.countyRefMap = countyRefMap;

+    this.whereClauses = setWhereClauses();

+    this.nameCodesMap = nameCodesMap;

+  }

+

+  public Map<String, Set<String>> getNameCodesMap() {

+    return nameCodesMap;

+  }

+

+  public Map<String, Set<Integer>> getCountryMentions() {

+    return countryMentions;

+  }

+

+  public Map<String, Set<Integer>> getProvMentions() {

+    return provMentions;

+  }

+

+  public Map<String, Set<Integer>> getCountyMentions() {

+    return countyMentions;

+  }

+

+  public Set<String> getCountryHits() {

+    return countryHits;

+  }

+

+  public Set<String> getProvHits() {

+    return provHits;

+  }

+

+  public Set<String> getCountyHits() {

+    return countyHits;

+  }

+

+  public Map<String, String> getCountryRefMap() {

+    return countryRefMap;

+  }

+

+  public Map<String, Map<String, String>> getProvRefMap() {

+    return provRefMap;

+  }

+

+  public Map<String, Map<String, String>> getCountyRefMap() {

+    return countyRefMap;

+  }

+

+  public Set<String> getWhereClauses() {

+    return whereClauses;

+  }

+

+  private Set<String> setWhereClauses() {

+    Set<String> clauses = new HashSet<>();

+    for (String countryCode : this.getCountryHits()) {

+      String gazType = countryCode.toLowerCase().equals("us") ? " AND gazsource:usgs" : " AND gazsource:geonames";

+      if (countryCode.toLowerCase().matches(".*rg[0-9].*")) {

+        gazType = " AND gazsource:region";

+      }

+      Map<String, String> provsForCountry = this.getProvRefMap().get(countryCode);

+      if (provsForCountry == null) {

+        provsForCountry = new HashMap<>();

+      }

+      Map<String, String> provs = new HashMap<>();

+

+      if (!provsForCountry.isEmpty()) {

+        for (String pcode : provsForCountry.keySet()) {

+          if (this.getProvHits().contains(pcode)) {

+            provs.put(pcode, provsForCountry.get(pcode));

+

+            clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType);

+

+          }

+        }

+      }

+      if (provs.isEmpty()) {

+        //got a country with no mentioned provs

+        clauses.add(" countrycode:" + countryCode + gazType);

+      }

+    }

+    return clauses;

+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
new file mode 100644
index 0000000..c09afbd
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java

@@ -0,0 +1,406 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Level;

+

+import java.util.regex.Matcher;

+import java.util.regex.Pattern;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+import org.apache.log4j.Logger;

+

+/**

+ * Finds instances of country mentions in a String, typically a document text.

+ * Used to boost or degrade scoring of linked geo entities

+ *

+ */

+public class AdminBoundaryContextGenerator {

+

+  private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class);

+  private List<CountryContextEntry> countrydata;

+  private Map<String, Set<String>> nameCodesMap = new HashMap<>();

+  private Map<String, Set<Integer>> countryMentions = new HashMap<>();

+  private Set<CountryContextEntry> countryHits = new HashSet<>();

+  private EntityLinkerProperties properties;

+  private List<AdminBoundary> adminBoundaryData;

+  private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();

+  private AdminBoundaryContext context;

+

+  public AdminBoundaryContext getContext(String text) {

+    context = null;

+    nameCodesMap.clear();

+    context = process(text);

+

+    return context;

+  }

+

+  private Set<String> countryHitSet = new HashSet<>();

+  private Map<String, String> countryMap = new HashMap<>();

+  private Map<String, Map<String, String>> provMap = new HashMap<>();

+  private Map<String, Map<String, String>> countyMap = new HashMap<>();

+

+  private Map<String, Set<Integer>> provMentions = new HashMap<>();

+  private Map<String, Set<Integer>> countyMentions = new HashMap<>();

+

+  private Set<String> provHits = new HashSet<String>();

+  private Set<String> countyHits = new HashSet<String>();

+

+  public static void main(String[] args) {

+    try {

+      AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));

+      GeoEntityLinker linker = new GeoEntityLinker();

+      linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));

+

+      countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a shithole. Eastern Africa people are cool.");

+

+    } catch (Exception ex) {

+      java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);

+    }

+  }

+

+  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws Exception {

+    this.properties = properties;

+    if (countrydata == null) {

+      String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

+

+      File countryContextFile = new File(path);

+      //countrydata = getCountryContextFromFile(countryContextFile);

+      adminBoundaryData = getContextFromFile(countryContextFile);

+    }

+  }

+

+  public Map<String, Set<Integer>> getCountryMentions() {

+    return countryMentions;

+  }

+

+  /**

+   * Returns the last set of hits after calling regexfind.

+   *

+   * @return

+   */

+  public Set<CountryContextEntry> getCountryHits() {

+    return countryHits;

+  }

+

+  /**

+   * Returns the last name-to-codes map after calling regexfind.

+   *

+   * @return

+   */

+  public Map<String, Set<String>> getNameCodesMap() {

+    return nameCodesMap;

+  }

+

+  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {

+    this.nameCodesMap = nameCodesMap;

+  }

+

+  private void reset() {

+    this.nameCodesMap.clear();

+    this.countryHitSet.clear();

+    this.countryHits.clear();

+    this.countryMentions.clear();

+    this.provHits.clear();

+    this.provMentions.clear();

+    this.countyHits.clear();

+    this.countyMentions.clear();

+    this.adminBoundaryHits.clear();

+  }

+

+  /**

+   * Finds indicators of countries, provinces, and counties, as per the USGS and

+   * Geonames gazetteers. The results of this are used to score toponyms

+   * downstream. The full text of a document should be passed in here.

+   *

+   * @param text the full text of the document (block of text).

+   * @return

+   */

+  private AdminBoundaryContext process(String text) {

+    try {

+      if (text.contains("Convoy of terror")) {

+        System.out.println("");

+      }

+      reset();

+      Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);

+      if (!countryhitMap.isEmpty()) {

+        for (String cc : countryhitMap.keySet()) {

+          Map<String, String> provsForCc = provMap.get(cc);

+          if (provsForCc != null) {

+            provMentions = regexfind(text, provsForCc, provHits);

+            if (provMentions != null) {

+              for (String prov : provMentions.keySet()) {

+                Map<String, String> get = countyMap.get(prov);

+                if (get != null) {

+                  countyMentions = regexfind(text, get, countyHits);

+                }

+              }

+            }

+          }

+        }

+      } else {

+        for (Map<String, String> provsForCc : provMap.values()) {

+          if (provsForCc != null) {

+            provMentions = regexfind(text, provsForCc, provHits);

+            if (provMentions != null) {

+              for (String prov : provMentions.keySet()) {

+                //fake a country hit based on a province hit... this gets fuzzy

+                String cc = prov.split("\\.")[0];

+                if (!countryhitMap.containsKey(cc)) {

+                  countryhitMap.put(cc, provMentions.get(prov));

+                  countryHitSet.add(cc);

+                } else {

+                  countryhitMap.get(cc).addAll(provMentions.get(prov));

+                }

+                Map<String, String> get = countyMap.get(prov);

+                if (get != null) {

+                  countyMentions = regexfind(text, get, countyHits);

+                }

+              }

+            }

+          }

+        }

+      }

+

+      Map<String, String> countryRefMap = new HashMap<>();

+

+      for (String c : countryHitSet) {

+        String countryName = countryMap.get(c);

+        if (countryName != null) {

+          countryRefMap.put(c, countryName);

+        }

+      }

+

+      AdminBoundaryContext context = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits, countryRefMap, provMap, countyMap, nameCodesMap);

+

+      return context;

+    } catch (Exception e) {

+      e.printStackTrace();

+    }

+    return null;

+  }

+

+  /**

+   * Finds mentions of countries to assist in toponym resolution. Countries are

+   * discovered via regex based on a configured file called

+   * opennlp.geoentitylinker.countrycontext.txt. The file is configured using

+   * the entitylinker.properties file as such:

+   * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt

+   *

+   * Finding mentions in documents is very helpful for scoring. Lazily loads the

+   * list from the file.

+   *

+   * @param docText the full text of the document

+   * @return

+   */

+  @Deprecated

+  public Map<String, Set<Integer>> regexfind(String docText) {

+    countryMentions = new HashMap<>();

+    nameCodesMap.clear();

+    try {

+

+      for (CountryContextEntry entry : countrydata) {

+        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

+        Matcher rs = regex.matcher(docText);

+        String code = entry.getCc1().toLowerCase();

+

+        boolean found = false;

+        while (rs.find()) {

+          found = true;

+          Integer start = rs.start();

+          String hit = rs.group().toLowerCase();

+          if (countryMentions.containsKey(code)) {

+            countryMentions.get(code).add(start);

+          } else {

+            Set<Integer> newset = new HashSet<Integer>();

+            newset.add(start);

+            countryMentions.put(code, newset);

+          }

+          if (!hit.equals("")) {

+            if (this.nameCodesMap.containsKey(hit)) {

+              nameCodesMap.get(hit).add(code);

+            } else {

+              HashSet<String> newset = new HashSet<String>();

+              newset.add(code);

+              nameCodesMap.put(hit, newset);

+            }

+          }

+        }

+        if (found) {

+          countryHits.add(entry);

+        }

+

+      }

+

+    } catch (Exception ex) {

+      LOGGER.error(ex);

+    }

+

+    return countryMentions;

+  }

+

+  /**

+   * Discovers indicators of admin boundary data using regex.

+   *

+   * @param docText the full text

+   * @param lookupMap a map used to find names; the key is a location code and

+   * the value is the actual name.

+   * @param hitsRef a reference to a set that stores the hits by id

+   * @return

+   */

+  private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef) {

+    Map<String, Set<Integer>> mentions = new HashMap<>();

+    if (lookupMap == null) {

+      return mentions;

+    }

+    try {

+

+      for (String entry : lookupMap.keySet()) {

+        String name = lookupMap.get(entry).toLowerCase();

+        if (name == null) {

+          continue;

+        }

+        name = "[^\\p{L}\\p{Nd}]" + name.replace(", the", "") + "[^\\p{L}\\p{Nd}]";

+        Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

+        Matcher rs = regex.matcher(docText);

+        String code = entry.toLowerCase();

+

+        boolean found = false;

+        while (rs.find()) {

+          found = true;

+          Integer start = rs.start();

+          String hit = rs.group().toLowerCase().trim();

+          hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", "");

+          if (mentions.containsKey(code)) {

+            mentions.get(code).add(start);

+          } else {

+            Set<Integer> newset = new HashSet<Integer>();

+            newset.add(start);

+            mentions.put(code, newset);

+          }

+          if (!hit.equals("")) {

+            if (this.nameCodesMap.containsKey(hit)) {

+              nameCodesMap.get(hit).add(code);

+            } else {

+              HashSet<String> newset = new HashSet<String>();

+              newset.add(code);

+              nameCodesMap.put(hit, newset);

+            }

+          }

+

+        }

+        if (found) {

+          hitsRef.add(code);

+

+        }

+      }

+

+    } catch (Exception ex) {

+      LOGGER.error(ex);

+      ex.printStackTrace();

+

+    }

+

+    return mentions;

+  }

+

+  private List<AdminBoundary> getContextFromFile(File countryContextFile) {

+    if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {

+      return adminBoundaryData;

+    }

+    List<AdminBoundary> entries = new ArrayList<>();

+    BufferedReader reader;

+    try {

+      reader = new BufferedReader(new FileReader(countryContextFile));

+      String line = "";

+      while ((line = reader.readLine()) != null) {

+        String[] values = line.split("\t");

+        int len = values.length;

+        if (len < 5 || len > 6) {

+          throw new IllegalArgumentException("Improperly formatted file");

+        }

+        if (values.length == 6) {

+          AdminBoundary entry = new AdminBoundary(

+                  values[0].toLowerCase().trim(),

+                  values[3].toLowerCase().trim(),

+                  values[1].toLowerCase().trim(),

+                  values[4].toLowerCase().trim(),

+                  values[2].toLowerCase().trim(),

+                  values[5].toLowerCase().trim());

+          entries.add(entry);

+        } else {

+          AdminBoundary entry = new AdminBoundary(

+                  values[0].toLowerCase().trim(),

+                  values[3].toLowerCase().trim(),

+                  values[1].toLowerCase().trim(),

+                  values[4].toLowerCase().trim(),

+                  values[2].toLowerCase().trim(),

+                  "");

+          entries.add(entry);

+        }

+

+      }

+      reader.close();

+    } catch (IOException ex) {

+      LOGGER.error(ex);

+    }

+    loadMaps(entries);

+    return entries;

+

+  }

+

+  private void loadMaps(List<AdminBoundary> boundaries) {

+    for (AdminBoundary adm : boundaries) {

+      if (!adm.getCountryCode().equals("null")) {

+        countryMap.put(adm.getCountryCode(), adm.getCountryName());

+

+        if (!adm.getProvCode().equals("null")) {

+          Map<String, String> provs = provMap.get(adm.getCountryCode());

+          if (provs == null) {

+            provs = new HashMap<>();

+          }

+          //if (!provs.containsKey(adm.getProvCode())) {

+          provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName());

+          provMap.put(adm.getCountryCode(), provs);

+          // }

+

+          if (!adm.getCountyCode().toLowerCase().equals("no_data_found") && !adm.getCountyName().toLowerCase().equals("no_data_found")) {

+            Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode());

+            if (counties == null) {

+              counties = new HashMap<>();

+            }            // if (!counties.containsKey(adm.getCountyCode())) {

+            String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode();

+            counties.put(countyid, adm.getCountyName());

+            countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties);

+            // }

+

+          }

+

+        }

+      }

+    }

+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
deleted file mode 100644
index 4aa9e16..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
+++ /dev/null

@@ -1,174 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.addons.geoentitylinker;

-

-import java.io.BufferedReader;

-import java.io.File;

-import java.io.FileReader;

-import java.io.IOException;

-import java.util.ArrayList;

-import java.util.HashMap;

-import java.util.HashSet;

-import java.util.List;

-import java.util.Map;

-import java.util.Set;

-

-import java.util.regex.Matcher;

-import java.util.regex.Pattern;

-import opennlp.tools.entitylinker.EntityLinkerProperties;

-import org.apache.log4j.Logger;

-

-/**

- * Finds instances of country mentions in a String, typically a document text.

- * Used to boost or degrade scoring of linked geo entities

- *

- */

-public class CountryContext {

-

-  private static final Logger LOGGER = Logger.getLogger(CountryContext.class);

-  private List<CountryContextEntry> countrydata;

-  private Map<String, Set<String>> nameCodesMap = new HashMap<>();

-  private Map<String, Set<Integer>> countryMentions = new HashMap<>();

-  private Set<CountryContextEntry> countryHits = new HashSet<>();

-  private EntityLinkerProperties properties;

-  

-  public CountryContext(EntityLinkerProperties properties) throws Exception {

-    this.properties = properties;

-    if (countrydata == null) {

-      String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

-      

-      File countryContextFile = new File(path);

-      countrydata = getCountryContextFromFile(countryContextFile);

-    }

-  }

-  

-  public Map<String, Set<Integer>> getCountryMentions() {

-    return countryMentions;

-  }

-

-  /**

-   * returns the last set of hits after calling regexFind

-   *

-   * @return

-   */

-  public Set<CountryContextEntry> getCountryHits() {

-    return countryHits;

-  }

-

-  /**

-   * returns the last name to codes map after calling regexFind

-   *

-   * @return

-   */

-  public Map<String, Set<String>> getNameCodesMap() {

-    return nameCodesMap;

-  }

-  

-  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {

-    this.nameCodesMap = nameCodesMap;

-  }

-

-  /**

-   * Finds mentions of countries to assist in toponym resolution. Countries are

-   * discovered via regex based on a configured file called

-   * opennlp.geoentitylinker.countrycontext.txt. the file is configured using

-   * the entitylinker.properties file as such:

-   * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt

-   *

-   * Finding mentions in documents is very helpful for scoring. Lazily loads the

-   * list from the file.

-   *

-   * @param docText the full text of the document

-   * @param properties EntityLinkerProperties for getting database connection

-   * @return

-   */

-  public Map<String, Set<Integer>> regexfind(String docText) {

-    countryMentions = new HashMap<>();

-    nameCodesMap.clear();

-    try {

-      

-      for (CountryContextEntry entry : countrydata) {

-        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

-        Matcher rs = regex.matcher(docText);

-        String code = entry.getCc1().toLowerCase();

-        

-        boolean found = false;

-        while (rs.find()) {

-          found = true;

-          Integer start = rs.start();

-          String hit = rs.group().toLowerCase();

-          if (countryMentions.containsKey(code)) {

-            countryMentions.get(code).add(start);

-          } else {

-            Set<Integer> newset = new HashSet<Integer>();

-            newset.add(start);

-            countryMentions.put(code, newset);

-          }

-          if (!hit.equals("")) {

-            if (this.nameCodesMap.containsKey(hit)) {

-              nameCodesMap.get(hit).add(code);

-            } else {

-              HashSet<String> newset = new HashSet<String>();

-              newset.add(code);

-              nameCodesMap.put(hit, newset);

-            }

-          }

-        }

-        if (found) {

-          countryHits.add(entry);

-        }

-        

-      }

-      

-    } catch (Exception ex) {

-      LOGGER.error(ex);

-    }

-    

-    return countryMentions;

-  }

-  

-  private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {

-    List<CountryContextEntry> entries = new ArrayList<>();

-    String path = countryContextFile.getPath();

-    BufferedReader reader;

-    

-    try {

-      path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

-      

-      reader = new BufferedReader(new FileReader(path));

-      

-      while (reader.read() != -1) {

-        String line = reader.readLine();

-        String[] values = line.split("\t");

-        if (values.length != 4) {

-          throw new IOException("improperly formatted country context file");

-        }

-        CountryContextEntry entry = new CountryContextEntry();

-        // rc,cc1, full_name_nd_ro,dsg

-        entry.setRc(values[0].toLowerCase());

-        entry.setCc1(values[1].toLowerCase());

-        entry.setFull_name_nd_ro(values[2].toLowerCase());

-        entry.setDsg(values[3].toLowerCase());

-        entries.add(entry);

-      }

-      reader.close();

-    } catch (IOException ex) {

-      LOGGER.error(ex);

-    }

-    return entries;

-    

-  }

-}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
index a208d78..0c37eee 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java

@@ -31,6 +31,9 @@
   private String source;

   private String indexID;

   private Map<String, String> indexData = new HashMap<>();

+  private String countryCode;

+  private String provinceCode;

+  private String hierarchy;

 

   /**

    * returns the id from the lucene document

@@ -159,5 +162,28 @@
     return true;

   }

 

+  public String getCountryCode() {

+    return countryCode;

+  }

+

+  public void setCountryCode(String countryCode) {

+    this.countryCode = countryCode;

+  }

+

+  public String getProvinceCode() {

+    return provinceCode;

+  }

+

+  public void setProvinceCode(String provinceCode) {

+    this.provinceCode = provinceCode;

+  }

+

+  public String getHierarchy() {

+    return hierarchy;

+  }

+

+  public void setHierarchy(String hierarchy) {

+    this.hierarchy = hierarchy;

+  }

 

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
deleted file mode 100644
index dd65ec7..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
+++ /dev/null

@@ -1,139 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.addons.geoentitylinker;

-

-import java.io.BufferedReader;

-import java.io.File;

-import java.io.FileReader;

-import java.util.ArrayList;

-import java.util.List;

-

-import org.apache.lucene.analysis.Analyzer;

-import org.apache.lucene.analysis.standard.StandardAnalyzer;

-import org.apache.lucene.analysis.util.CharArraySet;

-import org.apache.lucene.document.Document;

-import org.apache.lucene.document.Field;

-import org.apache.lucene.document.TextField;

-import org.apache.lucene.index.IndexWriter;

-import org.apache.lucene.index.IndexWriterConfig;

-import org.apache.lucene.store.Directory;

-import org.apache.lucene.store.MMapDirectory;

-import org.apache.lucene.util.Version;

-

-/**

- *

- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker

- */

-public class GazetteerIndexer {

-

-  public GazetteerIndexer() {

-

-  }

-

-

-  public static interface Separable {

-

-    String getSeparator();

-  }

-

-  public enum GazType implements Separable {

-

-    GEONAMES {

-              @Override

-              public String toString() {

-                return "/opennlp_geoentitylinker_geonames_idx";

-              }

-

-              @Override

-              public String getSeparator() {

-                return "\t";

-              }

-            },

-    USGS {

-              @Override

-              public String toString() {

-                return "/opennlp_geoentitylinker_usgsgaz_idx";

-              }

-

-              @Override

-              public String getSeparator() {

-                return "\\|";

-              }

-            }

-  }

-

-  /**

-   * indexes the USGS or Geonames gazateers.

-   *

-   * @param outputIndexDir     a DIRECTORY path where you would like to store

-   *                           the output lucene indexes

-   * @param gazetteerInputData the file, "as is" that was downloaded from the

-   *                           USGS and GEONAMES website

-   * @param type               indicates whether the data is USGS or GEONAMES

-   *                           format

-   * @throws Exception

-   */

-  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {

-    if (!outputIndexDir.isDirectory()) {

-      throw new IllegalArgumentException("outputIndexDir must be a directory.");

-    }

-

-    String indexloc = outputIndexDir + type.toString();

-    Directory index = new MMapDirectory(new File(indexloc));

-

-    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

-

-    IndexWriter w = new IndexWriter(index, config);

-

-    readFile(gazetteerInputData, w, type);

-    w.commit();

-    w.close();

-

-  }

-

-  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {

-    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

-    List<String> fields = new ArrayList<>();

-    int counter = 0;

-    System.out.println("reading gazetteer data from file...........");

-    while (reader.read() != -1) {

-      String line = reader.readLine();

-      String[] values = line.split(type.getSeparator());

-      if (counter == 0) {

-        for (String columnName : values) {

-          fields.add(columnName.replace("»¿", "").trim());

-        }

-

-      } else {

-        Document doc = new Document();

-        for (int i = 0; i < fields.size() - 1; i++) {         

-          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));

-        }     

-        w.addDocument(doc);

-      }

-      counter++;

-      if (counter % 100000 == 0) {

-        w.commit();

-        System.out.println(counter + " .........committed to index..............");

-      }

-

-    }

-    w.commit();

-    System.out.println("Completed indexing gaz! index name is: " + type.toString());

-  }

-

-}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
index 3049169..ac5b01e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java

@@ -28,7 +28,7 @@
   private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>();

 

 /**

- * returns the cached entries. Returns null if the query does not exists in the cache

+ * returns the cached entries. Returns null if the query does not exist in the cache

  * @param searchString

  * @return

  */


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 1f976d6..9a8be47 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java

@@ -38,6 +38,7 @@
 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import org.apache.log4j.Logger;

 import org.apache.lucene.analysis.util.CharArraySet;

+import org.apache.lucene.search.Sort;

 

 /**

  *

@@ -63,11 +64,16 @@
   private Analyzer usgsAnalyzer;

   private EntityLinkerProperties properties;

 

+  private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));

+  private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);

+  private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);

+  private Analyzer opennlpAnalyzer;

+

   public static void main(String[] args) {

     try {

       boolean b = Boolean.valueOf("true");

 

-      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");

+      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");

     } catch (IOException ex) {

       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);

     } catch (Exception ex) {

@@ -79,6 +85,112 @@
     this.properties = properties;

     init();

   }

+/**

+ * Searches the single lucene index that includes the location hierarchy.

+ *

+ * @param searchString the location name to search for

+ * @param rowsReturned how many index entries to return (top N...)

+ * @param whereClause the conditional statement that defines the index type

+ * and the country code; it is ANDed onto the place name query

+ * @return the hits whose normalized score reaches the score cutoff; empty when

+ * the cleaned search string is empty, nothing qualifies or the search fails

+ */

+  public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {

+    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

+    searchString = cleanInput(searchString);

+    if (searchString.isEmpty()) {

+      return linkedData;

+    }

+    try {

+      /**

+       * build the query: a multi-word name must also match each of its words

+       * in the hierarchy field

+       */

+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;

+      if (searchString.trim().contains(" ")) {

+        placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"

+                + " AND " + whereClause;

+      }

+

+      /**

+       * check the cache and go no further if the records already exist

+       */

+      ArrayList<GazetteerEntry> cached = GazetteerSearchCache.get(placeNameQueryString);

+      if (cached != null) {

+        return cached;

+      }

+      /**

+       * the second QueryParser argument names the default field for terms

+       * without a field prefix; it is not the query text

+       */

+      QueryParser parser = new QueryParser(Version.LUCENE_48, "placename", opennlpAnalyzer);

+      Query q = parser.parse(placeNameQueryString);

+

+      /**

+       * search(Query, int) ranks by relevance and populates the score of each

+       * hit. The Sort overload skips score computation by default, which left

+       * every score as NaN.

+       * NOTE(review): with real scores the cutoff below now actually filters

+       * hits; confirm the configured cutoff suits this normalization

+       */

+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);

+

+      for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {

+        int docId = bestDocs.scoreDocs[i].doc;

+        double sc = bestDocs.scoreDocs[i].score;

+        Document d = opennlpSearcher.doc(docId);

+

+        //a stored field can be absent, so guard before lower casing

+        String countrycode = d.get("countrycode");

+        String parentid = countrycode == null ? "" : countrycode.toLowerCase();

+

+        GazetteerEntry entry = new GazetteerEntry();

+        entry.getScoreMap().put("lucene", sc);

+        entry.setIndexID(docId + "");

+        entry.setSource(d.get("gazsource"));

+        entry.setItemID(docId + "");

+        entry.setLatitude(Double.valueOf(d.get("latitude")));

+        entry.setLongitude(Double.valueOf(d.get("longitude")));

+        entry.setItemType(d.get("loctype"));

+        entry.setItemParentID(parentid);

+        entry.setProvinceCode(d.get("admincode"));

+        entry.setCountryCode(parentid);

+        entry.setItemName(d.get("placename"));

+        entry.setHierarchy(d.get("hierarchy"));

+        //keep every stored field of the document with the entry

+        for (IndexableField field : d.getFields()) {

+          entry.getIndexData().put(field.name(), d.get(field.name()));

+        }

+        /**

+         * normalize the lucene score by the longer of the search string and

+         * the matched place name

+         */

+        int maxLen = Math.max(searchString.length(), entry.getItemName().length());

+        Double normLev = Math.abs(1 - (sc / (double) maxLen));

+        /**

+         * only want hits at or above the cutoff. This should be a low thresh

+         * due to the use of the hierarchy field in the index

+         */

+        if (normLev.compareTo(scoreCutoff) >= 0) {

+          entry.getScoreMap().put("normlucene", normLev);

+          //make sure we don't produce a duplicate

+          if (!linkedData.contains(entry)) {

+            linkedData.add(entry);

+          }

+        }

+      }

+      /**

+       * add the records to the cache for this query; empty results are not

+       * cached

+       */

+      if (!linkedData.isEmpty()) {

+        GazetteerSearchCache.put(placeNameQueryString, linkedData);

+      }

+

+    } catch (IOException | ParseException ex) {

+      LOGGER.error(ex);

+    }

+

+    return linkedData;

+  }

 

   /**

    *

@@ -88,6 +200,7 @@
    *

    * @return

    */

+  @Deprecated

   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

     searchString = cleanInput(searchString);

@@ -198,6 +311,7 @@
    *

    * @return

    */

+    @Deprecated

   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

     searchString = cleanInput(searchString);

@@ -284,7 +398,8 @@
   }

 

   /**

-   * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string

+   * Replaces any noise chars with a space, and depending on configuration adds

+   * double quotes to the string

    *

    * @param input

    * @return

@@ -300,36 +415,66 @@
   }

 

   private void init() throws Exception {

-    if (usgsIndex == null) {

-      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

+//    if (usgsIndex == null) {

+//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

+//      if (indexloc.equals("")) {

+//        // System.out.println("USGS Gaz location not found");

+//        LOGGER.error(new Exception("USGS Gaz location not found"));

+//      }

+//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

+//

+//      scoreCutoff = Double.valueOf(cutoff);

+//      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));

+//      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);

+//      usgsIndex = new MMapDirectory(new File(indexloc));

+//      usgsReader = DirectoryReader.open(usgsIndex);

+//      usgsSearcher = new IndexSearcher(usgsReader);

+//      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+//    }

+//    if (geonamesIndex == null) {

+//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");

+//      if (indexloc.equals("")) {

+//        LOGGER.error(new Exception("Geonames Gaz location not found"));

+//

+//      }

+//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

+//      scoreCutoff = Double.valueOf(cutoff);

+//      geonamesIndex = new MMapDirectory(new File(indexloc));

+//      geonamesReader = DirectoryReader.open(geonamesIndex);

+//      geonamesSearcher = new IndexSearcher(geonamesReader);

+//      //TODO: a language code switch statement should be employed here at some point

+//      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+//

+//    }

+    if (opennlpIndex == null) {

+      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");

       if (indexloc.equals("")) {

-        // System.out.println("USGS Gaz location not found");

-        LOGGER.error(new Exception("USGS Gaz location not found"));

-      }

-      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

-

-      scoreCutoff = Double.valueOf(cutoff);

-      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));

-      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);

-      usgsIndex = new MMapDirectory(new File(indexloc));

-      usgsReader = DirectoryReader.open(usgsIndex);

-      usgsSearcher = new IndexSearcher(usgsReader);

-      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-    }

-    if (geonamesIndex == null) {

-      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");

-      if (indexloc.equals("")) {

-        LOGGER.error(new Exception("Geonames Gaz location not found"));

+        LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));

 

       }

-      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

-      scoreCutoff = Double.valueOf(cutoff);

-      geonamesIndex = new MMapDirectory(new File(indexloc));

-      geonamesReader = DirectoryReader.open(geonamesIndex);

-      geonamesSearcher = new IndexSearcher(geonamesReader);

+      //  String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

+      //  scoreCutoff = Double.valueOf(cutoff);

+      opennlpIndex = new MMapDirectory(new File(indexloc));

+      opennlpReader = DirectoryReader.open(opennlpIndex);

+      opennlpSearcher = new IndexSearcher(opennlpReader);

       //TODO: a language code switch statement should be employed here at some point

-      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+      opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

 

     }

   }

+

+  /**

+   * Builds the hierarchy part of the query for a multi-word search term by

+   * joining its whitespace separated tokens with " AND ".

+   *

+   * @param searchTerm the location name, as used for the placename query

+   * @return the tokens joined with " AND "; when the term holds no tokens, the

+   * result of cleanInput(searchTerm)

+   */

+  private String formatForHierarchy(String searchTerm) {

+    StringBuilder out = new StringBuilder();

+    //split on runs of whitespace: a single space split turns "a  b" into

+    //"a AND  AND b", which the lucene query parser rejects

+    for (String part : searchTerm.trim().split("\\s+")) {

+      if (part.isEmpty()) {

+        continue;

+      }

+      if (out.length() > 0) {

+        out.append(" AND ");

+      }

+      out.append(part);

+    }

+    //nothing to join (blank input): fall back to the cleaned input

+    return out.length() == 0 ? cleanInput(searchTerm) : out.toString();

+  }

+

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index b147d27..367a082 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

@@ -15,6 +15,11 @@
  */

 package opennlp.addons.geoentitylinker;

 

+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;

+import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;

+import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;

+import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;

+import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;

 import java.util.ArrayList;

 import java.util.List;

 import java.util.Map;

@@ -33,11 +38,11 @@
  */

 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

 

-  private CountryContext countryContext;

+  private AdminBoundaryContextGenerator countryContext;

   private Map<String, Set<Integer>> countryMentions;

   private EntityLinkerProperties linkerProperties;

   private GazetteerSearcher gazateerSearcher;

-  private List<LinkedEntityScorer> scorers = new ArrayList<>();

+  private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();

 

   @Override

   public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {

@@ -46,8 +51,8 @@
     if (linkerProperties == null) {

       throw new IllegalArgumentException("EntityLinkerProperties cannot be null");

     }

-    countryMentions = countryContext.regexfind(doctext);

-

+    //countryMentions = countryContext.regexfind(doctext);

+    AdminBoundaryContext context = countryContext.getContext(doctext);

     for (int s = 0; s < sentences.length; s++) {

       Span[] names = namesBySentence[s];

       String[] tokens = tokensBySentence[s];

@@ -55,51 +60,27 @@
 

       for (int i = 0; i < matches.length; i++) {

 

-        /**

-         * nga gazateer is for other than US placenames,don't want to use it if

-         * US is the only country mentioned in the doc

-         *

-         */

         ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();

-        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)

-                || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {

-

-          if (!countryMentions.keySet().isEmpty()) {

-            for (String code : countryMentions.keySet()) {

-              if (!code.equals("us")) {

-                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));

-              }

-            }

-          } else {

-            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));

-

+        if (!context.getWhereClauses().isEmpty()) {

+          for (String whereclause : context.getWhereClauses()) {

+            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));

           }

-

+        }else{//this means there were no where clauses generated so the where clause will default to look at the entire index

+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));

         }

-        ArrayList<BaseLink> usgsEntries = new ArrayList<>();

-        if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {

-          //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);

-          usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));

-        }

-        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd(), "location",names[i].getProb());

-    

-

-        if (!usgsEntries.isEmpty()) {

-          geoSpan.getLinkedEntries().addAll(usgsEntries);

-          geoSpan.setSearchTerm(matches[i]);

-        }

-

-        if (!geoSpan.getLinkedEntries().isEmpty()) {

-          geoSpan.setSearchTerm(matches[i]);

-          geoSpan.setSentenceid(s);

-          spans.add(geoSpan);

-        }

+        //start generating queries

+        LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);

+        newspan.setSearchTerm(matches[i]);

+        newspan.setLinkedEntries(geoNamesEntries);

+        newspan.setSentenceid(s);

+        spans.add(newspan);

       }

+

     }

 

     if (!scorers.isEmpty()) {

       for (LinkedEntityScorer scorer : scorers) {

-        scorer.score(spans, doctext, sentences, linkerProperties, countryContext);

+        scorer.score(spans, doctext, sentences, linkerProperties, context);

       }

     }

 

@@ -111,6 +92,8 @@
       scorers.add(new GeoHashBinningScorer());

       scorers.add(new CountryProximityScorer());

       scorers.add(new ModelBasedScorer());

+      scorers.add(new FuzzyStringMatchScorer());

+     // scorers.add(new ProvinceProximityScorer());

     }

   }

 

@@ -118,7 +101,7 @@
   public void init(EntityLinkerProperties properties) {

     try {

       this.linkerProperties = properties;

-      countryContext = new CountryContext(this.linkerProperties);

+      countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

       loadScorers();

     } catch (Exception ex) {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
new file mode 100644
index 0000000..98dd7b5
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java

@@ -0,0 +1,227 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.indexing;

+

+import java.io.File;

+import java.io.FileNotFoundException;

+import java.util.ArrayList;

+

+import org.apache.lucene.analysis.Analyzer;

+import org.apache.lucene.analysis.standard.StandardAnalyzer;

+import org.apache.lucene.analysis.util.CharArraySet;

+import org.apache.lucene.index.IndexWriter;

+import org.apache.lucene.index.IndexWriterConfig;

+import org.apache.lucene.store.Directory;

+import org.apache.lucene.store.MMapDirectory;

+import org.apache.lucene.util.Version;

+

+/**

+ *

+ * Creates the lucene gazetteer index (USGS, Geonames and region data) for use in GeoEntityLinker

+ */

+public class GazetteerIndexer {

+

+  public static void main(String[] args) {

+    try {

+      GazetteerIndexer i = new GazetteerIndexer();

+      i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),

+              new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),

+              new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),

+              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),

+              new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),

+              new File("C:\\temp\\gazetteers\\"),

+              new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),

+              new File("C:\\temp\\gazetteers\\regions.txt"));

+    } catch (Exception ex) {

+      ex.printStackTrace();

+    }

+  }

+

+  public GazetteerIndexer() {

+

+  }

+

+  public static interface Separable {

+

+    String getSeparator();

+  }

+

+  public enum GazType implements Separable {

+

+    GEONAMES {

+              @Override

+              public String toString() {

+                return "/opennlp_geoentitylinker_geonames_idx";

+              }

+

+              @Override

+              public String getSeparator() {

+                return "\t";

+              }

+            },

+    USGS {

+              @Override

+              public String toString() {

+                return "/opennlp_geoentitylinker_usgsgaz_idx";

+              }

+

+              @Override

+              public String getSeparator() {

+                return "\\|";

+              }

+            }

+  }

+

+  /**

+   *

+   * @param geonamesData the actual Geonames gazetteer data downloaded from

+   * here: http://download.geonames.org/export/dump/ then click on this

+   * link 'allCountries.zip'

+   * @param geoNamesCountryInfo the countryinfo lookup table that can be

+   * downloaded from here

+   * http://download.geonames.org/export/dump/countryinfo.txt

+   * @param geonamesAdmin1CodesASCII The lookup data for the province names for

+   * each place found here:

+   * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the

+   * table view, and copy results into a text file. Make sure the tab delimitted

+   * format is maintained.

+   * @param usgsDataFile the actual USGS gazetteer downloaded from here:

+   * http://geonames.usgs.gov/domestic/download_data.htm click on the

+   * national_file####.zip link to get all the most recent features

+   *

+   * @param usgsGovUnitsFile go to here:

+   * http://geonames.usgs.gov/domestic/download_data.htm in the section titled

+   * "Topical Gazetteers -- File Format" click on the drop down list and select

+   * "Government Units". The downloaded file is what you need for this param.

+   * @param outputIndexDir where you want the final index. Must be a directory,

+   * not an actual file.

+   * @param outputCountryContextFile The output countrycontext file. THis is a

+   * very important file used inside the GeoEntityLinker to assist in toponym

+   * resolution.

+   * @param regionsFile this file contains a list of regions in the following

+   * format: tab delimitted text with index 0 as the name of the region, index 1

+   * as the longitude, and index 2 as the latitude

+   * @throws Exception

+   */

+  public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,

+          File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {

+    if (!outputIndexDir.isDirectory()) {

+      throw new IllegalArgumentException("outputIndexDir must be a directory.");

+    }

+    if (!geonamesData.exists()) {

+      throw new FileNotFoundException("geonames data file does not exist");

+    }

+    if (!geoNamesCountryInfo.exists()) {

+      throw new FileNotFoundException("geoNamesCountryCodes data file does not exist");

+    }

+    if (!geonamesAdmin1CodesASCII.exists()) {

+      throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist");

+    }

+

+    if (!usgsDataFile.exists()) {

+      throw new FileNotFoundException("usgsDataFile data file does not exist");

+    }

+    if (!usgsGovUnitsFile.exists()) {

+      throw new FileNotFoundException("usgsGovUnitsFile data file does not exist");

+    }

+    if (!outputIndexDir.exists()) {

+      throw new FileNotFoundException("outputIndexDir data file does not exist");

+    }

+    if (!regionsFile.exists()) {

+      throw new FileNotFoundException("regionsFile data file does not exist");

+    }

+

+    String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";

+    Directory index = new MMapDirectory(new File(indexloc));

+

+    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

+

+    IndexWriter w = new IndexWriter(index, config);

+    USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);

+

+    GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);

+

+    RegionProcessor.process(regionsFile, outputCountryContextFile, w);

+    w.commit();

+    w.close();

+    System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" + outputCountryContextFile.getPath() + "' to entitylinker.properties file");

+  }

+

+  /**

+   * Opens an index writer on the type specific index directory below

+   * outputIndexDir, commits and closes it. The calls that used to read

+   * gazetteerInputData are disabled, so no documents are added here.

+   *

+   * @param outputIndexDir a DIRECTORY path where you would like to store the

+   * output lucene indexes

+   * @param gazetteerInputData the file, "as is" that was downloaded from the

+   * USGS and GEONAMES website; currently not read

+   * @param type indicates whether the data is USGS or GEONAMES format

+   * @throws IllegalArgumentException if outputIndexDir is not a directory

+   * @throws Exception if the index cannot be opened or written

+   * @deprecated use

+   * {@link #index(File, File, File, File, File, File, File, File)} instead

+   */

+  @Deprecated

+  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {

+    if (!outputIndexDir.isDirectory()) {

+      throw new IllegalArgumentException("outputIndexDir must be a directory.");

+    }

+

+    String indexloc = outputIndexDir + type.toString();

+    Directory index = new MMapDirectory(new File(indexloc));

+    try {

+      Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList<String>(), true));

+      IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

+

+      IndexWriter w = new IndexWriter(index, config);

+      try {

+        //no gazetteer processing happens in this deprecated entry point

+        w.commit();

+      } finally {

+        //release the writer even when the commit fails

+        w.close();

+      }

+    } finally {

+      index.close();

+    }

+

+  }

+//

+//  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {

+//    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

+//    List<String> fields = new ArrayList<>();

+//    int counter = 0;

+//    System.out.println("reading gazetteer data from file...........");

+//    while (reader.read() != -1) {

+//      String line = reader.readLine();

+//      String[] values = line.split(type.getSeparator());

+//      if (counter == 0) {

+//        for (String columnName : values) {

+//          fields.add(columnName.replace("»¿", "").trim());

+//        }

+//

+//      } else {

+//        Document doc = new Document();

+//        for (int i = 0; i < fields.size() - 1; i++) {

+//          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));

+//        }

+//        w.addDocument(doc);

+//      }

+//      counter++;

+//      if (counter % 100000 == 0) {

+//        w.commit();

+//        System.out.println(counter + " .........committed to index..............");

+//      }

+//

+//    }

+//    w.commit();

+//    System.out.println("Completed indexing gaz! index name is: " + type.toString());

+//  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
similarity index 94%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
index 991081a..63cb88c 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.indexing;

 

 import java.io.BufferedOutputStream;

 import java.io.File;

@@ -28,6 +28,8 @@
 import java.util.HashMap;

 import java.util.Map;

 import java.util.Set;

+import opennlp.addons.geoentitylinker.AdminBoundaryContextGenerator;

+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;

 

 import opennlp.tools.doccat.DoccatModel;

 import opennlp.tools.doccat.DocumentCategorizerME;

@@ -41,6 +43,7 @@
  *

  * Tools for setting up GeoEntityLinker gazateers and doccat scoring model

  */

+@Deprecated

 public class GeoEntityLinkerSetupUtils {

   private static final int RADIUS = 200;

   public static ModelBasedScorer scorer;

@@ -86,7 +89,7 @@
    * @throws IOException

    */

   public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {

-    CountryContext context = new CountryContext(properties);

+    AdminBoundaryContextGenerator context = new AdminBoundaryContextGenerator(properties);

     FileWriter writer = new FileWriter(annotationOutFile, true);

     System.out.println("processing " + documents.size() + " documents");

     for (String docText : documents) {

@@ -131,7 +134,7 @@
    * @param radius

    * @return

    */

-  private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {

+  private static Map<String, ArrayList<String>> modelCountryContext(String docText, AdminBoundaryContextGenerator additionalContext, int radius) {

     Map<String, ArrayList< String>> featureBags = new HashMap<>();

     Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();

     /**


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
new file mode 100644
index 0000000..73ff14e
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java

@@ -0,0 +1,135 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.indexing;

+

+/**

+ *

+ * @author mgiaconia

+ */

+import java.io.*;

+import java.net.*;

+import java.util.Enumeration;

+import java.util.zip.*;

+public class GeonamesFileDownloader {

+

+  final static int size = 1024;

+  private static final String ALL_COUNTRIES = "http://download.geonames.org/export/dump/ZM.zip";

+  private static final String COUNTRY_INFO = "";

+  private static final String ADM1_LOOKUP = "";

+

+  public static void main(String[] args) {

+    downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");

+  }

+

+  public static void downloadGeonamesFiles(String outputFileName, String outputDir) {

+    String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);

+

+    unzipMyZip(fileDownload, outputDir);

+

+    fileDownload(COUNTRY_INFO, outputDir);

+    fileDownload(ADM1_LOOKUP, outputDir);

+

+  }

+

+  public static final void writeFile(InputStream in, OutputStream out)

+          throws IOException {

+    byte[] buffer = new byte[1024];

+    int len;

+

+    while ((len = in.read(buffer)) != 0) {

+      out.write(buffer, 0, len);

+    }

+

+    in.close();

+    out.close();

+  }

+

+  public static void unzipMyZip(String zipFileName,

+          String directoryToExtractTo) {

+    Enumeration entriesEnum;

+    ZipFile zip;

+    try {

+      zip = new ZipFile(zipFileName);

+      entriesEnum = zip.entries();

+      while (entriesEnum.hasMoreElements()) {

+        ZipEntry entry = (ZipEntry) entriesEnum.nextElement();

+        InputStream is = zip.getInputStream(entry); // get the input stream

+        OutputStream os = new java.io.FileOutputStream(new File(zipFileName.replace("\\.zip", ".txt")));

+        byte[] buf = new byte[4096];

+        int r;

+        while ((r = is.read(buf)) != -1) {

+          os.write(buf, 0, r);

+        }

+        os.close();

+        is.close();

+      }

+    } catch (IOException ioe) {

+      System.err.println("Some Exception Occurred:");

+      ioe.printStackTrace();

+      return;

+    }

+  }

+

+  public static String fileUrl(String fAddress, String localFileName, String destinationDir) {

+    OutputStream outStream = null;

+    URLConnection uCon = null;

+    String filename = destinationDir + "\\" + localFileName;

+    InputStream is = null;

+    try {

+      URL Url;

+      byte[] buf;

+      int ByteRead, ByteWritten = 0;

+      Url = new URL(fAddress);

+      outStream = new BufferedOutputStream(new FileOutputStream(destinationDir + "\\" + localFileName));

+

+      uCon = Url.openConnection();

+      is = uCon.getInputStream();

+      buf = new byte[size];

+      while ((ByteRead = is.read(buf)) != -1) {

+        outStream.write(buf, 0, ByteRead);

+        ByteWritten += ByteRead;

+      }

+      System.out.println("Downloaded Successfully.");

+      System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + ByteWritten);

+    } catch (Exception e) {

+      e.printStackTrace();

+    } finally {

+      try {

+        is.close();

+        outStream.close();

+      } catch (IOException e) {

+        e.printStackTrace();

+      }

+    }

+    return filename;

+  }

+

+  public static String fileDownload(String fAddress, String destinationDir) {

+    int slashIndex = fAddress.lastIndexOf('/');

+    int periodIndex = fAddress.lastIndexOf('.');

+

+    String fileName = fAddress.substring(slashIndex + 1);

+    String retFileName = "";

+    if (periodIndex >= 1 && slashIndex >= 0

+            && slashIndex < fAddress.length() - 1) {

+      retFileName = fileUrl(fAddress, fileName, destinationDir);

+    } else {

+      System.err.println("path or file name.");

+    }

+    return retFileName;

+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
new file mode 100644
index 0000000..bd73bb9
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java

@@ -0,0 +1,278 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.indexing;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.FileWriter;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import opennlp.addons.geoentitylinker.AdminBoundary;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.document.Field;

+import org.apache.lucene.document.TextField;

+import org.apache.lucene.index.IndexWriter;

+

+/**

+ *

+ * @author mgiaconia

+ */

+public class GeonamesProcessor {

+

+  /**
+   * Runs the whole Geonames import: loads the country and province lookups,
+   * indexes the gazetteer file and appends the provinces to the country
+   * context file.
+   *
+   * @param countryCodesLookupFile lookup file mapping country codes to names
+   * @param adm1CodesLookupFile lookup file with the province (admin1) codes
+   * @param geonamesGazetteerFile the Geonames gazetteer data to index
+   * @param outputCountryContextFile country context file to append to
+   * @param w the lucene index writer
+   * @throws Exception if reading the gazetteer or writing the index fails
+   */
+  public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception {
+    Map<String, String> countryNamesByCode = getCountryCodes(countryCodesLookupFile);
+    // admin2 (admin2Codes.txt) data is not loaded; only provinces are resolved
+    Map<String, AdminBoundary> provinces = getProvData(adm1CodesLookupFile, countryNamesByCode);
+
+    readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, provinces, countryNamesByCode, w);
+
+    // now append to the country context file
+    writeCountryContextFile(outputCountryContextFile, provinces);
+  }

+

+  /**
+   * Creates a processor. All operations of this class are static, so the
+   * instance holds no state.
+   */
+  public GeonamesProcessor() {
+    super();
+  }

+

+  /**
+   * Reads the Geonames province (admin1) lookup file into a map keyed by
+   * {@code countrycode.provincecode} (lower case). Read errors are printed and
+   * whatever was parsed up to that point is returned.
+   *
+   * @param adm1CodesLookupFile tab separated file with exactly four columns
+   * per line; column 0 is the combined code, column 2 the province name
+   * @param ccodes country names keyed by lower case country code
+   * @return the province data that could be read
+   */
+  private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) {
+    System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath());
+
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    Set<String> nullcodes = new HashSet<>();
+    // try-with-resources closes the reader even when a malformed line aborts the read
+    try (BufferedReader reader = new BufferedReader(new FileReader(adm1CodesLookupFile))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split("\t");
+        if (values.length != 4) {
+          throw new IOException("improperly formatted province lookup file");
+        }
+
+        String ccode = values[0].toLowerCase();
+        String pcode = "";
+        String[] split = ccode.split("\\.");
+        if (split.length == 2) {
+          ccode = split[0];
+          pcode = split[1];
+        }
+        String pname = values[2];
+
+        // when the first part starts with a digit the two parts are swapped
+        if (ccode.matches("[0-9].*")) {
+          String code = ccode;
+          ccode = pcode;
+          pcode = code;
+        }
+
+        String cname = ccodes.get(ccode);
+        if (cname == null) {
+          nullcodes.add(ccode);
+        }
+        outmap.put(ccode + "." + pcode, new AdminBoundary(ccode, pcode, pname, cname));
+      }
+      System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data.");
+      // only reported when the whole file was read
+      System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath());
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+
+    return outmap;
+  }

+

+  /**
+   * Reads the country lookup file into a map from lower case country code
+   * (column 0) to lower case country name (column 4). All lines up to and
+   * including the one starting with {@code #ISO} are skipped. Read errors are
+   * printed and the entries parsed so far are returned.
+   *
+   * @param countryContextFile the tab separated country lookup file
+   * @return country names keyed by lower case country code
+   */
+  private static Map<String, String> getCountryCodes(File countryContextFile) {
+    Map<String, String> ccs = new HashMap<>();
+    try (BufferedReader reader = new BufferedReader(new FileReader(countryContextFile))) {
+      String line;
+      boolean start = false;
+      while ((line = reader.readLine()) != null) {
+        if (!start) {
+          // the "#ISO" row only names the columns; data starts on the next line
+          start = line.toLowerCase().startsWith("#iso\t");
+          continue;
+        }
+        String[] values = line.split("\t");
+        if (values.length < 5) {
+          // blank or truncated row without a country name column
+          continue;
+        }
+
+        String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
+        String cname = values[4].toLowerCase();
+        if (!ccode.equals("")) {
+          ccs.put(ccode, cname);
+        }
+      }
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    // Fallback entries, presumably for codes missing from the lookup file. Keys and
+    // values are lower case like every other entry, because all lookups use lower case codes.
+    if (!ccs.containsKey("ss")) {
+      ccs.put("ss", "south sudan");
+    }
+    if (!ccs.containsKey("cs")) {
+      ccs.put("cs", "kosovo");
+    }
+    return ccs;
+  }

+

+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {

+    // FileWriter writer = null;

+    try (FileWriter writer = new FileWriter(outfile, true)) {

+

+      for (String admKey : adms.keySet()) {

+        AdminBoundary adm = adms.get(admKey);

+        if (adm == null) {

+          continue;

+        }

+        String province = adm.getProvinceName();

+        String country = adm.getCountryName();

+

+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n";

+        writer.write(line);

+        // System.out.println(line);

+

+      }

+      writer.close();

+    } catch (IOException ex) {

+      ex.printStackTrace();

+    }

+    System.out.println("successfully wrote Geonames entries to country oontext file");

+  }

+

+ /**

+  * 

+  * @param gazateerInputData the Geonames allCounties.txt file

+  * @param type the types of gaz entry, usgs, geonames, or regions

+  * @param adms the province info

+  * @param countrycodes the country code info

+  * @param w the lucene index writer

+  * @throws Exception 

+  */

+  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {

+

+    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

+    String[] fieldStrings = new String[]{

+      "geonameid",

+      "name",

+      "asciiname",

+      "alternatenames",

+      "latitude",

+      "longitude",

+      "feature_class",

+      "feature_code",

+      "country code",

+      "cc2",

+      "admin1_code",

+      "admin2_code",

+      "admin3_code",

+      "admin4_code",

+      "population",

+      "elevation",

+      "dem ",

+      "timezone",

+      "modification_date"};

+

+    List<String> fields = Arrays.asList(fieldStrings);

+    int counter = 0;

+    System.out.println("reading gazetteer data from file...........");

+    String line = "";

+    while ((line = reader.readLine()) != null) {

+      String[] values = line.split(type.getSeparator());

+

+      Document doc = new Document();

+      String admincode = values[10].toLowerCase();

+      String ccode = values[8].toLowerCase();

+      if (ccode.contains(",")) {

+        String[] codes = ccode.split(",");

+        if (codes.length > 0) {

+          ccode = codes[0];

+        }

+      }

+      AdminBoundary adm = adms.get(ccode + "." + admincode);

+

+      String placeName = values[2];

+      String lat = values[4];

+      String lon = values[5];

+      String dsg = values[7];

+      String id = values[0];

+      String concatIndexEntry = "";

+      if (adm != null) {

+        concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;

+      } else {

+        //there is no admin info, but we can still use the countrycode to concat the country name

+        String n = countrycodes.get(ccode);

+        if (n != null) {

+          concatIndexEntry = n + ", " + placeName;

+        } else {

+          ///don't want a single token hierarchy entry.

+          concatIndexEntry = "";

+        }

+      }

+      if (ccode == null) {

+        System.out.println("naughty country code");

+      }

+      for (int i = 0; i < fields.size() - 1; i++) {

+        doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));

+

+      }

+

+      /**

+       * add standard fields to the index

+       */

+      doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));

+      doc.add(new TextField("placename", placeName, Field.Store.YES));

+      doc.add(new TextField("latitude", lat, Field.Store.YES));

+      doc.add(new TextField("longitude", lon, Field.Store.YES));

+      doc.add(new TextField("loctype", dsg, Field.Store.YES));

+      doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));

+      doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));

+      doc.add(new TextField("countycode", "", Field.Store.YES));

+

+      doc.add(new TextField("locid", id, Field.Store.YES));

+      doc.add(new TextField("gazsource", "geonames", Field.Store.YES));

+      w.addDocument(doc);

+

+      counter++;

+      if (counter % 100000 == 0) {

+        w.commit();

+        System.out.println(counter + " .........Geonames entries committed to index..............");

+      }

+

+    }

+

+    System.out.println("Completed indexing gaz! index name is: " + type.toString());

+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
new file mode 100644
index 0000000..3b667cf
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java

@@ -0,0 +1,119 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.indexing;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.FileWriter;

+import java.util.ArrayList;

+import java.util.List;

+import java.util.Map;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.addons.geoentitylinker.AdminBoundary;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.document.Field;

+import org.apache.lucene.document.TextField;

+import org.apache.lucene.index.IndexWriter;

+

+/**

+ *

+ * @author mgiaconia

+ */

+public class RegionProcessor {

+

+  public static void main(String[] args) {

+    RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);

+  }

+

+  /**
+   * Indexes the regions file and appends its entries to the country context
+   * file. Any failure is printed and not propagated to the caller.
+   *
+   * @param regionsFile the file that stores Region references. The format of
+   * this file is tab delimited text with index 0 as the name of the region,
+   * index 1 as the longitude, and index 2 as the latitude
+   * @param outputCountryContextfile the country context file shared by all
+   * indexing processors
+   * @param w the lucene index writer; may be null, in which case nothing is
+   * indexed
+   */
+  public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+    try {
+      readFile(regionsFile, outputCountryContextfile, w);
+    } catch (Exception failure) {
+      // best effort: report the problem and return normally
+      failure.printStackTrace();
+    }
+  }

+

+  /**
+   * Reads the tab separated regions file, indexes one document per region
+   * (when an index writer is given) and appends one line per region to the
+   * country context file. The first line is treated as a header and skipped;
+   * lines with fewer than three columns are reported and skipped.
+   *
+   * @param gazateerInputData the regions file: name, longitude, latitude
+   * @param outputCountryContextfile the country context file to append to
+   * @param w the lucene index writer, or null to skip indexing
+   * @throws Exception if reading, indexing or writing fails
+   */
+  public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+    List<String> ccfileentries = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from Regions file...........");
+    // try-with-resources closes the reader also when indexing fails
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split("\t");
+        if (counter == 0) {
+          // header row, nothing to index
+        } else if (values.length < 3) {
+          System.err.println("skipping malformed region line: " + line);
+        } else {
+          String placeName = values[0];
+          String lat = values[2];
+          String lon = values[1];
+          String dsg = "region";
+          // the line number doubles as a synthetic id and country code
+          String id = "rg" + counter;
+
+          //countrycontext file format
+          // US	KY	131	United States	Kentucky	Leslie
+          // NOTE(review): the literal below is the name of AdminBoundary's constant, whose value is "NO_DATA_FOUND" - confirm which is intended
+          ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");
+
+          if (w != null) {
+            Document doc = new Document();
+            doc.add(new TextField("hierarchy", placeName, Field.Store.YES));
+            doc.add(new TextField("placename", placeName, Field.Store.YES));
+            doc.add(new TextField("latitude", lat, Field.Store.YES));
+            doc.add(new TextField("longitude", lon, Field.Store.YES));
+            doc.add(new TextField("loctype", dsg, Field.Store.YES));
+            doc.add(new TextField("admincode", "", Field.Store.YES));
+            doc.add(new TextField("countrycode", id, Field.Store.YES));
+            doc.add(new TextField("countycode", "", Field.Store.YES));
+
+            doc.add(new TextField("locid", id, Field.Store.YES));
+            doc.add(new TextField("gazsource", "region", Field.Store.YES));
+            w.addDocument(doc);
+          }
+        }
+        counter++;
+      }
+    }
+    if (w != null) {
+      w.commit();
+    }
+    try (FileWriter writer = new FileWriter(outputCountryContextfile, true)) {
+      for (String entry : ccfileentries) {
+        writer.write(entry);
+      }
+    }
+    System.out.println("successfully wrote Region entries to country oontext file");
+    System.out.println("Completed indexing regions!");
+  }

+

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
new file mode 100644
index 0000000..cdb5ed2
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java

@@ -0,0 +1,188 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.indexing;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.FileWriter;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import java.util.logging.Level;

+import java.util.logging.Logger;

+import opennlp.addons.geoentitylinker.AdminBoundary;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.document.Field;

+import org.apache.lucene.document.TextField;

+

+import org.apache.lucene.index.IndexWriter;

+

+/**

+ *

+ * @author mgiaconia

+ */

+public class USGSProcessor {

+

+  /**
+   * Ad-hoc entry point working on hard-coded local USGS files. Failures are
+   * logged at SEVERE level.
+   *
+   * @param args ignored
+   */
+  public static void main(String[] args) {
+    try {
+      File govUnitsFile = new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt");
+      File nationalFile = new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt");
+      // process() loads the government units itself, so they are not parsed a second time here
+      process(govUnitsFile, nationalFile, null, null);
+    } catch (Exception ex) {
+      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+  }

+

+  /**
+   * Runs the whole USGS import: loads the state/county lookup, indexes the
+   * gazetteer file and appends the lookup entries to the country context
+   * file.
+   *
+   * @param lookupData the USGS government units lookup file
+   * @param usgsGazDataFile the USGS gazetteer data to index
+   * @param outputCountryContextfile the country context file to append to
+   * @param w the lucene index writer
+   * @throws Exception if reading the gazetteer or writing the index fails
+   */
+  public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception {
+    final GazetteerIndexer.GazType usgs = GazetteerIndexer.GazType.USGS;
+    Map<String, AdminBoundary> boundaries = getProvData(lookupData, usgs);
+    readFile(usgsGazDataFile, w, usgs, boundaries);
+    writeCountryContextFile(outputCountryContextfile, boundaries);
+  }

+

+  /**
+   * Indexes every data line of the USGS gazetteer file. The first line
+   * supplies the column names. Each further line becomes one document with
+   * the raw columns plus the standard fields (hierarchy, placename, latitude,
+   * longitude, loctype, admincode, countrycode, countycode, locid,
+   * gazsource). Rows without a matching lookup entry are indexed with empty
+   * admin fields; rows with fewer than eleven columns are reported and
+   * skipped.
+   *
+   * @param gazateerInputData the USGS gazetteer file
+   * @param w the lucene index writer, or null to only parse the file
+   * @param type the type of gaz entry; supplies the column separator
+   * @param lookupMap admin boundaries keyed by {@code statecode.countycode}
+   * @throws Exception if the file cannot be read or the index cannot be written
+   */
+  public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
+
+    List<String> fields = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from USGS file...........");
+    // try-with-resources closes the reader also when indexing fails half way
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+
+        String[] values = line.split(type.getSeparator());
+        if (counter == 0) {
+          for (String columnName : values) {
+            fields.add(columnName.replace("»¿", "").trim());
+          }
+        } else if (values.length <= 10) {
+          // columns 0..10 are read unconditionally below
+          System.err.println("skipping malformed USGS line: " + line);
+        } else {
+          Document doc = new Document();
+          // as before the last header column is not copied; short rows stop at their last value
+          int columnCount = Math.min(fields.size() - 1, values.length);
+          for (int i = 0; i < columnCount; i++) {
+            doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+          }
+          String placeName = values[1];
+          String lat = values[9];
+          String lon = values[10];
+          String dsg = values[2];
+          String id = values[0];
+
+          // the lookup key combines column 3 with column 6, matching the keys built by getProvData
+          String ccode = values[6];
+          String admincode = values[3];
+          AdminBoundary get = lookupMap.get(admincode + "." + ccode);
+
+          String hierarchy = "";
+          String adminCodeValue = "";
+          String countryCodeValue = "";
+          String countyCodeValue = "";
+          // rows without a lookup entry keep empty admin fields instead of failing with a NullPointerException
+          if (get != null) {
+            String countyname = "";
+            // NOTE(review): the literal is the name of AdminBoundary's constant, whose value is "NO_DATA_FOUND" - confirm which is intended
+            if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
+              countyname = get.getCountyName();
+            }
+            String countyCode = get.getCountyCode();
+            hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
+            adminCodeValue = (get.getCountryCode() + "." + get.getProvCode()).toLowerCase();
+            countryCodeValue = get.getCountryCode().toLowerCase();
+            countyCodeValue = (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase();
+          }
+
+          doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+          doc.add(new TextField("placename", placeName, Field.Store.YES));
+          doc.add(new TextField("latitude", lat, Field.Store.YES));
+          doc.add(new TextField("longitude", lon, Field.Store.YES));
+          doc.add(new TextField("loctype", dsg, Field.Store.YES));
+          doc.add(new TextField("admincode", adminCodeValue, Field.Store.YES));
+          doc.add(new TextField("countrycode", countryCodeValue, Field.Store.YES));
+          doc.add(new TextField("countycode", countyCodeValue, Field.Store.YES));
+
+          doc.add(new TextField("locid", id, Field.Store.YES));
+          doc.add(new TextField("gazsource", "usgs", Field.Store.YES));
+          if (w != null) {
+            w.addDocument(doc);
+          }
+        }
+        counter++;
+        if (w != null && counter % 100000 == 0) {
+          w.commit();
+          System.out.println(counter + " .........USGS entries committed to index..............");
+        }
+
+      }
+    }
+    if (w != null) {
+      w.commit();
+    }
+    System.out.println("Completed indexing USGS gaz!");
+  }

+

+  /**
+   * Reads the USGS government units file into a map keyed by
+   * {@code statecode.countycode}. The first line is a header and is skipped;
+   * lines with fewer than nine columns are reported and skipped. Read errors
+   * are printed and whatever was parsed up to that point is returned.
+   *
+   * @param govUnitsFile the USGS government units file
+   * @param type the type of gaz entry; supplies the column separator
+   * @return the admin boundaries that could be read
+   */
+  private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
+    System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+
+    // try-with-resources closes the reader even when reading fails
+    try (BufferedReader reader = new BufferedReader(new FileReader(govUnitsFile))) {
+      boolean header = true;
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (header) {
+          header = false;
+          continue;
+        }
+        String[] values = line.split(type.getSeparator());
+        if (values.length < 9) {
+          // columns up to index 8 are read below
+          System.err.println("skipping malformed USGS government unit line: " + line);
+          continue;
+        }
+        String countyCode = values[2];
+        String countyName = values[3];
+        String stateCode = values[5];
+        String stateName = values[6];
+        String countryCode = values[7];
+        String countryName = values[8];
+        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName);
+        outmap.put(stateCode + "." + countyCode, adminBoundary);
+      }
+      // only reported when the whole file was read
+      System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+
+    return outmap;
+  }

+

+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {

+    // FileWriter writer = null;

+    try (FileWriter writer = new FileWriter(outfile, true)) {

+

+      for (String admkey : adms.keySet()) {

+        AdminBoundary adm = adms.get(admkey);

+        if (adm == null) {

+          continue;

+        }

+        String province = adm.getProvinceName();

+        String country = adm.getCountryName();

+        /**

+         * this is the standard format of the country context file... Geonames

+         * data will have an empty string for the county

+         */

+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\n";

+        writer.write(line);

+      ///  System.out.println(line);

+

+      }

+      writer.close();

+    } catch (IOException ex) {

+      Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);

+    }

+    System.out.println("successfully wrote USGS entries to country oontext file");

+  }

+}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
similarity index 96%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index 3dbf5d1..cc34b1a 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import java.util.ArrayList;

 import java.util.HashMap;

@@ -22,6 +22,7 @@
 import java.util.Map;

 import java.util.Set;

 import java.util.TreeSet;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

@@ -29,20 +30,20 @@
 

 /**

  * Scores toponyms based on their proximity to a country mention. Based on the

- * heuristic that typonymn mentions are more likely close to their parent

+ * heuristic that toponym mentions are more likely close to their parent

  * country mentions. For instance, if the toponym Berlin is mentioned near an

  * indicator of Germany, it is more likely to be Berlin Germany than Berlin

- * Connecticut.

+ * Connecticut (if Connecticut is mentioned further down in the article).

  *

  *

  */

-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {

+public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {

 

   private Map<String, Set<String>> nameCodesMap;

   String dominantCode = "";

 

   @Override

-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

 

     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
similarity index 64%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index 36e2751..9101829 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java

@@ -13,43 +13,50 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

+import java.util.ArrayList;

 import java.util.HashSet;

 import java.util.List;

 import java.util.Set;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

+import opennlp.addons.geoentitylinker.GazetteerEntry;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

-import opennlp.tools.ngram.NGramGenerator;

 import opennlp.tools.util.Span;

 

 /**

  *

  * Generates scores based on string comparisons (Levenshtein and Dice)

  */

-public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {

+public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> {

 

   @Override

-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

     for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {

       for (BaseLink link : linkedSpan.getLinkedEntries()) {

-        Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);

-        link.getScoreMap().put("dice", dice);

-        Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));

-        link.getScoreMap().put("levenshtein", ld);

+        if (link instanceof GazetteerEntry) {

+          GazetteerEntry entry = (GazetteerEntry) link;

+          String hierarchy = entry.getHierarchy();

+          if (hierarchy != null) {

+            Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);

+            link.getScoreMap().put("hierarchydicecoef", dice);

+            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase());

+            link.getScoreMap().put("hierarchylevenshtein", ld);

+          }

+        }

       }

     }

 

-

   }

 

   /**

    * Generates a score based on an overlap of nGrams between two strings using

    * the DiceCoefficient technique.

    *

-   * @param s1     first string

-   * @param s2     second string

+   * @param s1 first string

+   * @param s2 second string

    * @param nGrams number of chars in each gram

    * @return

    */

@@ -57,8 +64,22 @@
     if (s1.equals("") || s1.equals("")) {

       return 0d;

     }

-    List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");

-    List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");

+    List<String> s1Grams = new ArrayList<>();

+    List<String> s2Grams = new ArrayList<>();

+    String[] split1 = s1.split("[ ,]");

+    for (String token : split1) {

+      if (token.trim().equals("")) {

+        continue;

+      }

+      s1Grams.add(token);

+    }

+    String[] split2 = s2.split("[ ,]");

+    for (String token : split2) {

+      if (token.trim().equals("")) {

+        continue;

+      }

+      s2Grams.add(token);

+    }

 

     Set<String> overlap = new HashSet<String>(s1Grams);

     overlap.retainAll(s2Grams);


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
similarity index 87%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
index 97a5d07..d3494e0 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java

@@ -13,11 +13,13 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import java.util.ArrayList;

 import java.util.List;

 import java.util.Map;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

+import opennlp.addons.geoentitylinker.GazetteerEntry;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

@@ -29,13 +31,13 @@
  * outliers by finding those points that are not near the majority

  *

  */

-public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {

+public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {

 

   private final PointClustering CLUSTERER = new PointClustering();

   private int PRECISION = 3;

 

   @Override

-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties,  AdminBoundaryContext additionalContext) {

      //Map<Double, Double> latLongs = new HashMap<Double, Double>();

     List<GazetteerEntry> allGazEntries = new ArrayList<>();

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
similarity index 90%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
index 5567fa2..f56e8da 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import java.util.List;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

@@ -23,6 +23,7 @@
 /**

  * Structure for scoring linked entities. The Map logically represents a pair :

  * "Score type" to the "actual Score."

+ * @param <T> a generic for providing additional context

  */

 public interface LinkedEntityScorer<T> {

 

@@ -32,6 +33,7 @@
  * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored

  * @param docText the full text of the document.

  * @param sentenceSpans the sentence spans that correspond to the document text

+   * @param properties the entitylinker properties config file

  * @param additionalContext any additional data required to perform the scoring operation

  * @return void

  */


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
similarity index 94%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
index 35b423a..3202f85 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import java.io.File;

 import java.io.FileNotFoundException;

@@ -21,6 +21,7 @@
 import java.util.HashMap;

 import java.util.List;

 import java.util.Map;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

 import opennlp.tools.doccat.DoccatModel;

 import opennlp.tools.doccat.DocumentCategorizerME;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

@@ -33,7 +34,7 @@
  *

  * Utilizes a doccat model to score toponyms based on surrounding context

  */

-public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {

+public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {

 

   private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);

   DocumentCategorizerME documentCategorizerME;

@@ -42,7 +43,7 @@
   boolean modelexists = false;

 

   @Override

-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

     try {

       if (doccatModel == null) {

         String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
similarity index 78%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
index bf7f701..908df1e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import com.spatial4j.core.context.SpatialContext;

 import com.spatial4j.core.io.GeohashUtils;

@@ -22,6 +22,7 @@
 import java.util.HashMap;

 import java.util.List;

 import java.util.Map;

+import opennlp.addons.geoentitylinker.GazetteerEntry;

 

 /**

  *

@@ -114,36 +115,6 @@
     return point;

   }

 

-  /**

-   * Hashes a lat long based on adding 90 or 180 and then interlarding lat lon

-   * chars. reduces a set of points to a sortable set

-   *

-   * @param lat

-   * @param lon

-   * @return

-   */

-  public String simpleGeohash(Double lat, Double lon) {

-    String geoHash = "";

-    lat = lat + 90;

-    lon = lon + 180;

-    String latString = String.valueOf(lat).replace(".", "");

-    String lonString = String.valueOf(lon).replace(".", "");

-    int length = latString.length() > lonString.length() ? lonString.length() : latString.length();

-    while (length < 12) {

-      latString += "0";

-      lonString += "0";

-      length++;

-    }

-    latString = latString.substring(0, 10);

-    lonString = lonString.substring(0, 10);

-    char[] latChars = latString.toCharArray();

-    char[] lonChars = lonString.toCharArray();

-

-    for (int i = 0; i < latChars.length; i++) {

-      geoHash += String.valueOf(latChars[i]) + String.valueOf(lonChars[i]);

-    }

-    return geoHash;

-  }

 

   private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {

     Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
similarity index 76%
copy from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
copy to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index 3dbf5d1..f3199a1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java

@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

-package opennlp.addons.geoentitylinker;

+package opennlp.addons.geoentitylinker.scoring;

 

 import java.util.ArrayList;

 import java.util.HashMap;

@@ -22,29 +22,39 @@
 import java.util.Map;

 import java.util.Set;

 import java.util.TreeSet;

-import opennlp.tools.entitylinker.EntityLinkerProperties;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

+import opennlp.addons.geoentitylinker.GazetteerEntry;

 import opennlp.tools.entitylinker.BaseLink;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.LinkedSpan;

 import opennlp.tools.util.Span;

 

 /**

- * Scores toponyms based on their proximity to a country mention. Based on the

- * heuristic that typonymn mentions are more likely close to their parent

- * country mentions. For instance, if the toponym Berlin is mentioned near an

- * indicator of Germany, it is more likely to be Berlin Germany than Berlin

- * Connecticut.

+ * Scores toponyms based on their proximity to a province mention. Based on the

+ * heuristic that toponym mentions are more likely close to their parent

+ * province mentions. For instance, if the toponym Berlin is mentioned near an

+ * indicator of Connecticut, it is more likely to be Berlin Connecticut than

+ * Berlin Germany (if Germany did not exist in, or is mentioned further down in,

+ * the article).

  *

  *

  */

-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {

+public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {

 

   private Map<String, Set<String>> nameCodesMap;

   String dominantCode = "";

 

   @Override

-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {

-

-    score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

+    if (!additionalContext.getProvHits().isEmpty()) {

+      score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

+    } else {

+      for (LinkedSpan<BaseLink> span : linkedSpans) {

+        for (BaseLink link : span.getLinkedEntries()) {

+          link.getScoreMap().put("provincecontext", Double.NaN);

+        }

+      }

+    }

 

   }

 

@@ -53,20 +63,19 @@
    * matches. Currently the scoring indicates the probability that the toponym

    * is correct based on the country context in the document

    *

-   * @param linkedData     the linked spans, holds the Namefinder results, and

-   *                       the list of BaseLink for each

-   * @param countryHits    all the country mentions in the document

-   * @param nameCodesMap   maps a country indicator name to a country code. Used

-   *                       to determine if the namefinder found the same exact

-   *                       toponym the country context did. If so the score is

-   *                       boosted due to the high probability that the

-   *                       NameFinder actually "rediscovered" a country

-   * @param docText        the full text of the document...not used in this

-   *                       default implementation

-   * @param sentences      the sentences that correspond to the doc text.

+   * @param linkedData the linked spans, holds the Namefinder results, and the

+   * list of BaseLink for each

+   * @param countryHits all the country mentions in the document

+   * @param nameCodesMap maps a province indicator name to a province code. Used

+   * to determine if the namefinder found the same exact toponym the country

+   * context did. If so the score is boosted due to the high probability that

+   * the NameFinder actually "rediscovered" a country

+   * @param docText the full text of the document...not used in this default

+   * implementation

+   * @param sentences the sentences that correspond to the doc text.

    * @param maxAllowedDist a constant that is used to determine which country

-   *                       mentions, based on proximity within the text, should

-   *                       be used to score the Named Entity.

+   * mentions, based on proximity within the text, should be used to score the

+   * Named Entity.

    * @return

    */

   public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {

@@ -149,34 +158,35 @@
     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);

     for (BaseLink link : span.getLinkedEntries()) {

       //getItemParentId is the country code

-      String spanCountryCode = link.getItemParentID();

+    GazetteerEntry entry = (GazetteerEntry)link;

+      String spanCountryCode = entry.getProvinceCode();

       if (scoreMap.containsKey(spanCountryCode)) {

 

         score = scoreMap.get(spanCountryCode);

         ///does the name extracted match a country name?

-        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {

+        if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {

           //if so, is it the correct country code for that name?

-          if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {

+          if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) {

             //boost the score because it is likely that this is the location in the text, so add 50% to the score or set to 1

             //TODO: make this smarter, and utilize province/state info in the future to be even more specific

             score = (score + .75) > 1.0 ? 1d : (score + .75);

 

-            if (link.getItemParentID().equals(dominantCode)) {

+            if (entry.getProvinceCode().equals(dominantCode)) {

               score = (score + .25) > 1.0 ? 1d : (score + .25);

             }

           }

         }

       }

-      link.getScoreMap().put("countrycontext", score);

+      link.getScoreMap().put("provincecontext", score);

     }

     return span;

   }

 

   /**

-   * takes a map of distances from the toponym to each country mention and generates

-   * a map of scores for each country code. The map is then correlated to the

-   * code of the BaseLink parentid for retrieval. Then the

-   * score is added to the overall list.

+   * takes a map of distances from the toponym to each country mention and

+   * generates a map of scores for each country code. The map is then correlated

+   * to the code of the BaseLink parentid for retrieval. Then the score is added

+   * to the overall list.

    *

    * @param distanceMap

    * @param sentences

@@ -211,7 +221,6 @@
         normalizedDistances.add(reverse);

       }

 

-

       List<Double> doubles = new ArrayList<Double>(normalizedDistances);

       scoreMap.put(key, slidingDistanceAverage(doubles));

     }

@@ -257,8 +266,8 @@
    * range. Used to normalize distances in this class.

    *

    * @param valueToNormalize the value to place within the new range

-   * @param minimum          the min of the set to be transposed

-   * @param maximum          the max of the set to be transposed

+   * @param minimum the min of the set to be transposed

+   * @param maximum the max of the set to be transposed

    * @return

    */

   private Double normalize(int valueToNormalize, int minimum, int maximum) {
commit	33446b94d0612eadb13492b3ea7e5f9a3d245595	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Fri Jul 11 01:04:58 2014 +0000
committer	Mark Giaconia <markg@apache.org>	Fri Jul 11 01:04:58 2014 +0000
tree	256a55ca02918960313c44360d3a8e331cdc9965
parent	1229638ad20d70905d8761cb4b2c6b47e5e7c48a [diff]