OPENNLP-706
OPENNLP-707
OPENNLP-708
OPENNLP-709
OPENNLP-710
Addressed each ticket. Also adjusted the package structure a bit to separate responsibility better.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
new file mode 100644
index 0000000..638e603
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ * Stores an admin boundary down to the US county level. Only US places from the
+ * USGS Gazetteer will have county-level info.
+ *
+ * @author mgiaconia
+ */
+public class AdminBoundary {
+
+ private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND";
+ private final String countryCode;
+ private final String provinceCode;
+ private final String provinceName;
+ private final String countryName;
+ private final String countyName;
+ private final String countyCode;
+
+ public AdminBoundary(String cc, String ac, String pname, String countryName) {
+ this.countryCode = cc;
+ this.provinceCode = ac;
+ this.provinceName = pname;
+ this.countryName = countryName;
+ this.countyCode = NO_DATA_FOUND_VALUE;
+ this.countyName = NO_DATA_FOUND_VALUE;
+ }
+ /** Creates a boundary with county data; a null or empty county code/name is stored as NO_DATA_FOUND. */
+ public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName) {
+   this.countryCode = countryCode;
+   this.provinceCode = provinceCode;
+   this.provinceName = provinceName;
+   this.countryName = countryName;
+   this.countyName = (countyName == null || countyName.isEmpty()) ? NO_DATA_FOUND_VALUE : countyName;
+   this.countyCode = (countyCode == null || countyCode.isEmpty()) ? NO_DATA_FOUND_VALUE : countyCode;
+ }
+
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public String getProvCode() {
+ return provinceCode;
+ }
+
+ public String getProvinceName() {
+ return provinceName;
+ }
+
+ public String getCountryName() {
+ return countryName;
+ }
+
+ public String getCountyName() {
+ return countyName;
+ }
+
+ public String getCountyCode() {
+ return countyCode;
+ }
+
+ @Override
+ public String toString() {
+ return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}';
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 7;
+ hash = 11 * hash + Objects.hashCode(this.countryCode);
+ hash = 11 * hash + Objects.hashCode(this.provinceCode);
+ hash = 11 * hash + Objects.hashCode(this.provinceName);
+ hash = 11 * hash + Objects.hashCode(this.countryName);
+ hash = 11 * hash + Objects.hashCode(this.countyName);
+ hash = 11 * hash + Objects.hashCode(this.countyCode);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final AdminBoundary other = (AdminBoundary) obj;
+ if (!Objects.equals(this.countryCode, other.countryCode)) {
+ return false;
+ }
+ if (!Objects.equals(this.provinceCode, other.provinceCode)) {
+ return false;
+ }
+ if (!Objects.equals(this.provinceName, other.provinceName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countryName, other.countryName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countyName, other.countyName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countyCode, other.countyCode)) {
+ return false;
+ }
+ return true;
+ }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
new file mode 100644
index 0000000..323aabb
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
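+ * Result holder for the admin boundary mentions (countries, provinces, counties) found in a text, together with the where clauses derived from the country and province hits.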
+ * @author mgiaconia
+ */
+public class AdminBoundaryContext {
+
+ private final Map<String, Set<Integer>> countryMentions;
+ private final Map<String, Set<Integer>> provMentions;
+ private final Map<String, Set<Integer>> countyMentions;
+ private final Set<String> countryHits;
+ private final Set<String> provHits;
+ private final Set<String> countyHits;
+ private final Map<String, String> countryRefMap;
+ private final Map<String, Map<String, String>> provRefMap;
+ private final Map<String, Map<String, String>> countyRefMap;
+ private final Set<String> whereClauses;
+ private final Map<String, Set<String>> nameCodesMap;
+
+ public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,
+ Map<String, Set<Integer>> provMentions,
+ Map<String, Set<Integer>> countyMentions,
+ Set<String> countryHits,
+ Set<String> provHits,
+ Set<String> countyHits,
+ Map<String, String> countryRefMap,
+ Map<String, Map<String, String>> provRefMap,
+ Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap) {
+ this.countryMentions = countryMentions;
+ this.provMentions = provMentions;
+ this.countyMentions = countyMentions;
+ this.countryHits = countryHits;
+ this.provHits = provHits;
+ this.countyHits = countyHits;
+ this.countryRefMap = countryRefMap;
+ this.provRefMap = provRefMap;
+ this.countyRefMap = countyRefMap;
+ this.whereClauses = setWhereClauses();
+ this.nameCodesMap = nameCodesMap;
+ }
+
+ public Map<String, Set<String>> getNameCodesMap() {
+ return nameCodesMap;
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ public Map<String, Set<Integer>> getProvMentions() {
+ return provMentions;
+ }
+
+ public Map<String, Set<Integer>> getCountyMentions() {
+ return countyMentions;
+ }
+
+ public Set<String> getCountryHits() {
+ return countryHits;
+ }
+
+ public Set<String> getProvHits() {
+ return provHits;
+ }
+
+ public Set<String> getCountyHits() {
+ return countyHits;
+ }
+
+ public Map<String, String> getCountryRefMap() {
+ return countryRefMap;
+ }
+
+ public Map<String, Map<String, String>> getProvRefMap() {
+ return provRefMap;
+ }
+
+ public Map<String, Map<String, String>> getCountyRefMap() {
+ return countyRefMap;
+ }
+
+ public Set<String> getWhereClauses() {
+ return whereClauses;
+ }
+
+ /** Builds one clause per country hit, narrowed to each of that country's mentioned provinces when there are any. */
+ private Set<String> setWhereClauses() {
+   Set<String> result = new HashSet<>();
+   for (String cc : this.getCountryHits()) {
+     String lowered = cc.toLowerCase();
+     String source;
+     if (lowered.matches(".*rg[0-9].*")) {
+       source = " AND gazsource:region";
+     } else if (lowered.equals("us")) {
+       source = " AND gazsource:usgs";
+     } else {
+       source = " AND gazsource:geonames";
+     }
+     String countryClause = " countrycode:" + cc;
+     boolean provinceMatched = false;
+     Map<String, String> provinces = this.getProvRefMap().get(cc);
+     if (provinces != null) {
+       for (String provinceCode : provinces.keySet()) {
+         if (this.getProvHits().contains(provinceCode)) {
+           provinceMatched = true;
+           result.add(countryClause + " AND admincode:" + provinceCode + source);
+         }
+       }
+     }
+     if (!provinceMatched) { // got a country with no mentioned provs
+       result.add(countryClause + source);
+     }
+   }
+   return result;
+ }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
new file mode 100644
index 0000000..c09afbd
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import org.apache.log4j.Logger;
+
+/**
+ * Finds instances of country mentions in a String, typically a document text.
+ * Used to boost or degrade scoring of linked geo entities
+ *
+ */
+public class AdminBoundaryContextGenerator {
+
+ private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class);
+ private List<CountryContextEntry> countrydata;
+ private Map<String, Set<String>> nameCodesMap = new HashMap<>();
+ private Map<String, Set<Integer>> countryMentions = new HashMap<>();
+ private Set<CountryContextEntry> countryHits = new HashSet<>();
+ private EntityLinkerProperties properties;
+ private List<AdminBoundary> adminBoundaryData;
+ private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();
+ private AdminBoundaryContext context;
+
+ public AdminBoundaryContext getContext(String text) {
+ context = null;
+ nameCodesMap.clear();
+ context = process(text);
+
+ return context;
+ }
+
+ private Set<String> countryHitSet = new HashSet<>();
+ private Map<String, String> countryMap = new HashMap<>();
+ private Map<String, Map<String, String>> provMap = new HashMap<>();
+ private Map<String, Map<String, String>> countyMap = new HashMap<>();
+
+ private Map<String, Set<Integer>> provMentions = new HashMap<>();
+ private Map<String, Set<Integer>> countyMentions = new HashMap<>();
+
+ private Set<String> provHits = new HashSet<String>();
+ private Set<String> countyHits = new HashSet<String>();
+
+ public static void main(String[] args) { // manual smoke test; args[0] may name the properties file
+   try {
+     File propertiesFile = new File(args.length > 0 ? args[0] : "c:\\temp\\entitylinker.properties");
+     AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator(new EntityLinkerProperties(propertiesFile));
+     GeoEntityLinker linker = new GeoEntityLinker();
+     linker.init(new EntityLinkerProperties(propertiesFile));
+
+     countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is also mentioned. Eastern Africa people are cool.");
+   } catch (Exception ex) {
+     java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);
+   }
+ }
+
+ public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws Exception {
+ this.properties = properties;
+ if (countrydata == null) {
+ String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+
+ File countryContextFile = new File(path);
+ //countrydata = getCountryContextFromFile(countryContextFile);
+ adminBoundaryData = getContextFromFile(countryContextFile);
+ }
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ /**
+ * returns the last set of hits after calling regexFind
+ *
+ * @return
+ */
+ public Set<CountryContextEntry> getCountryHits() {
+ return countryHits;
+ }
+
+ /**
+ * returns the last name to codes map after calling regexFind
+ *
+ * @return
+ */
+ public Map<String, Set<String>> getNameCodesMap() {
+ return nameCodesMap;
+ }
+
+ public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
+ this.nameCodesMap = nameCodesMap;
+ }
+
+ private void reset() {
+ this.nameCodesMap.clear();
+ this.countryHitSet.clear();
+ this.countryHits.clear();
+ this.countryMentions.clear();
+ this.provHits.clear();
+ this.provMentions.clear();
+ this.countyHits.clear();
+ this.countyMentions.clear();
+ this.adminBoundaryHits.clear();
+ }
+
+ /**
+  * Finds indicators of countries, provinces, and counties, as per the USGS and
+  * Geonames gazetteers. The results of this are used to score toponyms
+  * downstream. The full text of a document should be passed in here.
+  *
+  * @param text the full text of the document (block of text).
+  * @return the context for the text, or null when text is null or processing fails
+  */
+ private AdminBoundaryContext process(String text) {
+   if (text == null) {
+     return null; // nothing to scan; null was already the outcome for a null text
+   }
+   try {
+     reset();
+     Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);
+     if (!countryhitMap.isEmpty()) {
+       // countries were mentioned: only look for the provinces of those countries
+       for (String cc : countryhitMap.keySet()) {
+         // regexfind returns an empty map for a country without known provinces
+         Map<String, Set<Integer>> provsFound = regexfind(text, provMap.get(cc), provHits);
+         // accumulate rather than assign, so one country's hits do not replace another's
+         provMentions.putAll(provsFound);
+         for (String prov : provsFound.keySet()) {
+           countyMentions.putAll(regexfind(text, countyMap.get(prov), countyHits));
+         }
+       }
+     } else {
+       // no country was mentioned: scan the provinces of every known country
+       for (Map<String, String> provsForCc : provMap.values()) {
+         Map<String, Set<Integer>> provsFound = regexfind(text, provsForCc, provHits);
+         provMentions.putAll(provsFound);
+         for (Map.Entry<String, Set<Integer>> prov : provsFound.entrySet()) {
+           //fake a country hit based on a province hit... this gets fuzzy
+           String cc = prov.getKey().split("\\.")[0];
+           Set<Integer> countryOffsets = countryhitMap.get(cc);
+           if (countryOffsets == null) {
+             // a separate set: reusing the province's own set would add the offsets
+             // of the country's other provinces to that province's mentions
+             countryOffsets = new HashSet<>();
+             countryhitMap.put(cc, countryOffsets);
+             countryHitSet.add(cc);
+           }
+           countryOffsets.addAll(prov.getValue());
+           countyMentions.putAll(regexfind(text, countyMap.get(prov.getKey()), countyHits));
+         }
+       }
+     }
+
+     Map<String, String> countryRefMap = new HashMap<>();
+     for (String c : countryHitSet) {
+       String countryName = countryMap.get(c);
+       if (countryName != null) {
+         countryRefMap.put(c, countryName);
+       }
+     }
+
+     // reset() clears the per-call collections on the next call, so the context gets
+     // its own copies and stays valid after this generator is used again
+     return new AdminBoundaryContext(countryhitMap,
+         new HashMap<String, Set<Integer>>(provMentions),
+         new HashMap<String, Set<Integer>>(countyMentions),
+         new HashSet<String>(countryHitSet),
+         new HashSet<String>(provHits),
+         new HashSet<String>(countyHits),
+         countryRefMap, provMap, countyMap,
+         new HashMap<String, Set<String>>(nameCodesMap));
+   } catch (Exception e) {
+     LOGGER.error("Unable to build the admin boundary context", e);
+   }
+   return null;
+ }
+
+ /**
+ * Finds mentions of countries to assist in toponym resolution. Countries are
+ * discovered via regex based on a configured file called
+ * opennlp.geoentitylinker.countrycontext.txt. the file is configured using
+ * the entitylinker.properties file as such:
+ * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
+ *
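+ * Finding mentions in documents is very helpful for scoring. NOTE: the country
+ * list this method iterates is no longer loaded by the constructor, so the lookup fails, the error is logged, and an empty map is returned.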
+ *
+ * @param docText the full text of the document
+ * @return
+ */
+ @Deprecated
+ public Map<String, Set<Integer>> regexfind(String docText) {
+ countryMentions = new HashMap<>();
+ nameCodesMap.clear();
+ try {
+
+ for (CountryContextEntry entry : countrydata) {
+ Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Matcher rs = regex.matcher(docText);
+ String code = entry.getCc1().toLowerCase();
+
+ boolean found = false;
+ while (rs.find()) {
+ found = true;
+ Integer start = rs.start();
+ String hit = rs.group().toLowerCase();
+ if (countryMentions.containsKey(code)) {
+ countryMentions.get(code).add(start);
+ } else {
+ Set<Integer> newset = new HashSet<Integer>();
+ newset.add(start);
+ countryMentions.put(code, newset);
+ }
+ if (!hit.equals("")) {
+ if (this.nameCodesMap.containsKey(hit)) {
+ nameCodesMap.get(hit).add(code);
+ } else {
+ HashSet<String> newset = new HashSet<String>();
+ newset.add(code);
+ nameCodesMap.put(hit, newset);
+ }
+ }
+ }
+ if (found) {
+ countryHits.add(entry);
+ }
+
+ }
+
+ } catch (Exception ex) {
+ LOGGER.error(ex);
+ }
+
+ return countryMentions;
+ }
+
+ /**
+  * discovers indicators of admin boundary data using regex.
+  *
+  * @param docText the full text
+  * @param lookupMap a map to use to find names. the key=a location code, the
+  * value is an actual name. May be null, in which case nothing is found.
+  * @param hitsRef a reference to a set that stores the hits by id
+  * @return the match offsets in docText, keyed by lower-cased location code
+  */
+ private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef) {
+   Map<String, Set<Integer>> mentions = new HashMap<>();
+   if (lookupMap == null) {
+     return mentions;
+   }
+   try {
+
+     for (Map.Entry<String, String> entry : lookupMap.entrySet()) {
+       // test for null before dereferencing; a code without a name is skipped
+       if (entry.getValue() == null) {
+         continue;
+       }
+       String name = entry.getValue().toLowerCase().replace(", the", "");
+       // the name is literal text that may hold regex metacharacters, so quote it; it
+       // has to be enclosed by characters that are neither letters nor digits
+       String delimited = "[^\\p{L}\\p{Nd}]" + Pattern.quote(name) + "[^\\p{L}\\p{Nd}]";
+       Pattern regex = Pattern.compile(delimited, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+       Matcher rs = regex.matcher(docText);
+       String code = entry.getKey().toLowerCase();
+
+       boolean found = false;
+       while (rs.find()) {
+         found = true;
+         // the match includes the delimiters, so the offset is that of the leading one
+         Integer start = rs.start();
+         String hit = rs.group().toLowerCase().trim();
+         hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", "");
+         Set<Integer> offsets = mentions.get(code);
+         if (offsets == null) {
+           offsets = new HashSet<>();
+           mentions.put(code, offsets);
+         }
+         offsets.add(start);
+         // remember which codes each matched name stands for
+         if (!hit.equals("")) {
+           Set<String> codes = nameCodesMap.get(hit);
+           if (codes == null) {
+             codes = new HashSet<>();
+             nameCodesMap.put(hit, codes);
+           }
+           codes.add(code);
+         }
+       }
+       // record the code once if its name occurred at all
+       if (found) {
+         hitsRef.add(code);
+       }
+     }
+
+   } catch (Exception ex) {
+     LOGGER.error("Admin boundary lookup failed", ex);
+   }
+
+   return mentions;
+ }
+
+ /**
+  * Loads the admin boundary rows from a tab separated file and indexes them via loadMaps.
+  * Columns: country code, province code, county code, country name, province name and an
+  * optional county name. All values are lower-cased and trimmed. Data that was
+  * already loaded is returned as is.
+  *
+  * @param countryContextFile the tab separated admin boundary file
+  * @return the rows parsed; incomplete or empty when reading fails (the failure is logged)
+  * @throws IllegalArgumentException if a line does not have 5 or 6 columns
+  */
+ private List<AdminBoundary> getContextFromFile(File countryContextFile) {
+   if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {
+     return adminBoundaryData;
+   }
+
+   List<AdminBoundary> entries = new ArrayList<>();
+   // try-with-resources closes the reader even when a malformed line aborts the load
+   // NOTE(review): FileReader decodes with the platform default charset - confirm the file's encoding
+   try (BufferedReader reader = new BufferedReader(new FileReader(countryContextFile))) {
+     String line;
+     while ((line = reader.readLine()) != null) {
+       String[] values = line.split("\t");
+       int len = values.length;
+       if (len < 5 || len > 6) {
+         throw new IllegalArgumentException("Improperly formatted file");
+       }
+       // a row without a county name has only 5 columns
+       String countyName = len == 6 ? values[5].toLowerCase().trim() : "";
+       // constructor order: country code, country name, province code, province name, county code, county name
+       entries.add(new AdminBoundary(
+           values[0].toLowerCase().trim(),
+           values[3].toLowerCase().trim(),
+           values[1].toLowerCase().trim(),
+           values[4].toLowerCase().trim(),
+           values[2].toLowerCase().trim(),
+           countyName));
+     }
+   } catch (IOException ex) {
+     LOGGER.error("Unable to read the admin boundary file " + countryContextFile, ex);
+   }
+
+   loadMaps(entries);
+   return entries;
+ }
+
+ /** Indexes the rows into countryMap, provMap (keyed by country code) and countyMap (keyed by "country.province"). */
+ private void loadMaps(List<AdminBoundary> boundaries) {
+   for (AdminBoundary boundary : boundaries) {
+     String countryCode = boundary.getCountryCode();
+     if (countryCode.equals("null")) {
+       continue; // rows without a country code are not indexed at all
+     }
+     countryMap.put(countryCode, boundary.getCountryName());
+     if (boundary.getProvCode().equals("null")) {
+       continue;
+     }
+     String provinceId = countryCode + "." + boundary.getProvCode();
+     Map<String, String> provinces = provMap.get(countryCode);
+     if (provinces == null) {
+       provinces = new HashMap<>();
+       provMap.put(countryCode, provinces);
+     }
+     provinces.put(provinceId, boundary.getProvinceName());
+
+     boolean noCountyCode = boundary.getCountyCode().toLowerCase().equals("no_data_found");
+     if (noCountyCode || boundary.getCountyName().toLowerCase().equals("no_data_found")) {
+       continue;
+     }
+     Map<String, String> counties = countyMap.get(provinceId);
+     if (counties == null) {
+       counties = new HashMap<>();
+       countyMap.put(provinceId, counties);
+     }
+     counties.put(provinceId + "." + boundary.getCountyCode(), boundary.getCountyName());
+   }
+ }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
deleted file mode 100644
index 4aa9e16..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import org.apache.log4j.Logger;
-
-/**
- * Finds instances of country mentions in a String, typically a document text.
- * Used to boost or degrade scoring of linked geo entities
- *
- */
-public class CountryContext {
-
- private static final Logger LOGGER = Logger.getLogger(CountryContext.class);
- private List<CountryContextEntry> countrydata;
- private Map<String, Set<String>> nameCodesMap = new HashMap<>();
- private Map<String, Set<Integer>> countryMentions = new HashMap<>();
- private Set<CountryContextEntry> countryHits = new HashSet<>();
- private EntityLinkerProperties properties;
-
- public CountryContext(EntityLinkerProperties properties) throws Exception {
- this.properties = properties;
- if (countrydata == null) {
- String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
- File countryContextFile = new File(path);
- countrydata = getCountryContextFromFile(countryContextFile);
- }
- }
-
- public Map<String, Set<Integer>> getCountryMentions() {
- return countryMentions;
- }
-
- /**
- * returns the last set of hits after calling regexFind
- *
- * @return
- */
- public Set<CountryContextEntry> getCountryHits() {
- return countryHits;
- }
-
- /**
- * returns the last name to codes map after calling regexFind
- *
- * @return
- */
- public Map<String, Set<String>> getNameCodesMap() {
- return nameCodesMap;
- }
-
- public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
- this.nameCodesMap = nameCodesMap;
- }
-
- /**
- * Finds mentions of countries to assist in toponym resolution. Countries are
- * discovered via regex based on a configured file called
- * opennlp.geoentitylinker.countrycontext.txt. the file is configured using
- * the entitylinker.properties file as such:
- * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
- *
- * Finding mentions in documents is very helpful for scoring. Lazily loads the
- * list from the file.
- *
- * @param docText the full text of the document
- * @param properties EntityLinkerProperties for getting database connection
- * @return
- */
- public Map<String, Set<Integer>> regexfind(String docText) {
- countryMentions = new HashMap<>();
- nameCodesMap.clear();
- try {
-
- for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
- Matcher rs = regex.matcher(docText);
- String code = entry.getCc1().toLowerCase();
-
- boolean found = false;
- while (rs.find()) {
- found = true;
- Integer start = rs.start();
- String hit = rs.group().toLowerCase();
- if (countryMentions.containsKey(code)) {
- countryMentions.get(code).add(start);
- } else {
- Set<Integer> newset = new HashSet<Integer>();
- newset.add(start);
- countryMentions.put(code, newset);
- }
- if (!hit.equals("")) {
- if (this.nameCodesMap.containsKey(hit)) {
- nameCodesMap.get(hit).add(code);
- } else {
- HashSet<String> newset = new HashSet<String>();
- newset.add(code);
- nameCodesMap.put(hit, newset);
- }
- }
- }
- if (found) {
- countryHits.add(entry);
- }
-
- }
-
- } catch (Exception ex) {
- LOGGER.error(ex);
- }
-
- return countryMentions;
- }
-
- private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {
- List<CountryContextEntry> entries = new ArrayList<>();
- String path = countryContextFile.getPath();
- BufferedReader reader;
-
- try {
- path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
- reader = new BufferedReader(new FileReader(path));
-
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split("\t");
- if (values.length != 4) {
- throw new IOException("improperly formatted country context file");
- }
- CountryContextEntry entry = new CountryContextEntry();
- // rc,cc1, full_name_nd_ro,dsg
- entry.setRc(values[0].toLowerCase());
- entry.setCc1(values[1].toLowerCase());
- entry.setFull_name_nd_ro(values[2].toLowerCase());
- entry.setDsg(values[3].toLowerCase());
- entries.add(entry);
- }
- reader.close();
- } catch (IOException ex) {
- LOGGER.error(ex);
- }
- return entries;
-
- }
-}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
index a208d78..0c37eee 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
@@ -31,6 +31,9 @@
private String source;
private String indexID;
private Map<String, String> indexData = new HashMap<>();
+ private String countryCode;
+ private String provinceCode;
+ private String hierarchy;
/**
* returns the id from the lucene document
@@ -159,5 +162,28 @@
return true;
}
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public void setCountryCode(String countryCode) {
+ this.countryCode = countryCode;
+ }
+
+ public String getProvinceCode() {
+ return provinceCode;
+ }
+
+ public void setProvinceCode(String provinceCode) {
+ this.provinceCode = provinceCode;
+ }
+
+ public String getHierarchy() {
+ return hierarchy;
+ }
+
+ public void setHierarchy(String hierarchy) {
+ this.hierarchy = hierarchy;
+ }
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
deleted file mode 100644
index dd65ec7..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-
-/**
- *
- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
- */
-public class GazetteerIndexer {
-
- public GazetteerIndexer() {
-
- }
-
-
- public static interface Separable {
-
- String getSeparator();
- }
-
- public enum GazType implements Separable {
-
- GEONAMES {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_geonames_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\t";
- }
- },
- USGS {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_usgsgaz_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\\|";
- }
- }
- }
-
- /**
- * indexes the USGS or Geonames gazateers.
- *
- * @param outputIndexDir a DIRECTORY path where you would like to store
- * the output lucene indexes
- * @param gazetteerInputData the file, "as is" that was downloaded from the
- * USGS and GEONAMES website
- * @param type indicates whether the data is USGS or GEONAMES
- * format
- * @throws Exception
- */
- public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
- if (!outputIndexDir.isDirectory()) {
- throw new IllegalArgumentException("outputIndexDir must be a directory.");
- }
-
- String indexloc = outputIndexDir + type.toString();
- Directory index = new MMapDirectory(new File(indexloc));
-
- Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
-
- IndexWriter w = new IndexWriter(index, config);
-
- readFile(gazetteerInputData, w, type);
- w.commit();
- w.close();
-
- }
-
- public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<>();
- int counter = 0;
- System.out.println("reading gazetteer data from file...........");
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- for (String columnName : values) {
- fields.add(columnName.replace("»¿", "").trim());
- }
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
- }
- w.addDocument(doc);
- }
- counter++;
- if (counter % 100000 == 0) {
- w.commit();
- System.out.println(counter + " .........committed to index..............");
- }
-
- }
- w.commit();
- System.out.println("Completed indexing gaz! index name is: " + type.toString());
- }
-
-}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
index 3049169..ac5b01e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
@@ -28,7 +28,7 @@
private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>();
/**
- * returns the cached entries. Returns null if the query does not exists in the cache
+ * returns the cached entries. Returns null if the query does not exist in the cache
* @param searchString
* @return
*/
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 1f976d6..9a8be47 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -38,6 +38,7 @@
import opennlp.tools.entitylinker.EntityLinkerProperties;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.search.Sort;
/**
*
@@ -63,11 +64,16 @@
private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
+ private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer opennlpAnalyzer;
+
public static void main(String[] args) {
try {
boolean b = Boolean.valueOf("true");
- new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
+ new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");
} catch (IOException ex) {
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
} catch (Exception ex) {
@@ -79,6 +85,112 @@
this.properties = properties;
init();
}
+/**
+ * Searches the single lucene index that includes the location hierarchy.
+ *
+ * @param searchString the location name to search for
+ * @param rowsReturned how many index entries to return (top N...)
+ * @param whereClause the conditional statement that defines the index type
+ * and the country code; it is appended to the query with AND
+ * @return the entries whose normalized score reaches the score cutoff; empty
+ * when the cleaned search string is empty or when the query cannot be parsed
+ * or executed (the failure is logged, not thrown)
+ */
+  public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {
+    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
+    searchString = cleanInput(searchString);
+    if (searchString.isEmpty()) {
+      return linkedData;
+    }
+    try {
+      /**
+       * build the search string. A name of several words must also match the
+       * hierarchy field.
+       */
+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;
+      if (searchString.trim().contains(" ")) {
+        placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
+            + " AND " + whereClause;
+      }
+
+      /**
+       * check the cache and go no further if the records already exist
+       */
+      ArrayList<GazetteerEntry> cached = GazetteerSearchCache.get(placeNameQueryString);
+      if (cached != null) {
+        return cached;
+      }
+      /**
+       * search the placename. The second QueryParser argument names the
+       * default field for terms without a field prefix, so it has to be a
+       * field name and not the query text.
+       */
+      QueryParser parser = new QueryParser(Version.LUCENE_48, "placename", opennlpAnalyzer);
+      Query q = parser.parse(placeNameQueryString);
+
+      // the plain top N search orders by relevance and fills in the score of every hit
+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
+
+      for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
+        GazetteerEntry entry = new GazetteerEntry();
+        int docId = bestDocs.scoreDocs[i].doc;
+        double sc = bestDocs.scoreDocs[i].score;
+
+        entry.getScoreMap().put("lucene", sc);
+        entry.setIndexID(docId + "");
+
+        Document d = opennlpSearcher.doc(docId);
+
+        String lat = d.get("latitude");
+        String lon = d.get("longitude");
+        String placename = d.get("placename");
+        String countryCode = d.get("countrycode");
+        // a record without a country code must not abort the whole search
+        String parentid = countryCode == null ? "" : countryCode.toLowerCase();
+        String provid = d.get("admincode");
+        String itemtype = d.get("loctype");
+        String source = d.get("gazsource");
+        String hier = d.get("hierarchy");
+
+        entry.setSource(source);
+        entry.setItemID(docId + "");
+        entry.setLatitude(Double.valueOf(lat));
+        entry.setLongitude(Double.valueOf(lon));
+        entry.setItemType(itemtype);
+        entry.setItemParentID(parentid);
+        entry.setProvinceCode(provid);
+        entry.setCountryCode(parentid);
+        entry.setItemName(placename);
+        entry.setHierarchy(hier);
+        // copy every stored field of the record into the entry's index data
+        for (IndexableField field : d.getFields()) {
+          entry.getIndexData().put(field.name(), d.get(field.name()));
+        }
+        /**
+         * norm the lucene score by the longer of the two names
+         */
+        int maxLen = Math.max(searchString.length(), entry.getItemName().length());
+        Double normLev = Math.abs(1 - (sc / (double) maxLen));
+
+        /**
+         * only want hits at or above the score cutoff, and no duplicates
+         */
+        if (normLev.compareTo(scoreCutoff) >= 0) {
+          entry.getScoreMap().put("normlucene", normLev);
+          if (!linkedData.contains(entry)) {
+            linkedData.add(entry);
+          }
+        }
+      }
+      // cache once, after the loop; an empty result is not cached
+      if (!linkedData.isEmpty()) {
+        GazetteerSearchCache.put(placeNameQueryString, linkedData);
+      }
+    } catch (IOException | ParseException ex) {
+      LOGGER.error(ex);
+    }
+    return linkedData;
+  }
/**
*
@@ -88,6 +200,7 @@
*
* @return
*/
+ @Deprecated
public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
searchString = cleanInput(searchString);
@@ -198,6 +311,7 @@
*
* @return
*/
+ @Deprecated
public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
searchString = cleanInput(searchString);
@@ -284,7 +398,8 @@
}
/**
- * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
+ * Replaces any noise chars with a space, and depending on configuration adds
+ * double quotes to the string
*
* @param input
* @return
@@ -300,36 +415,66 @@
}
private void init() throws Exception {
- if (usgsIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+// if (usgsIndex == null) {
+// String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+// if (indexloc.equals("")) {
+// // System.out.println("USGS Gaz location not found");
+// LOGGER.error(new Exception("USGS Gaz location not found"));
+// }
+// String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+//
+// scoreCutoff = Double.valueOf(cutoff);
+// String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
+// doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
+// usgsIndex = new MMapDirectory(new File(indexloc));
+// usgsReader = DirectoryReader.open(usgsIndex);
+// usgsSearcher = new IndexSearcher(usgsReader);
+// usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+// }
+// if (geonamesIndex == null) {
+// String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+// if (indexloc.equals("")) {
+// LOGGER.error(new Exception("Geonames Gaz location not found"));
+//
+// }
+// String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+// scoreCutoff = Double.valueOf(cutoff);
+// geonamesIndex = new MMapDirectory(new File(indexloc));
+// geonamesReader = DirectoryReader.open(geonamesIndex);
+// geonamesSearcher = new IndexSearcher(geonamesReader);
+// //TODO: a language code switch statement should be employed here at some point
+// geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+//
+// }
+ if (opennlpIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");
if (indexloc.equals("")) {
- // System.out.println("USGS Gaz location not found");
- LOGGER.error(new Exception("USGS Gaz location not found"));
- }
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-
- scoreCutoff = Double.valueOf(cutoff);
- String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
- doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
- usgsIndex = new MMapDirectory(new File(indexloc));
- usgsReader = DirectoryReader.open(usgsIndex);
- usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
- }
- if (geonamesIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- if (indexloc.equals("")) {
- LOGGER.error(new Exception("Geonames Gaz location not found"));
+ LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));
}
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
- scoreCutoff = Double.valueOf(cutoff);
- geonamesIndex = new MMapDirectory(new File(indexloc));
- geonamesReader = DirectoryReader.open(geonamesIndex);
- geonamesSearcher = new IndexSearcher(geonamesReader);
+ // String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+ // scoreCutoff = Double.valueOf(cutoff);
+ opennlpIndex = new MMapDirectory(new File(indexloc));
+ opennlpReader = DirectoryReader.open(opennlpIndex);
+ opennlpSearcher = new IndexSearcher(opennlpReader);
//TODO: a language code switch statement should be employed here at some point
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+ opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
}
}
+
+  /**
+   * Joins the words of searchTerm with " AND "; without any word it returns cleanInput(searchTerm).
+   */
+  private String formatForHierarchy(String searchTerm) {
+    StringBuilder out = new StringBuilder();
+    // split on whitespace runs: split(" ") yields empty words for repeated blanks, which breaks the query
+    for (String word : searchTerm.trim().split("\\s+")) {
+      if (!word.isEmpty()) {
+        out.append(out.length() == 0 ? "" : " AND ").append(word);
+      }
+    }
+    return out.length() == 0 ? cleanInput(searchTerm) : out.toString();
+  }
+
}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index b147d27..367a082 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -15,6 +15,11 @@
*/
package opennlp.addons.geoentitylinker;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
+import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
+import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
+import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
+import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -33,11 +38,11 @@
*/
public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
- private CountryContext countryContext;
+ private AdminBoundaryContextGenerator countryContext;
private Map<String, Set<Integer>> countryMentions;
private EntityLinkerProperties linkerProperties;
private GazetteerSearcher gazateerSearcher;
- private List<LinkedEntityScorer> scorers = new ArrayList<>();
+ private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();
@Override
public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -46,8 +51,8 @@
if (linkerProperties == null) {
throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
}
- countryMentions = countryContext.regexfind(doctext);
-
+ //countryMentions = countryContext.regexfind(doctext);
+ AdminBoundaryContext context = countryContext.getContext(doctext);
for (int s = 0; s < sentences.length; s++) {
Span[] names = namesBySentence[s];
String[] tokens = tokensBySentence[s];
@@ -55,51 +60,27 @@
for (int i = 0; i < matches.length; i++) {
- /**
- * nga gazateer is for other than US placenames,don't want to use it if
- * US is the only country mentioned in the doc
- *
- */
ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
- if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
- || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
-
- if (!countryMentions.keySet().isEmpty()) {
- for (String code : countryMentions.keySet()) {
- if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
- }
- }
- } else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
-
+ if (!context.getWhereClauses().isEmpty()) {
+ for (String whereclause : context.getWhereClauses()) {
+ geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));
}
-
+ }else{//this means there were no where clauses generated so the where clause will default to look at the entire index
+ geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));
}
- ArrayList<BaseLink> usgsEntries = new ArrayList<>();
- if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
- //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
- usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));
- }
- LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd(), "location",names[i].getProb());
-
-
- if (!usgsEntries.isEmpty()) {
- geoSpan.getLinkedEntries().addAll(usgsEntries);
- geoSpan.setSearchTerm(matches[i]);
- }
-
- if (!geoSpan.getLinkedEntries().isEmpty()) {
- geoSpan.setSearchTerm(matches[i]);
- geoSpan.setSentenceid(s);
- spans.add(geoSpan);
- }
+ //start generating queries
+ LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
+ newspan.setSearchTerm(matches[i]);
+ newspan.setLinkedEntries(geoNamesEntries);
+ newspan.setSentenceid(s);
+ spans.add(newspan);
}
+
}
if (!scorers.isEmpty()) {
for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+ scorer.score(spans, doctext, sentences, linkerProperties, context);
}
}
@@ -111,6 +92,8 @@
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());
+ scorers.add(new FuzzyStringMatchScorer());
+ // scorers.add(new ProvinceProximityScorer());
}
}
@@ -118,7 +101,7 @@
public void init(EntityLinkerProperties properties) {
try {
this.linkerProperties = properties;
- countryContext = new CountryContext(this.linkerProperties);
+ countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
loadScorers();
} catch (Exception ex) {
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
new file mode 100644
index 0000000..98dd7b5
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Builds the single lucene index used by the GeoEntityLinker from the Geonames
+ * gazetteer, the USGS gazetteer and a regions file.
+ *
+ * The eight argument index method does the work; main runs it either on the
+ * paths given on the command line or on a built in set of default paths.
+ */
+public class GazetteerIndexer {
+
+  /** Directory name of the combined index, created inside the output directory. */
+  private static final String INDEX_DIR_NAME = "opennlp_geoentitylinker_gazetteer";
+
+  /** Paths used by main when none are given, in the parameter order of the eight argument index method. */
+  private static final String[] DEFAULT_PATHS = {
+    "C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt",
+    "C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt",
+    "C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt",
+    "C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt",
+    "C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt",
+    "C:\\temp\\gazetteers\\",
+    "C:\\temp\\gazetteers\\newCountryContextFile.txt",
+    "C:\\temp\\gazetteers\\regions.txt"
+  };
+
+  /**
+   * Runs the indexer and prints the stack trace if it fails.
+   *
+   * @param args either the eight paths of the eight argument index method, in
+   * its parameter order, or anything else to use the built in default paths
+   */
+  public static void main(String[] args) {
+    String[] paths = (args != null && args.length == DEFAULT_PATHS.length) ? args : DEFAULT_PATHS;
+    try {
+      GazetteerIndexer i = new GazetteerIndexer();
+      i.index(new File(paths[0]), new File(paths[1]), new File(paths[2]), new File(paths[3]),
+          new File(paths[4]), new File(paths[5]), new File(paths[6]), new File(paths[7]));
+    } catch (Exception ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  /** Creates an indexer; the instance holds no state. */
+  public GazetteerIndexer() {
+
+  }
+
+  /** Implemented by gazetteer formats that know how their columns are separated. */
+  public static interface Separable {
+
+    /** @return the column separator, escaped for use as a regular expression */
+    String getSeparator();
+  }
+
+  /**
+   * The two gazetteer formats. toString() gives the index directory suffix used
+   * by the deprecated index method, getSeparator() the column separator.
+   */
+  public enum GazType implements Separable {
+
+    GEONAMES {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_geonames_idx";
+      }
+
+      @Override
+      public String getSeparator() {
+        return "\t";
+      }
+    },
+    USGS {
+      @Override
+      public String toString() {
+        return "/opennlp_geoentitylinker_usgsgaz_idx";
+      }
+
+      @Override
+      public String getSeparator() {
+        return "\\|";
+      }
+    }
+  }
+
+  /**
+   * Builds the combined gazetteer index and the country context file.
+   *
+   * @param geonamesData the actual Geonames gazetteer data downloaded from
+   * here: http://download.geonames.org/export/dump/ then click on this
+   * link 'allCountries.zip'
+   * @param geoNamesCountryInfo the countryinfo lookup table that can be
+   * downloaded from here
+   * http://download.geonames.org/export/dump/countryinfo.txt
+   * @param geonamesAdmin1CodesASCII The lookup data for the province names for
+   * each place found here:
+   * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the
+   * table view, and copy results into a text file. Make sure the tab delimited
+   * format is maintained.
+   * @param usgsDataFile the actual USGS gazetteer downloaded from here:
+   * http://geonames.usgs.gov/domestic/download_data.htm click on the
+   * national_file####.zip link to get all the most recent features
+   * @param usgsGovUnitsFile go to here:
+   * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
+   * "Topical Gazetteers -- File Format" click on the drop down list and select
+   * "Government Units". The downloaded file is what you need for this param.
+   * @param outputIndexDir where you want the final index. Must be a directory,
+   * not an actual file.
+   * @param outputCountryContextFile The output countrycontext file. This is a
+   * very important file used inside the GeoEntityLinker to assist in toponym
+   * resolution.
+   * @param regionsFile this file contains a list of regions in the following
+   * format: tab delimited text with index 0 as the name of the region, index 1
+   * as the longitude, and index 2 as the latitude
+   * @throws IllegalArgumentException if outputIndexDir is not a directory
+   * @throws FileNotFoundException if one of the input files does not exist
+   * @throws Exception if reading the gazetteers or writing the index fails
+   */
+  public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,
+      File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {
+    if (!outputIndexDir.isDirectory()) {
+      throw new IllegalArgumentException("outputIndexDir must be a directory.");
+    }
+    requireExists(geonamesData, "geonames data file does not exist");
+    requireExists(geoNamesCountryInfo, "geoNamesCountryCodes data file does not exist");
+    requireExists(geonamesAdmin1CodesASCII, "geonamesAdmin1CodesASCII data file does not exist");
+    requireExists(usgsDataFile, "usgsDataFile data file does not exist");
+    requireExists(usgsGovUnitsFile, "usgsGovUnitsFile data file does not exist");
+    requireExists(regionsFile, "regionsFile data file does not exist");
+
+    String indexloc = new File(outputIndexDir, INDEX_DIR_NAME).getPath();
+    // try-with-resources closes the writer and the directory even when a processor fails
+    try (Directory index = new MMapDirectory(new File(indexloc));
+        IndexWriter w = new IndexWriter(index, newWriterConfig())) {
+      // every processor receives the same writer and the same country context file
+      USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
+
+      GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);
+
+      RegionProcessor.process(regionsFile, outputCountryContextFile, w);
+      w.commit();
+    }
+    System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" + outputCountryContextFile.getPath() + "' to entitylinker.properties file");
+  }
+
+  /**
+   * Creates the per-gazetteer index directory for the given type. The
+   * processing calls of this legacy path are disabled, so no gazetteer records
+   * are written and the resulting index is empty.
+   *
+   * @param outputIndexDir a DIRECTORY path where you would like to store the
+   * output lucene indexes
+   * @param gazetteerInputData the file, "as is" that was downloaded from the
+   * USGS and GEONAMES website; currently not read
+   * @param type indicates whether the data is USGS or GEONAMES format
+   * @throws IllegalArgumentException if outputIndexDir is not a directory
+   * @throws Exception if the index cannot be created
+   * @deprecated use the eight argument index method, which builds the combined
+   * index
+   */
+  @Deprecated
+  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
+    if (!outputIndexDir.isDirectory()) {
+      throw new IllegalArgumentException("outputIndexDir must be a directory.");
+    }
+
+    String indexloc = outputIndexDir + type.toString();
+    try (Directory index = new MMapDirectory(new File(indexloc));
+        IndexWriter w = new IndexWriter(index, newWriterConfig())) {
+      // the per-gazetteer processing is disabled; only an empty index is committed
+      w.commit();
+    }
+  }
+
+  /**
+   * Creates the writer configuration shared by both index methods: a
+   * StandardAnalyzer built with an empty stop word set.
+   *
+   * @return a new writer configuration
+   */
+  private static IndexWriterConfig newWriterConfig() {
+    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList<Object>(), true));
+    return new IndexWriterConfig(Version.LUCENE_48, a);
+  }
+
+  /**
+   * Fails when a required input file is missing.
+   *
+   * @param file the file that must exist
+   * @param message the exception message to use when it does not
+   * @throws FileNotFoundException if the file does not exist
+   */
+  private static void requireExists(File file, String message) throws FileNotFoundException {
+    if (!file.exists()) {
+      throw new FileNotFoundException(message);
+    }
+  }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
similarity index 94%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
index 991081a..63cb88c 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.indexing;
import java.io.BufferedOutputStream;
import java.io.File;
@@ -28,6 +28,8 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContextGenerator;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
@@ -41,6 +43,7 @@
*
* Tools for setting up GeoEntityLinker gazateers and doccat scoring model
*/
+@Deprecated
public class GeoEntityLinkerSetupUtils {
private static final int RADIUS = 200;
public static ModelBasedScorer scorer;
@@ -86,7 +89,7 @@
* @throws IOException
*/
public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
- CountryContext context = new CountryContext(properties);
+ AdminBoundaryContextGenerator context = new AdminBoundaryContextGenerator(properties);
FileWriter writer = new FileWriter(annotationOutFile, true);
System.out.println("processing " + documents.size() + " documents");
for (String docText : documents) {
@@ -131,7 +134,7 @@
* @param radius
* @return
*/
- private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ private static Map<String, ArrayList<String>> modelCountryContext(String docText, AdminBoundaryContextGenerator additionalContext, int radius) {
Map<String, ArrayList< String>> featureBags = new HashMap<>();
Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
/**
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
new file mode 100644
index 0000000..73ff14e
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+/**
+ *
+ * @author mgiaconia
+ */
+import java.io.*;
+import java.net.*;
+import java.util.Enumeration;
+import java.util.zip.*;
+public class GeonamesFileDownloader {
+
+  final static int size = 1024; // download buffer size in bytes
+  // NOTE(review): despite the name this is the archive of one country (ZM) - confirm the intended file
+  private static final String ALL_COUNTRIES = "http://download.geonames.org/export/dump/ZM.zip";
+  // NOTE(review): both lookup URLs are still empty, so fileDownload rejects them
+  private static final String COUNTRY_INFO = "";
+  private static final String ADM1_LOOKUP = "";
+
+  /** Downloads the gazetteer files into a fixed local directory; args is not used. */
+  public static void main(String[] args) {
+    downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");
+  }
+
+  /**
+   * Downloads and unzips the gazetteer archive, then downloads the two lookup files.
+   * @param outputFileName currently not used
+   * @param outputDir the directory that receives all files
+   */
+  public static void downloadGeonamesFiles(String outputFileName, String outputDir) {
+    String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);
+    unzipMyZip(fileDownload, outputDir);
+    fileDownload(COUNTRY_INFO, outputDir);
+    fileDownload(ADM1_LOOKUP, outputDir);
+  }
+
+  /**
+   * Copies everything from in to out and closes both streams, also on failure.
+   * @param in the stream to read from
+   * @param out the stream to write to
+   * @throws IOException if reading, writing or closing fails
+   */
+  public static final void writeFile(InputStream in, OutputStream out)
+      throws IOException {
+    try (InputStream source = in; OutputStream target = out) {
+      byte[] buffer = new byte[1024];
+      int len;
+      // read() signals the end of the stream with -1, never with 0
+      while ((len = source.read(buffer)) != -1) {
+        target.write(buffer, 0, len);
+      }
+    }
+  }
+
+  /**
+   * Extracts every file entry of a zip archive into a directory. Only the last
+   * element of an entry name is used as the file name, so entry paths cannot
+   * redirect the output elsewhere. Failures are reported on System.err.
+   * @param zipFileName path of the zip archive
+   * @param directoryToExtractTo the directory that receives the extracted files
+   */
+  public static void unzipMyZip(String zipFileName,
+      String directoryToExtractTo) {
+    try (ZipFile zip = new ZipFile(zipFileName)) {
+      Enumeration<? extends ZipEntry> entriesEnum = zip.entries();
+      while (entriesEnum.hasMoreElements()) {
+        ZipEntry entry = entriesEnum.nextElement();
+        if (!entry.isDirectory()) {
+          File target = new File(directoryToExtractTo, new File(entry.getName()).getName());
+          writeFile(zip.getInputStream(entry), new FileOutputStream(target));
+        }
+      }
+    } catch (IOException ioe) {
+      System.err.println("Some Exception Occurred:");
+      ioe.printStackTrace();
+    }
+  }
+
+  /**
+   * Downloads a URL into a local file. Failures are printed, not thrown.
+   * @param fAddress the URL to download
+   * @param localFileName name of the file to create in destinationDir
+   * @param destinationDir the directory in which the file is created
+   * @return the path of the local file, also when the download failed
+   */
+  public static String fileUrl(String fAddress, String localFileName, String destinationDir) {
+    // File joins the parts with the platform separator instead of a fixed backslash
+    File localFile = new File(destinationDir, localFileName);
+    int byteWritten = 0;
+    try (InputStream is = new URL(fAddress).openConnection().getInputStream();
+        OutputStream outStream = new BufferedOutputStream(new FileOutputStream(localFile))) {
+      byte[] buf = new byte[size];
+      int byteRead;
+      while ((byteRead = is.read(buf)) != -1) {
+        outStream.write(buf, 0, byteRead);
+        byteWritten += byteRead;
+      }
+      System.out.println("Downloaded Successfully.");
+      System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + byteWritten);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+    return localFile.getPath();
+  }
+
+  /**
+   * Downloads fAddress into destinationDir under the name that follows its last slash.
+   * @return the local file path, or "" when the address has no usable file name
+   */
+  public static String fileDownload(String fAddress, String destinationDir) {
+    int slashIndex = fAddress.lastIndexOf('/');
+    int periodIndex = fAddress.lastIndexOf('.');
+    if (periodIndex >= 1 && slashIndex >= 0 && slashIndex < fAddress.length() - 1) {
+      return fileUrl(fAddress, fAddress.substring(slashIndex + 1), destinationDir);
+    }
+    System.err.println("path or file name.");
+    return "";
+  }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
new file mode 100644
index 0000000..bd73bb9
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class GeonamesProcessor {
+
+  /**
+   * Indexes the Geonames gazetteer into {@code w} and appends the province
+   * entries to the country context file.
+   *
+   * @throws Exception if reading the gazetteer or writing to the index fails
+   */
+  public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception {
+    Map<String, String> codeToCountry = getCountryCodes(countryCodesLookupFile);
+    Map<String, AdminBoundary> provinces = getProvData(adm1CodesLookupFile, codeToCountry);
+    readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, provinces, codeToCountry, w);
+    writeCountryContextFile(outputCountryContextFile, provinces);
+  }
+
+  /** No-argument constructor; the methods of this class are all static. */
+  public GeonamesProcessor() { }
+
+  /**
+   * Reads the Geonames admin1 (province) lookup file into a map keyed by
+   * {@code countrycode.provincecode}.
+   * <p>
+   * Every line must hold exactly four tab separated values: the first holds
+   * the {@code country.province} code pair (lower-cased here) and the third
+   * the province name. Reading stops at the first malformed line or I/O
+   * error; the error is printed and the entries read so far are returned.
+   *
+   * @param adm1CodesLookupFile the tab separated province lookup file
+   * @param ccodes country name keyed by lower-cased country code
+   * @return the province data keyed by {@code countrycode.provincecode}
+   */
+  private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) {
+    System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath());
+
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    // country codes that have no entry in ccodes; only their number is reported
+    Set<String> nullcodes = new HashSet<>();
+    // try-with-resources closes the reader even when a line fails to parse
+    try (BufferedReader reader = new BufferedReader(new FileReader(adm1CodesLookupFile))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split("\t");
+        if (values.length != 4) {
+          throw new IOException("improperly formatted province lookup file");
+        }
+        String ccode = values[0].toLowerCase();
+        String pcode = "";
+        String[] split = ccode.split("\\.");
+        if (split.length == 2) {
+          ccode = split[0];
+          pcode = split[1];
+        }
+        String pname = values[2];
+
+        // a country half that starts with a digit is swapped with the province half
+        if (ccode.matches("[0-9].*")) {
+          String code = ccode;
+          ccode = pcode;
+          pcode = code;
+        }
+        String cname = ccodes.get(ccode);
+        if (cname == null) {
+          nullcodes.add(ccode);
+        }
+        outmap.put(ccode + "." + pcode, new AdminBoundary(ccode, pcode, pname, cname));
+      }
+      System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data.");
+      // printed inside the try so that a failed read is not announced as a success
+      System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath());
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    return outmap;
+  }
+
+  /**
+   * Reads the country lookup file into a map of ISO code to country name, both
+   * lower-cased. Lines before the one starting with {@code #iso<TAB>} are
+   * skipped; an I/O error is printed and the entries read so far are returned.
+   *
+   * @param countryContextFile tab separated, ISO code in column 0, name in column 4
+   * @return country name keyed by ISO code
+   */
+  private static Map<String, String> getCountryCodes(File countryContextFile) {
+    Map<String, String> ccs = new HashMap<>();
+    // try-with-resources closes the reader even when readLine() throws
+    try (BufferedReader reader = new BufferedReader(new FileReader(countryContextFile))) {
+      String line;
+      boolean start = false;
+      while ((line = reader.readLine()) != null) {
+        if (!start && !line.toLowerCase().startsWith("#iso\t")) {
+          continue;
+        }
+        start = true;
+        String[] values = line.split("\t");
+        String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
+        String cname = values[4].toLowerCase();
+        if (!ccode.equals("")) {
+          ccs.put(ccode, cname);
+        }
+      }
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    // Lookups against this map use lower-cased codes, so the fallback keys must be lower case to ever match.
+    if (!ccs.containsKey("ss")) { ccs.put("ss", "south sudan"); }
+    if (!ccs.containsKey("cs")) { ccs.put("cs", "kosovo"); }
+    return ccs;
+  }
+
+  /**
+   * Appends one tab separated line per province to the country context file:
+   * country code, province code, an empty county code, country name, province
+   * name and an empty county name.
+   *
+   * @param outfile the country context file; it is opened in append mode
+   * @param adms the province data to write; null values are skipped
+   */
+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+    // try-with-resources closes the writer; no explicit close() is needed
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+      for (AdminBoundary adm : adms.values()) {
+        if (adm == null) {
+          continue;
+        }
+        writer.write(adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + adm.getCountryName() + "\t" + adm.getProvinceName() + "\t" + "" + "\n");
+      }
+      writer.flush(); // push everything out before success is reported
+      System.out.println("successfully wrote Geonames entries to country oontext file");
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  /**
+   * Indexes every line of the Geonames gazetteer file as one Lucene document,
+   * committing every 100000 documents and once more after the last line.
+   * <p>
+   * Each document stores the raw Geonames columns (all but the last one)
+   * under their column names, followed by the standard fields: hierarchy,
+   * placename, latitude, longitude, loctype, admincode, countrycode,
+   * countycode, locid and gazsource.
+   *
+   * @param gazateerInputData the Geonames allCountries.txt file
+   * @param type the gazetteer type; supplies the column separator
+   * @param adms the province info, keyed by {@code countrycode.admin1code}
+   * @param countrycodes the country code info, country name keyed by code
+   * @param w the lucene index writer
+   * @throws Exception if the file cannot be read or the index cannot be written
+   */
+  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {
+    // Geonames column names, in column order
+    List<String> fields = Arrays.asList(
+        "geonameid",
+        "name",
+        "asciiname",
+        "alternatenames",
+        "latitude",
+        "longitude",
+        "feature_class",
+        "feature_code",
+        "country code",
+        "cc2",
+        "admin1_code",
+        "admin2_code",
+        "admin3_code",
+        "admin4_code",
+        "population",
+        "elevation",
+        "dem ",
+        "timezone",
+        "modification_date");
+
+    int counter = 0;
+    System.out.println("reading gazetteer data from file...........");
+    // try-with-resources closes the reader on every path, including failures
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split(type.getSeparator());
+
+        String admincode = values[10].toLowerCase();
+        String ccode = values[8].toLowerCase();
+        if (ccode.contains(",")) {
+          // several codes in one column: keep the first one
+          String[] codes = ccode.split(",");
+          if (codes.length > 0) {
+            ccode = codes[0];
+          }
+        }
+        String placeName = values[2];
+        String lat = values[4];
+        String lon = values[5];
+        String dsg = values[7];
+        String id = values[0];
+
+        AdminBoundary adm = adms.get(ccode + "." + admincode);
+        String concatIndexEntry = "";
+        if (adm != null) {
+          concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;
+        } else {
+          // there is no admin info, but the country code may still give the country name
+          String n = countrycodes.get(ccode);
+          if (n != null) {
+            concatIndexEntry = n + ", " + placeName;
+          }
+          // otherwise the entry stays empty: a single token hierarchy is not wanted
+        }
+
+        Document doc = new Document();
+        for (int i = 0; i < fields.size() - 1; i++) {
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }
+        // add standard fields to the index
+        doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new TextField("latitude", lat, Field.Store.YES));
+        doc.add(new TextField("longitude", lon, Field.Store.YES));
+        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+        doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));
+        doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));
+        doc.add(new TextField("countycode", "", Field.Store.YES));
+        doc.add(new TextField("locid", id, Field.Store.YES));
+        doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
+        w.addDocument(doc);
+
+        counter++;
+        if (counter % 100000 == 0) {
+          w.commit();
+          System.out.println(counter + " .........Geonames entries committed to index..............");
+        }
+      }
+    }
+    // commit the documents added since the last 100000 boundary
+    w.commit();
+    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+  }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
new file mode 100644
index 0000000..3b667cf
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class RegionProcessor {
+
+  public static void main(String[] args) { // ad-hoc run on hard-coded local files, without an index writer
+    process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
+  }
+
+  /**
+   * Indexes the regions file and appends its entries to the country context file.
+   *
+   * @param regionsFile tab delimited text: index 0 is the region name, index 1
+   * the longitude, and index 2 the latitude
+   * @param outputCountryContextfile the country context file shared by all
+   * indexing processors
+   * @param w the lucene index writer; null skips indexing
+   */
+  public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+    try {
+      readFile(regionsFile, outputCountryContextfile, w);
+    } catch (Exception problem) {
+      problem.printStackTrace();
+    }
+  }
+
+  /**
+   * Turns every line of the tab delimited regions file except the first into
+   * one Lucene document and one line appended to the country context file.
+   *
+   * @param gazateerInputData region name at index 0, longitude at 1, latitude at 2
+   * @param outputCountryContextfile the country context file, opened in append mode
+   * @param w the lucene index writer; when null nothing is indexed
+   * @throws Exception if reading, writing or indexing fails
+   */
+  public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+    List<String> ccfileentries = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from Regions file...........");
+    // try-with-resources closes the reader on every path, including failures
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // counter == 0 is the first line of the file, which is not indexed
+        if (counter > 0) {
+          String[] values = line.split("\t");
+          String placeName = values[0];
+          String lat = values[2];
+          String lon = values[1];
+          // "rg" plus the line number serves as both location id and country code
+          String id = "rg" + counter;
+          // only the standard gazetteer fields are stored for a region
+          Document doc = new Document();
+          doc.add(new TextField("hierarchy", placeName, Field.Store.YES));
+          doc.add(new TextField("placename", placeName, Field.Store.YES));
+          doc.add(new TextField("latitude", lat, Field.Store.YES));
+          doc.add(new TextField("longitude", lon, Field.Store.YES));
+          doc.add(new TextField("loctype", "region", Field.Store.YES));
+          doc.add(new TextField("admincode", "", Field.Store.YES));
+          doc.add(new TextField("countrycode", id, Field.Store.YES));
+          doc.add(new TextField("countycode", "", Field.Store.YES));
+          doc.add(new TextField("locid", id, Field.Store.YES));
+          doc.add(new TextField("gazsource", "region", Field.Store.YES));
+          //countrycontext file format
+          // US KY 131 United States Kentucky Leslie
+          ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");
+          if (w != null) {
+            w.addDocument(doc);
+          }
+        }
+        counter++;
+      }
+    }
+    if (w != null) {
+      w.commit();
+    }
+    try (FileWriter writer = new FileWriter(outputCountryContextfile, true)) {
+      for (String entry : ccfileentries) {
+        writer.write(entry);
+      }
+      System.out.println("successfully wrote Region entries to country oontext file");
+    }
+    System.out.println("Completed indexing regions!");
+  }
+
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
new file mode 100644
index 0000000..cdb5ed2
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class USGSProcessor {
+
+  public static void main(String[] args) { // ad-hoc run against hard-coded local USGS files
+    try {
+      getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS); // result unused; process(...) reads this file again
+      process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
+    } catch (Exception failure) {
+      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, failure);
+    }
+  }
+
+  public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception { // lookup, then index, then country context file
+    final Map<String, AdminBoundary> countyLookup = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
+    readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, countyLookup);
+    writeCountryContextFile(outputCountryContextfile, countyLookup);
+  }
+
+  /**
+   * Indexes the USGS gazetteer file: the first line names the columns, every
+   * later line becomes one Lucene document (raw columns plus standard fields).
+   *
+   * @param gazateerInputData the USGS gazetteer file
+   * @param w the lucene index writer; when null the file is parsed but not indexed
+   * @param type supplies the column separator
+   * @param lookupMap admin boundaries as returned by getProvData
+   * @throws Exception if the file cannot be read or the index cannot be written
+   */
+  public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
+    List<String> fields = new ArrayList<>();
+    int counter = 0;
+    int skipped = 0;
+    System.out.println("reading gazetteer data from USGS file...........");
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split(type.getSeparator());
+        if (counter == 0) {
+          for (String columnName : values) {
+            fields.add(columnName.replace("»¿", "").trim());
+          }
+        } else {
+          AdminBoundary get = lookupMap.get(values[3] + "." + values[6]);
+          if (get == null) {
+            skipped++; // no admin boundary for this key: skip the line rather than fail on a null lookup
+          } else {
+            Document doc = new Document();
+            for (int i = 0; i < fields.size() - 1; i++) {
+              doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+            }
+            // NOTE(review): AdminBoundary.NO_DATA_FOUND_VALUE holds "NO_DATA_FOUND", so this literal may never match - confirm
+            String countyname = get.getCountyName().equals("NO_DATA_FOUND_VALUE") ? "" : get.getCountyName();
+            doc.add(new TextField("hierarchy", get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + values[1], Field.Store.YES));
+            doc.add(new TextField("placename", values[1], Field.Store.YES));
+            doc.add(new TextField("latitude", values[9], Field.Store.YES));
+            doc.add(new TextField("longitude", values[10], Field.Store.YES));
+            doc.add(new TextField("loctype", values[2], Field.Store.YES));
+            doc.add(new TextField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));
+            doc.add(new TextField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
+            doc.add(new TextField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + get.getCountyCode()).toLowerCase(), Field.Store.YES));
+            doc.add(new TextField("locid", values[0], Field.Store.YES));
+            doc.add(new TextField("gazsource", "usgs", Field.Store.YES));
+            if (w != null) {
+              w.addDocument(doc);
+            }
+          }
+        }
+        counter++;
+        if (counter % 100000 == 0 && w != null) {
+          w.commit();
+          System.out.println(counter + " .........USGS entries committed to index..............");
+        }
+      }
+    }
+    if (w != null) {
+      w.commit();
+    }
+    System.out.println(skipped + " USGS entries skipped because no admin boundary matched their lookup key");
+    System.out.println("Completed indexing USGS gaz!");
+  }
+
+  /**
+   * Reads the USGS government units file into a map of admin boundaries keyed
+   * by {@code statecode.countycode}. The first line is treated as a header and
+   * skipped. An I/O error is printed and the entries read so far are
+   * returned.
+   *
+   * @param govUnitsFile the government units file
+   * @param type supplies the column separator
+   * @return the admin boundaries keyed by {@code statecode.countycode}
+   */
+  private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
+    System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
+
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    // try-with-resources closes the reader even when readLine() throws
+    try (BufferedReader reader = new BufferedReader(new FileReader(govUnitsFile))) {
+      String line;
+      boolean header = true;
+      while ((line = reader.readLine()) != null) {
+        if (header) {
+          header = false; // the first line is not turned into an entry
+          continue;
+        }
+        String[] values = line.split(type.getSeparator());
+        // column positions used from each line of the government units file
+        String countyCode = values[2];
+        String countyName = values[3];
+        String stateCode = values[5];
+        String stateName = values[6];
+        String countryCode = values[7];
+        String countryName = values[8];
+        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName);
+        outmap.put(stateCode + "." + countyCode, adminBoundary);
+      }
+      // printed inside the try so that a failed read is not announced as a success
+      System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    return outmap;
+  }
+
+  /**
+   * Appends one line per admin boundary to the country context file, in its
+   * standard tab separated format: country code, province code, county code,
+   * country name, province name, county name.
+   *
+   * @param outfile the country context file; it is opened in append mode
+   * @param adms the admin boundaries to write; null values are skipped
+   */
+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+    // try-with-resources closes the writer; no explicit close() is needed
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+      for (AdminBoundary adm : adms.values()) {
+        if (adm == null) {
+          continue;
+        }
+        writer.write(adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t"
+            + adm.getCountryName() + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n");
+      }
+      writer.flush();
+      // printed inside the try so that a failed write is not announced as a success
+      System.out.println("successfully wrote USGS entries to country oontext file");
+    } catch (IOException ex) {
+      // log under USGSProcessor, the class this failure belongs to
+      Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
+    }
+  }
+}
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
similarity index 96%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index 3dbf5d1..cc34b1a 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import java.util.ArrayList;
import java.util.HashMap;
@@ -22,6 +22,7 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
@@ -29,20 +30,20 @@
/**
* Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that typonymn mentions are more likely close to their parent
+ * heuristic that toponym mentions are more likely close to their parent
* country mentions. For instance, if the toponym Berlin is mentioned near an
* indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut.
+ * Connecticut (if Connecticut is mentioned further down in the article).
*
*
*/
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {
private Map<String, Set<String>> nameCodesMap;
String dominantCode = "";
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
similarity index 64%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index 36e2751..9101829 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
@@ -13,43 +13,50 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
+import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.ngram.NGramGenerator;
import opennlp.tools.util.Span;
/**
*
* Generates scores based on string comparisons levenstein and dice
*/
-public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
for (BaseLink link : linkedSpan.getLinkedEntries()) {
- Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
- link.getScoreMap().put("dice", dice);
- Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
- link.getScoreMap().put("levenshtein", ld);
+ if (link instanceof GazetteerEntry) {
+ GazetteerEntry entry = (GazetteerEntry) link;
+ String hierarchy = entry.getHierarchy();
+ if (hierarchy != null) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);
+ link.getScoreMap().put("hierarchydicecoef", dice);
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase());
+ link.getScoreMap().put("hierarchylevenshtein", ld);
+ }
+ }
}
}
-
}
/**
* Generates a score based on an overlap of nGrams between two strings using
* the DiceCoefficient technique.
*
- * @param s1 first string
- * @param s2 second string
+ * @param s1 first string
+ * @param s2 second string
* @param nGrams number of chars in each gram
* @return
*/
@@ -57,8 +64,22 @@
if (s1.equals("") || s1.equals("")) {
return 0d;
}
- List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
- List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
+ List<String> s1Grams = new ArrayList<>();
+ List<String> s2Grams = new ArrayList<>();
+ String[] split1 = s1.split("[ ,]");
+ for (String token : split1) {
+ if (token.trim().equals("")) {
+ continue;
+ }
+ s1Grams.add(token);
+ }
+ String[] split2 = s2.split("[ ,]");
+ for (String token : split2) {
+ if (token.trim().equals("")) {
+ continue;
+ }
+ s2Grams.add(token);
+ }
Set<String> overlap = new HashSet<String>(s1Grams);
overlap.retainAll(s2Grams);
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
similarity index 87%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
index 97a5d07..d3494e0 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.BaseLink;
import opennlp.tools.entitylinker.LinkedSpan;
@@ -29,13 +31,13 @@
* outliers by finding those points that are not near the majority
*
*/
-public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
+public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {
private final PointClustering CLUSTERER = new PointClustering();
private int PRECISION = 3;
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
//Map<Double, Double> latLongs = new HashMap<Double, Double>();
List<GazetteerEntry> allGazEntries = new ArrayList<>();
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
similarity index 90%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
index 5567fa2..f56e8da 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import java.util.List;
import opennlp.tools.entitylinker.EntityLinkerProperties;
@@ -23,6 +23,7 @@
/**
* Structure for scoring linked entities. The Map logically represents a pair :
* "Score type" to the "actual Score."
+ * @param <T> a generic for providing additional context
*/
public interface LinkedEntityScorer<T> {
@@ -32,6 +33,7 @@
* @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
* @param docText the full text of the document.
* @param sentenceSpans the sentence spans the correspond to the document text
+ * @param properties the entitylinker properties config file
* @param additionalContext any additional data required to perform the scoring operation
* @return void
*/
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
similarity index 94%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
index 35b423a..3202f85 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import java.io.File;
import java.io.FileNotFoundException;
@@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.entitylinker.EntityLinkerProperties;
@@ -33,7 +34,7 @@
*
* Utilizes a doccat model to score toponyms based on surrounding context
*/
-public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
DocumentCategorizerME documentCategorizerME;
@@ -42,7 +43,7 @@
boolean modelexists = false;
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
try {
if (doccatModel == null) {
String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
similarity index 78%
rename from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
rename to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
index bf7f701..908df1e 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.io.GeohashUtils;
@@ -22,6 +22,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
/**
*
@@ -114,36 +115,6 @@
return point;
}
- /**
- * Hashes a lat long based on adding 90 or 180 and then interlarding lat lon
- * chars. reduces a set of points to a sortable set
- *
- * @param lat
- * @param lon
- * @return
- */
- public String simpleGeohash(Double lat, Double lon) {
- String geoHash = "";
- lat = lat + 90;
- lon = lon + 180;
- String latString = String.valueOf(lat).replace(".", "");
- String lonString = String.valueOf(lon).replace(".", "");
- int length = latString.length() > lonString.length() ? lonString.length() : latString.length();
- while (length < 12) {
- latString += "0";
- lonString += "0";
- length++;
- }
- latString = latString.substring(0, 10);
- lonString = lonString.substring(0, 10);
- char[] latChars = latString.toCharArray();
- char[] lonChars = lonString.toCharArray();
-
- for (int i = 0; i < latChars.length; i++) {
- geoHash += String.valueOf(latChars[i]) + String.valueOf(lonChars[i]);
- }
- return geoHash;
- }
private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
similarity index 76%
copy from geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
copy to geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index 3dbf5d1..f3199a1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.scoring;
import java.util.ArrayList;
import java.util.HashMap;
@@ -22,29 +22,39 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.LinkedSpan;
import opennlp.tools.util.Span;
/**
- * Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that typonymn mentions are more likely close to their parent
- * country mentions. For instance, if the toponym Berlin is mentioned near an
- * indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut.
+ * Scores toponyms based on their proximity to a province mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * province mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Connecticut, it is more likely to be Berlin Connecticut than
+ * Berlin Germany (if Germany is not mentioned in, or is mentioned further down
+ * in, the article).
*
*
*/
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+public class ProvinceProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {
private Map<String, Set<String>> nameCodesMap;
String dominantCode = "";
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
-
- score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+ if (!additionalContext.getProvHits().isEmpty()) {
+ score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+ } else {
+ for (LinkedSpan<BaseLink> span : linkedSpans) {
+ for (BaseLink link : span.getLinkedEntries()) {
+ link.getScoreMap().put("provincecontext", Double.NaN);
+ }
+ }
+ }
}
@@ -53,20 +63,19 @@
* matches. Currently the scoring indicates the probability that the toponym
* is correct based on the country context in the document
*
- * @param linkedData the linked spans, holds the Namefinder results, and
- * the list of BaseLink for each
- * @param countryHits all the country mentions in the document
- * @param nameCodesMap maps a country indicator name to a country code. Used
- * to determine if the namefinder found the same exact
- * toponym the country context did. If so the score is
- * boosted due to the high probability that the
- * NameFinder actually "rediscovered" a country
- * @param docText the full text of the document...not used in this
- * default implementation
- * @param sentences the sentences that correspond to the doc text.
+ * @param linkedData the linked spans, holds the Namefinder results, and the
+ * list of BaseLink for each
+ * @param countryHits all the province mentions in the document
+ * @param nameCodesMap maps a province indicator name to a province code. Used
+ * to determine if the namefinder found the same exact toponym the province
+ * context did. If so the score is boosted due to the high probability that
+ * the NameFinder actually "rediscovered" a province
+ * @param docText the full text of the document...not used in this default
+ * implementation
+ * @param sentences the sentences that correspond to the doc text.
* @param maxAllowedDist a constant that is used to determine which country
- * mentions, based on proximity within the text, should
- * be used to score the Named Entity.
+ * mentions, based on proximity within the text, should be used to score the
+ * Named Entity.
* @return
*/
public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
@@ -149,34 +158,35 @@
Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
for (BaseLink link : span.getLinkedEntries()) {
//getItemParentId is the country code
- String spanCountryCode = link.getItemParentID();
+ GazetteerEntry entry = (GazetteerEntry)link;
+ String spanCountryCode = entry.getProvinceCode();
if (scoreMap.containsKey(spanCountryCode)) {
score = scoreMap.get(spanCountryCode);
///does the name extracted match a country name?
- if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+ if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {
//if so, is it the correct country code for that name?
- if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
+ if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) {
//boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
//TODO: make this smarter, and utilize province/state info in the future to be even more specific
score = (score + .75) > 1.0 ? 1d : (score + .75);
- if (link.getItemParentID().equals(dominantCode)) {
+ if (entry.getProvinceCode().equals(dominantCode)) {
score = (score + .25) > 1.0 ? 1d : (score + .25);
}
}
}
}
- link.getScoreMap().put("countrycontext", score);
+ link.getScoreMap().put("provincecontext", score);
}
return span;
}
/**
- * takes a map of distances from the toponym to each country mention and generates
- * a map of scores for each country code. The map is then correlated to the
- * code of the BaseLink parentid for retrieval. Then the
- * score is added to the overall list.
+ * Takes a map of distances from the toponym to each province mention and
+ * generates a map of scores for each province code. The map is then correlated
+ * to the province code of each linked entry for retrieval. Then the score is
+ * added to the overall list.
*
* @param distanceMap
* @param sentences
@@ -211,7 +221,6 @@
normalizedDistances.add(reverse);
}
-
List<Double> doubles = new ArrayList<Double>(normalizedDistances);
scoreMap.put(key, slidingDistanceAverage(doubles));
}
@@ -257,8 +266,8 @@
* range. Used to normalize distances in this class.
*
* @param valueToNormalize the value to place within the new range
- * @param minimum the min of the set to be transposed
- * @param maximum the max of the set to be transposed
+ * @param minimum the min of the set to be transposed
+ * @param maximum the max of the set to be transposed
* @return
*/
private Double normalize(int valueToNormalize, int minimum, int maximum) {