OPENNLP-756
Many small changes across a few classes to add regex support for the country context file: entries in the country context file may now be regular expressions. Also fixes a bug in AdminBoundaryContextGenerator, which improves the performance of the ProvinceProximityScorer.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
index c0d2645..556caa1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
@@ -74,14 +74,14 @@
GeoEntityLinker linker = new GeoEntityLinker();
linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));
- countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a shithole. Eastern Africa people are cool.");
-
+ AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool.");
+ System.out.println(c);
} catch (Exception ex) {
java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);
}
}
- public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException{
+ public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException {
this.properties = properties;
if (countrydata == null) {
String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
@@ -155,12 +155,12 @@
for (String cc : countryhitMap.keySet()) {
Map<String, String> provsForCc = provMap.get(cc);
if (provsForCc != null) {
- provMentions = regexfind(text, provsForCc, provHits);
+ provMentions.putAll(regexfind(text, provsForCc, provHits));
if (provMentions != null) {
for (String prov : provMentions.keySet()) {
Map<String, String> get = countyMap.get(prov);
if (get != null) {
- countyMentions = regexfind(text, get, countyHits);
+ countyMentions.putAll(regexfind(text, get, countyHits));
}
}
}
@@ -208,64 +208,6 @@
return null;
}
- /**
- * Finds mentions of countries to assist in toponym resolution. Countries are
- * discovered via regex based on a configured file called
- * opennlp.geoentitylinker.countrycontext.txt. the file is configured using
- * the entitylinker.properties file as such:
- * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
- *
- * Finding mentions in documents is very helpful for scoring. Lazily loads the
- * list from the file.
- *
- * @param docText the full text of the document
- * @return
- */
- @Deprecated
- public Map<String, Set<Integer>> regexfind(String docText) {
- countryMentions = new HashMap<>();
- nameCodesMap.clear();
- try {
-
- for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
- Matcher rs = regex.matcher(docText);
- String code = entry.getCc1().toLowerCase();
-
- boolean found = false;
- while (rs.find()) {
- found = true;
- Integer start = rs.start();
- String hit = rs.group().toLowerCase();
- if (countryMentions.containsKey(code)) {
- countryMentions.get(code).add(start);
- } else {
- Set<Integer> newset = new HashSet<Integer>();
- newset.add(start);
- countryMentions.put(code, newset);
- }
- if (!hit.equals("")) {
- if (this.nameCodesMap.containsKey(hit)) {
- nameCodesMap.get(hit).add(code);
- } else {
- HashSet<String> newset = new HashSet<String>();
- newset.add(code);
- nameCodesMap.put(hit, newset);
- }
- }
- }
- if (found) {
- countryHits.add(entry);
- }
-
- }
-
- } catch (Exception ex) {
- LOGGER.error(ex);
- }
-
- return countryMentions;
- }
/**
* discovers indicators of admin boundary data using regex.
@@ -292,7 +234,7 @@
Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.toLowerCase();
-
+ code = code.trim().replace("", "");
boolean found = false;
while (rs.find()) {
found = true;
@@ -349,7 +291,7 @@
}
if (values.length == 6) {
AdminBoundary entry = new AdminBoundary(
- values[0].toLowerCase().trim(),
+ values[0].toLowerCase().trim().replace("", ""),
values[3].toLowerCase().trim(),
values[1].toLowerCase().trim(),
values[4].toLowerCase().trim(),
@@ -358,7 +300,7 @@
entries.add(entry);
} else {
AdminBoundary entry = new AdminBoundary(
- values[0].toLowerCase().trim(),
+ values[0].toLowerCase().trim().replace("", ""),
values[3].toLowerCase().trim(),
values[1].toLowerCase().trim(),
values[4].toLowerCase().trim(),
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index d7c6dbf..3a5eeb0 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -76,7 +76,7 @@
}
}
- public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
+ public GazetteerSearcher(EntityLinkerProperties properties) throws IOException {
this.properties = properties;
init();
}
@@ -206,7 +206,7 @@
}
- private void init() throws Exception {
+ private void init() throws IOException {
if (opennlpIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index b6a8c1c..4d01718 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -186,7 +186,7 @@
@Override
public void init(EntityLinkerProperties properties) throws IOException {
- try {
+
this.linkerProperties = properties;
countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
@@ -199,9 +199,7 @@
}
topN = rws;
loadScorers();
- } catch (Exception ex) {
- throw new IOException(ex);
- }
+
}
@Override
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
index 1c4a936..90a6e99 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
@@ -44,7 +44,7 @@
i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),
new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),
new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),
- new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),
+ new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20141202.txt.txt"),
new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
new File("C:\\temp\\gazetteers\\"),
new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index cc34b1a..ff4219f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
@@ -168,6 +168,7 @@
}
}
}
+
link.getScoreMap().put("countrycontext", score);
}
return span;
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index afdb4b1..e245a00 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
@@ -156,9 +156,12 @@
* span is referring to the toponym form this code key>
*/
Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+ if (scoreMap.isEmpty()) {
+ return span;
+ }
for (BaseLink link : span.getLinkedEntries()) {
//getItemParentId is the country code
- GazetteerEntry entry = (GazetteerEntry)link;
+ GazetteerEntry entry = (GazetteerEntry) link;
String spanCountryCode = entry.getProvinceCode();
if (scoreMap.containsKey(spanCountryCode)) {
@@ -184,9 +187,9 @@
/**
* takes a map of distances from the toponym to each province mention and
- * generates a map of scores for each province code. The map is then correlated
- * to the code of the BaseLink parentid for retrieval. Then the score is added
- * to the overall list.
+ * generates a map of scores for each province code. The map is then
+ * correlated to the code of the BaseLink parentid for retrieval. Then the
+ * score is added to the overall list.
*
* @param distanceMap
* @param sentences