OPENNLP-626
renamed packages for consistency in addons; also made small efficiency improvements
diff --git a/apache-opennlp-addons/pom.xml b/apache-opennlp-addons/pom.xml
index 062afa4..6fd5059 100644
--- a/apache-opennlp-addons/pom.xml
+++ b/apache-opennlp-addons/pom.xml
@@ -1,28 +1,33 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ <relativePath>../opennlp/pom.xml</relativePath>
+ </parent>
- <groupId>apache-opennlp-addons</groupId>
- <artifactId>apache-opennlp-addons</artifactId>
+ <artifactId>geoentitylinker-addon</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
-<name>Apache OpenNLP Addons</name>
+ <name>geoentitylinker-addon</name>
<url>http://maven.apache.org</url>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <version>2.3.2</version>
- <configuration>
- <source>1.7</source>
- <target>1.7</target>
- </configuration>
- </plugin>
- </plugins>
- </build>
- <properties>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
@@ -33,7 +38,7 @@
<version>3.8.1</version>
<scope>test</scope>
</dependency>
- <dependency>
+ <dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.5.0</version>
@@ -51,7 +56,7 @@
<version>4.5.0</version>
<optional>true</optional>
</dependency>
- <dependency>
+ <dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.6.0-SNAPSHOT</version>
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
new file mode 100644
index 0000000..bc6d787
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ * Finds instances of country mentions in a String, typically a document text.
+ * Used to boost or degrade scoring of linked geo entities
+ *
+ */
+public class CountryContext {
+
+
+ private List<CountryContextEntry> countrydata;
+ private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
+ private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
+ private Set<CountryContextEntry> countryHits = new HashSet<>();
+
+ public CountryContext() {
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ public Set<CountryContextEntry> getCountryHits() {
+ return countryHits;
+ }
+
+ public Map<String, Set<String>> getNameCodesMap() {
+ return nameCodesMap;
+ }
+
+ public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
+ this.nameCodesMap = nameCodesMap;
+ }
+
+ /**
+ * Finds mentions of countries based on a list from MySQL stored procedure
+ * called getCountryList. This method finds country mentions in documents,
+ * which is an essential element of the scoring that is done for geo
+ * linkedspans. Lazily loads the list from the database.
+ *
+ * @param docText the full text of the document
+ * @param properties EntityLinkerProperties for getting database connection
+ * @return
+ */
+ public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
+ countryMentions = new HashMap<>();
+ nameCodesMap.clear();
+ try {
+
+ if (countrydata == null) {
+ countrydata = getCountryContextFromFile(properties);
+ // countrydata = getCountryData(properties);
+ }
+ for (CountryContextEntry entry : countrydata) {
+ Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Matcher rs = regex.matcher(docText);
+ String code = entry.getCc1().toLowerCase();
+
+ boolean found = false;
+ while (rs.find()) {
+ found = true;
+ Integer start = rs.start();
+ String hit = rs.group().toLowerCase();
+ if (countryMentions.containsKey(code)) {
+ countryMentions.get(code).add(start);
+ } else {
+ Set<Integer> newset = new HashSet<Integer>();
+ newset.add(start);
+ countryMentions.put(code, newset);
+ }
+ if (!hit.equals("")) {
+ if (this.nameCodesMap.containsKey(hit)) {
+ nameCodesMap.get(hit).add(code);
+ } else {
+ HashSet<String> newset = new HashSet<String>();
+ newset.add(code);
+ nameCodesMap.put(hit, newset);
+ }
+ }
+ }
+ if (found) {
+ countryHits.add(entry);
+ }
+
+ }
+
+ } catch (Exception ex) {
+ Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
+ }
+
+
+ return countryMentions;
+ }
+
+ private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
+ List<CountryContextEntry> entries = new ArrayList<>();
+ String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
+ BufferedReader reader;
+
+ try {
+ path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+
+ reader = new BufferedReader(new FileReader(path));
+
+ while (reader.read() != -1) {
+ String line = reader.readLine();
+ String[] values = line.split("\t");
+ if (values.length != 4) {
+ throw new IOException("improperly formatted country context file");
+ }
+ CountryContextEntry entry = new CountryContextEntry();
+ // rc,cc1, full_name_nd_ro,dsg
+ entry.setRc(values[0].toLowerCase());
+ entry.setCc1(values[1].toLowerCase());
+ entry.setFull_name_nd_ro(values[2].toLowerCase());
+ entry.setDsg(values[3].toLowerCase());
+ entries.add(entry);
+ }
+ reader.close();
+ } catch (IOException e) {
+ System.err.println(e);
+ }
+ return entries;
+
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
new file mode 100644
index 0000000..61cfcbb
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ *Stores a tuple from mysql that is used to find country mentions in document text.
+ *
+ */
+public class CountryContextEntry {
+ /*
+ * rc,cc1, full_name_nd_ro,dsg
+ */
+
+ private String rc;
+ private String cc1;
+ private String full_name_nd_ro;
+ private String dsg;
+ private String provCode;
+ public CountryContextEntry() {
+ }
+
+ public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {
+ this.rc = rc;
+ this.cc1 = cc1;
+ this.full_name_nd_ro = full_name_nd_ro;
+ this.dsg = dsg;
+ }
+
+ public String getProvCode() {
+ return provCode;
+ }
+
+ public void setProvCode(String provCode) {
+ this.provCode = provCode;
+ }
+
+ public String getRc() {
+ return rc;
+ }
+
+ public void setRc(String rc) {
+ this.rc = rc;
+ }
+
+ public String getCc1() {
+ return cc1;
+ }
+
+ public void setCc1(String cc1) {
+ this.cc1 = cc1;
+ }
+
+ public String getFull_name_nd_ro() {
+ return full_name_nd_ro;
+ }
+
+ public void setFull_name_nd_ro(String full_name_nd_ro) {
+ this.full_name_nd_ro = full_name_nd_ro;
+ }
+
+ public String getDsg() {
+ return dsg;
+ }
+
+ public void setDsg(String dsg) {
+ this.dsg = dsg;
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 7;
+ hash = 17 * hash + Objects.hashCode(this.rc);
+ hash = 17 * hash + Objects.hashCode(this.cc1);
+ hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro);
+ hash = 17 * hash + Objects.hashCode(this.dsg);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final CountryContextEntry other = (CountryContextEntry) obj;
+ if (!Objects.equals(this.rc, other.rc)) {
+ return false;
+ }
+ if (!Objects.equals(this.cc1, other.cc1)) {
+ return false;
+ }
+ if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) {
+ return false;
+ }
+ if (!Objects.equals(this.dsg, other.dsg)) {
+ return false;
+ }
+ return true;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
new file mode 100644
index 0000000..3df2392
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+/**
+ *Stores a "hit" on a country and the start and end of the hit
+
+ */
+public class CountryContextHit {
+
+ private String countryCode;
+ private int start;
+ private int end;
+
+ public CountryContextHit() {
+ }
+
+ public CountryContextHit(String countryCode, int start, int end) {
+ this.countryCode = countryCode;
+ this.start = start;
+ this.end = end;
+ }
+
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public void setCountryCode(String countryCode) {
+ this.countryCode = countryCode;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ public void setStart(int start) {
+ this.start = start;
+ }
+
+ public int getEnd() {
+ return end;
+ }
+
+ public void setEnd(int end) {
+ this.end = end;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
new file mode 100644
index 0000000..48ebccf
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on country context as well as fuzzy string matching
+ */
+public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+
+ private Map<String, Set<String>> nameCodesMap;
+ String dominantCode = "";
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+
+ score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+
+ }
+
+ /**
+ * Assigns a score to each BaseLink in each linkedSpan's set of N best
+ * matches. Currently the scoring indicates the probability that the toponym
+ * is correct based on the country context in the document and fuzzy string
+ * matching
+ *
+ * @param linkedData the linked spans, holds the Namefinder results, and
+ * the list of BaseLink for each
+ * @param countryHits all the country mentions in the document
+ * @param nameCodesMap maps a country indicator name to a country code. Used
+ * to determine if the namefinder found the same exact
+ * toponym the country context did. If so the score is
+ * boosted due to the high probability that the
+ * NameFinder actually "rediscovered" a country
+ * @param docText the full text of the document...not used in this
+ * default implementation
+ * @param sentences the sentences that correspond to the doc text.
+ * @param maxAllowedDist a constant that is used to determine which country
+ * mentions, based on proximity within the text, should
+ * be used to score the Named Entity.
+ * @return
+ */
+ public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
+ this.nameCodesMap = nameCodesMap;
+ setDominantCode(countryHits);
+ for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+
+ linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
+ }
+ return linkedData;
+ }
+
+ /**
+ * sets class level variable to a code based on the number of mentions
+ *
+ * @param countryHits
+ */
+ private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+ int hits = -1;
+ for (String code : countryHits.keySet()) {
+ if (countryHits.get(code).size() > hits) {
+ hits = countryHits.get(code).size();
+ dominantCode = code;
+ }
+ }
+ }
+
+ /**
+ * Generates distances from each country mention to the span's location in the
+ * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
+ * are resolved to the correct country and coordinate.
+ *
+ * @param sentences
+ * @param countryHits
+ * @param span
+ * @return
+ */
+ private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
+ Double score = 0.0;
+ //get the index of the actual span, begining of sentence
+ //should generate tokens from sentence and create a char offset...
+ //could have large sentences due to poor sentence detection or wonky doc text
+ int sentenceIdx = span.getSentenceid();
+ int sentIndexInDoc = sentences[sentenceIdx].getStart();
+ /**
+ * create a map of all the span's proximal country mentions in the document
+ * Map< countrycode, set of <distances from this NamedEntity>>
+ */
+ Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
+ //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
+ for (String cCode : countryHits.keySet()) {
+//iterate over all the regex start values and calculate an offset
+ for (Integer cHit : countryHits.get(cCode)) {
+ Integer absDist = Math.abs(sentIndexInDoc - cHit);
+ //only include near mentions based on a heuristic
+ //TODO make this a property
+ // if (absDist < maxAllowedDistance) {
+ if (distancesFromCodeMap.containsKey(cCode)) {
+ distancesFromCodeMap.get(cCode).add(absDist);
+ } else {
+ HashSet<Integer> newset = new HashSet<Integer>();
+ newset.add(absDist);
+ distancesFromCodeMap.put(cCode, newset);
+ }
+ }
+
+ //}
+ }
+ //we now know how far this named entity is from every country mention in the document
+
+ /**
+ * the gaz matches that have a country code that have mentions in the doc
+ * that are closest to the Named Entity should return the best score.
+ * Analyzemap generates a likelihood score that the toponym from the gaz is
+ * referring to one of the countries, i.e, Map<countrycode, prob that this
+ * span is referring to the toponym form this code key>
+ */
+ Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+ for (BaseLink link : span.getLinkedEntries()) {
+ //getItemParentId is the country code
+ String spanCountryCode = link.getItemParentID();
+ if (scoreMap.containsKey(spanCountryCode)) {
+
+ score = scoreMap.get(spanCountryCode);
+ ///does the name extracted match a country name?
+ if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
+ //if so, is it the correct country code for that name?
+ if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
+ //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
+ //TODO: make this multiplier configurable
+ score = (score + .75) > 1.0 ? 1d : (score + .75);
+
+ if (link.getItemParentID().equals(dominantCode)) {
+ score = (score + .25) > 1.0 ? 1d : (score + .25);
+ }
+ }
+ }
+ }
+ link.getScoreMap().put("countrycontext", score);
+ }
+ return span;
+ }
+
+ /**
+ * takes a map of distances from the NE to each country mention and generates
+ * a map of scores for each country code. The map is then correlated to teh
+ * correlated to the code of the BaseLink parentid for retrieval. Then the
+ * score is added to the overall.
+ *
+ * @param distanceMap
+ * @param sentences
+ * @param span
+ * @return
+ */
+ private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
+
+ Map<String, Double> scoreMap = new HashMap<String, Double>();
+ if (distanceMap.isEmpty()) {
+ return scoreMap;
+ }
+ TreeSet<Integer> all = new TreeSet<Integer>();
+ for (String key : distanceMap.keySet()) {
+ all.addAll(distanceMap.get(key));
+ }
+ //get min max for normalization, this could be more efficient
+
+ Integer min = all.first();
+ Integer max = all.last();
+ if (min == max) {
+ min = 0;
+ }
+ for (String key : distanceMap.keySet()) {
+
+ TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+ for (Integer i : distanceMap.get(key)) {
+ Double norm = normalize(i, min, max);
+ //reverse the normed distance so low numbers (closer) are better
+ //this could be improved with a "decaying " function using an imcreaseing negative exponent
+ Double reverse = Math.abs(norm - 1);
+ normalizedDistances.add(reverse);
+ }
+
+
+ List<Double> doubles = new ArrayList<Double>(normalizedDistances);
+ scoreMap.put(key, slidingDistanceAverage(doubles));
+ }
+ return scoreMap;
+ }
+
+ /**
+ * this method is an attempt to make closer clusters of mentions group
+ * together to smooth out the average, so one distant outlier does not kill
+ * the score for an obviously good hit. More elegant solution is possible
+ * using Math.pow, and making the score decay with distance by using an
+ * increasing negative exponent
+ *
+ * @param normDis the normalized and sorted set of distances as a list
+ * @return
+ */
+ private Double slidingDistanceAverage(List<Double> normDis) {
+ List<Double> windowOfAverages = new ArrayList<Double>();
+
+ if (normDis.size() < 3) {
+ windowOfAverages.addAll(normDis);
+ } else {
+
+ for (int i = 0; i < normDis.size() - 1; i++) {
+ double a = normDis.get(i);
+ double b = normDis.get(i + 1);
+ windowOfAverages.add((a + b) / 2);
+
+ }
+ }
+ double sum = 0d;
+ for (double d : windowOfAverages) {
+ sum += d;
+ }
+ double result = sum / windowOfAverages.size();
+ //TODO: ++ prob when large amounts of mentions for a code
+ //System.out.println("avg of window:" + result);
+ return result;
+ }
+
+ /**
+ * transposes a value within one range to a relative value in a different
+ * range. Used to normalize distances in this class.
+ *
+ * @param valueToNormalize the value to place within the new range
+ * @param minimum the min of the set to be transposed
+ * @param maximum the max of the set to be transposed
+ * @return
+ */
+ private Double normalize(int valueToNormalize, int minimum, int maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
new file mode 100644
index 0000000..29cf58b
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.ngram.NGramGenerator;
+import opennlp.tools.util.Span;
+
+/**
+ *
+ * Generates scores for string comparisons.
+ */
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
+ for (BaseLink link : linkedSpan.getLinkedEntries()) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
+ link.getScoreMap().put("dice", dice);
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
+ link.getScoreMap().put("levenshtein", ld);
+ }
+ }
+
+
+ }
+
+ /**
+ * Generates a score based on an overlap of nGrams between two strings using
+ * the DiceCoefficient technique.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param nGrams number of chars in each gram
+ * @return
+ */
+ public double getDiceCoefficient(String s1, String s2, int nGrams) {
+ if (s1.equals("") || s1.equals("")) {
+ return 0d;
+ }
+ List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
+ List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
+
+ Set<String> overlap = new HashSet<String>(s1Grams);
+ overlap.retainAll(s2Grams);
+ double totcombigrams = overlap.size();
+
+ return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
+ }
+
+ private int minimum(int a, int b, int c) {
+ return Math.min(Math.min(a, b), c);
+ }
+
+ public int getLevenshteinDistance(CharSequence str1,
+ CharSequence str2) {
+ int[][] distance = new int[str1.length() + 1][str2.length() + 1];
+
+ for (int i = 0; i <= str1.length(); i++) {
+ distance[i][0] = i;
+ }
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[0][j] = j;
+ }
+
+ for (int i = 1; i <= str1.length(); i++) {
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[i][j] = minimum(
+ distance[i - 1][j] + 1,
+ distance[i][j - 1] + 1,
+ distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
+ }
+ }
+
+ return distance[str1.length()][str2.length()];
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
new file mode 100644
index 0000000..f375dcf
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.tools.entitylinker.domain.BaseLink;
+
+/**
+ *
+ * Stores a record from a geographic placenames gazateer
+ */
+public class GazateerEntry extends BaseLink {
+
+ private Double latitude;
+ private Double longitude;
+ private String source;
+ private String indexID;
+ private Map<String, String> indexData=new HashMap<>();
+
+ public String getIndexID() {
+ return indexID;
+ }
+
+ public void setIndexID(String indexID) {
+ this.indexID = indexID;
+ }
+
+ public Double getLatitude() {
+ return latitude;
+ }
+
+ public void setLatitude(Double latitude) {
+ this.latitude = latitude;
+ }
+
+ public Double getLongitude() {
+ return longitude;
+ }
+
+ public void setLongitude(Double longitude) {
+ this.longitude = longitude;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public Map<String, String> getIndexData() {
+ return indexData;
+ }
+
+ public void setIndexData(Map<String, String> indexData) {
+ this.indexData = indexData;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
new file mode 100644
index 0000000..5e72a9f
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.th.ThaiAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ *
+ * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
+ */
+public class GazateerIndexer {
+
+ public GazateerIndexer() {
+ loadAnalyzerMap();
+ }
+ Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
+
+ public static interface Separable {
+
+ String getSeparator();
+ }
+
+ public enum GazType implements Separable {
+
+ GEONAMES {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_geonames_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\t";
+ }
+ },
+ USGS {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_usgsgaz_idx";
+ }
+
+ @Override
+ public String getSeparator() {
+ return "\\|";
+ }
+ }
+ }
+
+ public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+ if (!outputIndexDir.isDirectory()) {
+ throw new IllegalArgumentException("outputIndexDir must be a directory.");
+ }
+
+ String indexloc = outputIndexDir + type.toString();
+ Directory index = new MMapDirectory(new File(indexloc));
+
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
+
+ IndexWriter w = new IndexWriter(index, config);
+
+ readFile(gazateerInputData, w, type);
+ w.commit();
+ w.close();
+
+ }
+
+ public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<String>();
+ int counter = 0;
+ int langCodeIndex = 0;
+ System.out.println("reading gazateer data from file...........");
+ while (reader.read() != -1) {
+ String line = reader.readLine();
+ String[] values = line.split(type.getSeparator());
+ if (counter == 0) {
+ // build fields
+ for (int i = 0; i < values.length; i++) {
+ String columnName = values[i];
+ fields.add(columnName.replace("»¿", "").trim());
+ if (columnName.toLowerCase().equals("lc")) {
+ langCodeIndex = i;
+ }
+ }
+
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
+ }
+ if (type == GazType.GEONAMES) {
+ /**
+ * see if the map contains a language specific analyzer
+ */
+ if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
+ /*
+ * if so retrieve it from the map
+ */
+ Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
+ /**
+ * index the doc using the specified analyzer
+ */
+ w.addDocument(doc, analyzer);
+ } else {
+ w.addDocument(doc);
+ }
+ } else {
+ w.addDocument(doc);
+ }
+ }
+ counter++;
+ if (counter % 10000 == 0) {
+ w.commit();
+ System.out.println(counter + " .........committed to index..............");
+ }
+
+ }
+ w.commit();
+ System.out.println("Completed indexing gaz! index name is: " + type.toString());
+ }
+/**
+ * TODO: make these analyzers configurable
+ */
+ private void loadAnalyzerMap() {
+ languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
+
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
new file mode 100644
index 0000000..437227e
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ *
+ * Caches gazateer query results statically
+ */
+public class GazateerSearchCache {
+
+ private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
+
+
+ public static synchronized ArrayList<GazateerEntry> get(String searchString) {
+ return gazCache.get(searchString);
+ }
+
+ public static synchronized void put(String searchString, ArrayList<GazateerEntry> hits) {
+ if (gazCache.size() > 10000) {
+ gazCache.clear();
+ }
+ if (!gazCache.containsKey(searchString)) {
+ gazCache.put(searchString, hits);
+ }
+ }
+
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
new file mode 100644
index 0000000..ed220e8
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queryparser.classic.ParseException;
+
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ *
+ * Searches Gazateers stored in a MMapDirectory lucene index
+ */
+public class GazateerSearcher {
+
+ private double scoreCutoff = .75;
+ private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer geonamesAnalyzer;
+ //usgs US gazateer
+ private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer usgsAnalyzer;
+
+ public GazateerSearcher() {
+ }
+
+ /**
+ *
+ * @param searchString the named entity to look up in the lucene index
+ * @param rowsReturned how many rows to allow lucene to return
+ * @param code the country code
+ * @param properties properties file that states where the lucene indexes
+ * are
+ * @return
+ */
+ public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+ /**
+ * build the search string
+ */
+ String luceneQueryString = !code.equals("")
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
+ : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
+ /**
+ * check the cache and go no further if the records already exist
+ */
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ if (get != null) {
+ return get;
+ }
+ if (geonamesIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
+ scoreCutoff = Double.valueOf(cutoff);
+ geonamesIndex = new MMapDirectory(new File(indexloc));
+ geonamesReader = DirectoryReader.open(geonamesIndex);
+ geonamesSearcher = new IndexSearcher(geonamesReader);
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+
+ }
+
+
+
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = geonamesSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+
+ entry.getScoreMap().put("lucene", sc);
+
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("geonames");
+
+ Document d = geonamesSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 1:
+ entry.setItemID(value);
+ break;
+ case 3:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 4:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setItemType(value);
+ break;
+ case 12:
+ entry.setItemParentID(value);
+ break;
+ case 23:
+ entry.setItemName(value);
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ //only keep it if the country code is a match
+ if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
+ linkedData.add(entry);
+ }
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(searchString, linkedData);
+ return linkedData;
+ }
+
+ /**
+ * Looks up the name in the USGS gazateer, after checking the cache
+ *
+ * @param searchString the named entity to look up in the lucene index
+ * @param rowsReturned how many rows to allow lucene to return
+ *
+ * @param properties properties file that states where the lucene indexes
+ * @return
+ */
+ public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+
+ String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
+ /**
+ * hit the cache
+ */
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ if (get != null) {
+ //if the name is already there, return the list of cached results
+ return get;
+ }
+ if (usgsIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ scoreCutoff = Double.valueOf(cutoff);
+ usgsIndex = new MMapDirectory(new File(indexloc));
+ usgsReader = DirectoryReader.open(usgsIndex);
+ usgsSearcher = new IndexSearcher(usgsReader);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ }
+
+
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = usgsSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+ //keep track of the min score for normalization
+
+ entry.getScoreMap().put("lucene", sc);
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("usgs");
+ entry.setItemParentID("us");
+
+
+ Document d = usgsSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 0:
+ entry.setItemID(value);
+ break;
+ case 1:
+ entry.setItemName(value);
+ break;
+ case 2:
+ entry.setItemType(value);
+ break;
+ case 9:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ linkedData.add(entry);
+
+
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+ /**
+ * add the records to the cache for this query
+ */
+ GazateerSearchCache.put(searchString, linkedData);
+ return linkedData;
+ }
+
+ private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
+ for (GazateerEntry gazateerEntry : linkedData) {
+
+ double luceneScore = gazateerEntry.getScoreMap().get("lucene");
+ luceneScore = normalize(luceneScore, minScore, maxScore);
+ luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
+ luceneScore = Double.isNaN(luceneScore) ? 0.001 : luceneScore;
+ gazateerEntry.getScoreMap().put("lucene", luceneScore);
+ }
+ }
+
+ private void prune(ArrayList<GazateerEntry> linkedData) {
+ for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
+ GazateerEntry ge = itr.next();
+ if (ge.getScoreMap().get("lucene") < scoreCutoff) {
+ itr.remove();
+ }
+ }
+ }
+
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
new file mode 100644
index 0000000..3dc8c81
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.EntityLinker;
+
+/**
+ * Links location entities to gazateers. Currently supports gazateers stored in a
+ * Lucene index (GeoNames/NGA and USGS)
+ *
+ *
+ */
+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+
+ private CountryContext countryContext;
+ private Map<String, Set<Integer>> countryMentions;
+ private EntityLinkerProperties linkerProperties;
+ private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+ private List<LinkedEntityScorer> scorers = new ArrayList<>();
+ /**
+ * Flag for deciding whether to search gaz only for toponyms within countries
+ * that are mentioned in the document
+ */
+ private Boolean filterCountryContext = true;
+
+ public GeoEntityLinker() {
+ countryContext = new CountryContext();
+ }
+
+ @Override
+ public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
+ ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
+
+ if (linkerProperties == null) {
+ throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
+ }
+ countryMentions = countryContext.regexfind(doctext, linkerProperties);
+
+ for (int s = 0; s < sentences.length; s++) {
+ Span[] names = namesBySentence[s];
+ String[] tokens = tokensBySentence[s];
+ String[] matches = Span.spansToStrings(names, tokens);
+
+ for (int i = 0; i < matches.length; i++) {
+
+//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
+ ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+ if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
+ // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
+ if (!countryMentions.keySet().isEmpty()) {
+ for (String code : countryMentions.keySet()) {
+ if (!code.equals("us")) {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+ }
+ }
+ } else {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+
+ }
+
+ }
+ ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+ if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
+ //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
+ usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
+ }
+ LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+ if (!usgsEntries.isEmpty()) {
+ geoSpan.getLinkedEntries().addAll(usgsEntries);
+ geoSpan.setSearchTerm(matches[i]);
+ }
+
+ if (!geoSpan.getLinkedEntries().isEmpty()) {
+ geoSpan.setSearchTerm(matches[i]);
+ geoSpan.setSentenceid(s);
+ spans.add(geoSpan);
+ }
+ }
+ }
+
+ if (scorers.isEmpty()) {
+ scorers.add(new FuzzyStringMatchScorer());
+ scorers.add(new GeoHashBinningScorer());
+ scorers.add(new CountryProximityScorer());
+ scorers.add(new ModelBasedScorer());
+ }
+ for (LinkedEntityScorer scorer : scorers) {
+ scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+ }
+ return spans;
+ }
+
+ @Override
+ public void setEntityLinkerProperties(EntityLinkerProperties properties) {
+ this.linkerProperties = properties;
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported");
+ }
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
new file mode 100644
index 0000000..00c3f9e
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;
+
+
+/**
+ *
+ * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
+ */
+public class GeoEntityLinkerSetupUtils {
+ public static ModelBasedScorer scorer;
+
+ static {
+ scorer = new ModelBasedScorer();
+ }
+ public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
+ GazateerIndexer indexer = new GazateerIndexer();
+ try {
+ indexer.index(outputIndexDir, gazateerInputData, type);
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+ /**
+ *
+ * @param documents A list of document texts, for best results try to
+ * ensure each country you care about will be
+ * represented in the collection
+ * @param annotationOutFile the location where the annotated doccat text file
+ * will be stored
+ * @param modelOutFile the location where the doccat model will be stored
+ * @param properties the properties where the country context object
+ * will find its country data from this property:
+ * opennlp.geoentitylinker.countrycontext.filepath
+ * @throws IOException
+ */
+ public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+ CountryContext context = new CountryContext();
+ FileWriter writer = new FileWriter(annotationOutFile, true);
+ System.out.println("processing " + documents.size() + " documents");
+ for (String docText : documents) {
+ System.out.append(".");
+ Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+ Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+ for (String key : modelCountryContext.keySet()) {
+ for (String wordbag : modelCountryContext.get(key)) {
+ writer.write(key + " " + wordbag + "\n");
+ }
+ }
+ }
+ System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
+ writer.close();
+ System.out.println("Building Doccat model...");
+ DoccatModel model = null;
+
+ InputStream dataIn = new FileInputStream(annotationOutFile);
+ try {
+
+ ObjectStream<String> lineStream =
+ new PlainTextByLineStream(dataIn, "UTF-8");
+ ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+
+ model = DocumentCategorizerME.train("en", sampleStream);
+ OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
+ model.serialize(modelOut);
+ System.out.println("Model complete!");
+ } catch (IOException e) {
+ // Failed to read or parse training data, training failed
+ e.printStackTrace();
+ }
+
+ }
+
+ /**
+ * generates proximal wordbags within the radius of a country mention within
+ * the doctext based on the country context object
+ *
+ *
+ * @param docText
+ * @param additionalContext
+ * @param radius
+ * @return
+ */
+ private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ Map<String, ArrayList< String>> featureBags = new HashMap<>();
+ Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (String code : countryMentions.keySet()) {
+ /**
+ * for each mention, collect features from around each mention, then
+ * consolidate the features into another map
+ */
+ for (int mentionIdx : countryMentions.get(code)) {
+ String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+ // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
+ if (featureBags.containsKey(code)) {
+ featureBags.get(code).add(chunk);
+ } else {
+ ArrayList<String> newlist = new ArrayList<>();
+ newlist.add(chunk);
+ featureBags.put(code, newlist);
+ }
+ }
+ }
+ return featureBags;
+ }
+
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
new file mode 100644
index 0000000..4d7467f
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on geographic point binning (clustering). This class's output is highly dependent on the quality
+ * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
+ * Lucene search is set to an appropriate level so this class is not fed poor data.
+ */
+public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
+ score( linkedSpans);
+ }
+
+ private void score(List<LinkedSpan> geospans) {
+ Map<Double, Double> latLongs = new HashMap<Double, Double>();
+
+ /**
+ * collect all the lat longs
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ latLongs.put(entry.getLatitude(), entry.getLongitude());
+
+ }
+ }
+ }
+
+ /**
+ * convert to geohash and add to sortedset
+ */
+ TreeSet<Long> geoHashes = new TreeSet<Long>();
+ for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {
+ geoHashes.add(geoHash(entry.getKey(), entry.getValue()));
+ }
+ /**
+ * bin the points and generate a scoremap
+ */
+ Map<Long, Set<Long>> bins = bin(geoHashes);
+ Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);
+ /**
+ * iterate over the data again and assign the score based on the bins
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ Long geohash = -1L;
+ Double score = 0d;
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ geohash = geoHash(entry.getLatitude(), entry.getLongitude());
+
+ }
+ if (scores.containsKey(geohash)) {
+ score = scores.get(geohash);
+
+ } else {
+ for (Long bin : bins.keySet()) {
+ if (bin.equals(geohash) || bins.get(bin).contains(geohash)) {
+ score = scores.get(bin);
+ break;
+ }
+ }
+ }
+ bl.getScoreMap().put("geohashbin", score);
+ }
+ }
+
+
+ }
+
+ private Long normalize(Double coordpart, Boolean isLat) {
+ Integer add = isLat ? 90 : 180;
+ coordpart = Math.abs(coordpart + add);
+ coordpart = coordpart * 1000000;
+
+ Long l = Math.round(coordpart);
+ String coord = String.valueOf(l);
+ if (coord.length() < 8) {
+ while (coord.length() < 8) {
+ coord += "0";
+ }
+ }
+ coord = coord.substring(0, 8);
+ l = Long.valueOf(coord);
+ return l;
+ }
+
+ /**
+ * interleaves a lat and a long to place the coordinate in linear sortable
+ * space for binning simplicity
+ *
+ * @param lat
+ * @param lon
+ * @return
+ */
+ private Long geoHash(double lat, double lon) {
+ Long normLat = normalize(lat, Boolean.TRUE);
+ Long normLon = normalize(lon, Boolean.FALSE);
+ String sLat = String.valueOf(normLat);
+ String sLon = String.valueOf(normLon);
+ char[] latInts = sLat.toCharArray();
+ char[] lonInts = sLon.toCharArray();
+ String geoHash = "";
+ int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;
+ for (int i = 0; i < len - 1; i++) {
+ String a = String.valueOf(latInts[i]);
+ String b = String.valueOf(lonInts[i]);
+ geoHash += a + b;
+ }
+
+ return Long.valueOf(geoHash);
+ }
+
+ private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {
+ ArrayList<Long> list = new ArrayList<Long>(sets);
+ ArrayList<Long> diffs = new ArrayList<Long>();
+ /**
+ * create a set of differences between the points
+ */
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ diffs.add(Math.abs(n - v));
+ }
+ /**
+ * generate an average "distance" between the normed points
+ */
+ Long sum = 0L;
+ for (Long l : diffs) {
+ sum += l;
+ }
+ Long avg=sum;
+ if(!diffs.isEmpty()){
+ avg = sum / diffs.size();
+ }
+
+
+ /**
+ * generate break values where the disparity is greater than the average
+ */
+ TreeSet<Long> breaks = new TreeSet<Long>();
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ //Long percent = 100 - (v / n * 100);
+ Long diff = n - v;
+ if (diff > avg) {
+ breaks.add(v);
+ }
+ }
+ /**
+ * based on the break values, place subsets of close points into bins
+ */
+ TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();
+ Long lastBreak = -1L;
+ for (Long br : breaks) {
+ if (lastBreak == -1L) {
+ binToAmount.put(br, sets.subSet(0L, true, br, true));
+ } else {
+ binToAmount.put(br, sets.subSet(lastBreak, false, br, true));
+ }
+ lastBreak = br;
+ }
+ lastBreak = sets.higher(lastBreak);
+ if (lastBreak != null) {
+ binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));
+ if (binToAmount.get(lastBreak).isEmpty()) {
+ binToAmount.get(lastBreak).add(lastBreak);
+ }
+ }
+ /**
+ * "binToAmount" is a map of the break value to all the points behind it
+ * (it's sorted), so the key is the max value of its set of values
+ */
+ return binToAmount;
+ }
+
+ /**
+ * returns a map of geohashes and their score
+ *
+ * @param binToAmount
+ * @return Map< Geohash, score>
+ */
+ private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {
+ TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();
+ TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();
+ /**
+ * if there is only one bin return 1 as the rank for each item in the value
+ */
+ if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {
+ for (Long bin : binToAmount.keySet()) {
+ for (Long hash : binToAmount.get(bin)) {
+ ranks.put(bin, 1d);
+ }
+ }
+ return ranks;
+ }
+ int total = 0;
+ /**
+ * generate a total number of points
+ */
+ for (Set<Long> geohashes : binToAmount.values()) {
+ total += geohashes.size();
+ }
+ /**
+ * divide total by bin size, largest bin size gets best score, everything in
+ * that bin gets that score because it is part of that primary cluster
+ * TODO... do an extra iteration of clustering within the predominant
+ * cluster to refine the scoring or make the basis of the binning more
+ * granular than > avg
+ */
+ TreeSet<Double> rankSet = new TreeSet<Double>();
+ for (Long key : binToAmount.keySet()) {
+ int size = binToAmount.get(key).size();
+ Double rank = (double) total / size;
+ rankSet.add(rank);
+ ranks.put(key, rank);
+ }
+ /**
+ * load the final score map with normalized values
+ */
+ for (Map.Entry<Long, Double> rank : ranks.entrySet()) {
+ double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);
+ double reverse = Math.abs(norm - 1);
+ double score = reverse > 1d ? 1.0 : reverse;
+ normRanks.put(rank.getKey(), score);
+ }
+
+ return normRanks;
+ }
+
+ /**
+ * transposes a number in a range to a double between 0 and 1
+ *
+ * @param valueToNormalize the value to be normalized (placed within a new
+ * range of 0-1)
+ * @param minimum the min of the current range
+ * @param maximum the max of the current range
+ * @return
+ */
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
+
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
new file mode 100644
index 0000000..1acca46
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Structure for scoring linked entities. The Map logically represents a pair :
+ * "Score type" to the "actual Score."
+ */
+public interface LinkedEntityScorer<T> {
+
+/**
+ * Scores a collection of linked entities. Implementations should populate the
+ * scoreMap in the list of BaseLink for each LinkedSpan; this method returns
+ * nothing and communicates its results solely through that map.
+ *
+ * @param linkedSpans the spans that have been linked to some external source
+ *          and have all the data they need to be scored
+ * @param docText the full text of the document
+ * @param sentenceSpans the sentence spans that correspond to the document text
+ * @param properties the EntityLinkerProperties used to configure the scorer
+ * @param additionalContext any additional data required to perform the scoring
+ *          operation
+ */
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
+}
diff --git a/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
new file mode 100644
index 0000000..d370ec8
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ *
+ * Utilizes a doccat model to score toponyms based on surrounding context
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+
+  // lazily initialized on the first call to score(); the model path comes
+  // from the EntityLinkerProperties passed to score()
+  DocumentCategorizerME documentCategorizerME;
+  DoccatModel doccatModel;
+  /** Number of characters on each side of a mention used as doccat context. */
+  public static final int RADIUS = 100;
+
+  /**
+   * Scores each linked entry by categorizing the text surrounding its mention
+   * against a doccat model whose categories are expected to be country codes.
+   * The resulting probability is stored in each BaseLink's score map under
+   * the key "countrymodel". Returns silently (scoring nothing) if the
+   * "opennlp.geoentitylinker.modelbasedscorer.modelpath" property is unset.
+   *
+   * @param linkedSpans the spans that have been linked and are to be scored
+   * @param docText the full text of the document
+   * @param sentenceSpans the sentence spans that correspond to the document text
+   * @param properties supplies the doccat model path property
+   * @param additionalContext unused by this scorer
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+    try {
+      if (doccatModel == null) {
+        String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+        if (path.equals("")) {
+          // no model configured: scoring is optional, so do nothing
+          return;
+        }
+        doccatModel = new DoccatModel(new File(path));
+        documentCategorizerME = new DocumentCategorizerME(doccatModel);
+      }
+      Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+      for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+        Map<String, Double> scores = this.getScore(entry.getValue());
+        for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+          // default to 0 when the model has no category for this parent id
+          double score = 0d;
+          if (scores.containsKey(link.getItemParentID())) {
+            score = scores.get(link.getItemParentID());
+          }
+          link.getScoreMap().put("countrymodel", score);
+        }
+      }
+
+    } catch (FileNotFoundException ex) {
+      System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
+    } catch (IOException ex) {
+      System.err.println(ex);
+    } catch (Exception ex) {
+      System.err.println(ex);
+    }
+  }
+
+  /**
+   * generates features using a BagOfWordsfeatureGenerator that are within the
+   * radius of a mention within the doctext
+   *
+   * @param linkedSpans the spans that have been linked to an external source
+   * @param sentenceSpans the sentence spans that correspond to the document text
+   * @param docText the full text of the document
+   * @param radius the number of characters on each side of the mention to collect
+   * @return a map of the index of the linked span to the string of surrounding
+   * text: Map&lt;indexofspan,surrounding text&gt;
+   */
+  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+    Map<Integer, String> featureBags = new HashMap<>();
+    Map<Integer, Integer> nameMentionMap = new HashMap<>();
+    /**
+     * collect a representative character offset for each span that actually
+     * linked to something
+     */
+    for (int i = 0; i < linkedSpans.size(); i++) {
+      LinkedSpan span = linkedSpans.get(i);
+      if (span.getLinkedEntries().isEmpty()) {
+        //don't care about spans that did not get linked to anything at all; nothing to work with
+        continue;
+      }
+      /**
+       * get the sentence the name span was found in, the beginning of the
+       * sentence will suffice as a centroid for feature generation around the
+       * named entity
+       */
+      Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+      nameMentionMap.put(i, mentionIdx);
+    }
+    /**
+     * now associate each span to a string that will be used for categorization
+     * against the model.
+     */
+    for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+      featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+    }
+
+    return featureBags;
+  }
+
+  /**
+   * Extracts up to radius characters on each side of mentionIdx from docText,
+   * clipped to the document bounds and trimmed to whole words.
+   *
+   * @param mentionIdx the character offset the chunk is centered on
+   * @param docText the full text of the document
+   * @param radius the number of characters to take on each side of mentionIdx
+   * @return the surrounding text, or an empty string for a degenerate window
+   */
+  public String getTextChunk(int mentionIdx, String docText, int radius) {
+    int docSize = docText.length();
+    int left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+    int right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+    if (right <= left) {
+      return "";
+    }
+    /**
+     * don't want to chop any words in half, so take from the first space to
+     * the last space in the chunk string
+     */
+    String chunk = docText.substring(left, right);
+    int start = 0;
+    if (left != 0) {
+      start = chunk.indexOf(" ");
+      if (start < 0) {
+        // no space in the chunk: substring(-1, ...) would throw; keep whole chunk
+        start = 0;
+      }
+    }
+    int end = chunk.lastIndexOf(" ");
+    /**
+     * now get the substring again with only whole words
+     */
+    if (start < end) {
+      chunk = chunk.substring(start, end);
+    }
+
+    return chunk;
+  }
+
+  /**
+   * Categorizes the text and returns the probability of each category.
+   *
+   * @param text the context text to categorize
+   * @return a map of category name (country code) to its probability
+   * @throws Exception propagated from the underlying categorizer
+   */
+  private Map<String, Double> getScore(String text) throws Exception {
+    Map<String, Double> scoreMap = new HashMap<>();
+    double[] categorize = documentCategorizerME.categorize(text);
+    int catSize = documentCategorizerME.getNumberOfCategories();
+    for (int i = 0; i < catSize; i++) {
+      String category = documentCategorizerME.getCategory(i);
+      scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+    }
+    return scoreMap;
+  }
+
+}