OPENNLP-626
renamed packages for consistency in addons; also made small efficiency improvements
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
deleted file mode 100644
index 05fe749..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-
-/**
- * Finds instances of country mentions in a String, typically a document text.
- * Used to boost or degrade scoring of linked geo entities
- *
- */
-public class CountryContext {
-
-
- private List<CountryContextEntry> countrydata;
- private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
- private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
- private Set<CountryContextEntry> countryHits = new HashSet<>();
-
- public CountryContext() {
- }
-
- public Map<String, Set<Integer>> getCountryMentions() {
- return countryMentions;
- }
-
- public Set<CountryContextEntry> getCountryHits() {
- return countryHits;
- }
-
- public Map<String, Set<String>> getNameCodesMap() {
- return nameCodesMap;
- }
-
- public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
- this.nameCodesMap = nameCodesMap;
- }
-
- /**
- * Finds mentions of countries based on a list from MySQL stored procedure
- * called getCountryList. This method finds country mentions in documents,
- * which is an essential element of the scoring that is done for geo
- * linkedspans. Lazily loads the list from the database.
- *
- * @param docText the full text of the document
- * @param properties EntityLinkerProperties for getting database connection
- * @return
- */
- public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
- countryMentions = new HashMap<>();
- nameCodesMap.clear();
- try {
-
- if (countrydata == null) {
- countrydata = getCountryContextFromFile(properties);
- // countrydata = getCountryData(properties);
- }
- for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
- Matcher rs = regex.matcher(docText);
- String code = entry.getCc1().toLowerCase();
-
- boolean found = false;
- while (rs.find()) {
- found = true;
- Integer start = rs.start();
- String hit = rs.group().toLowerCase();
- if (countryMentions.containsKey(code)) {
- countryMentions.get(code).add(start);
- } else {
- Set<Integer> newset = new HashSet<Integer>();
- newset.add(start);
- countryMentions.put(code, newset);
- }
- if (!hit.equals("")) {
- if (this.nameCodesMap.containsKey(hit)) {
- nameCodesMap.get(hit).add(code);
- } else {
- HashSet<String> newset = new HashSet<String>();
- newset.add(code);
- nameCodesMap.put(hit, newset);
- }
- }
- }
- if (found) {
- countryHits.add(entry);
- }
-
- }
-
- } catch (Exception ex) {
- Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
- }
-
-
- return countryMentions;
- }
-
- private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
- List<CountryContextEntry> entries = new ArrayList<>();
- String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
- BufferedReader reader;
-
- try {
- path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
- reader = new BufferedReader(new FileReader(path));
-
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split("\t");
- if (values.length != 4) {
- throw new IOException("improperly formatted country context file");
- }
- CountryContextEntry entry = new CountryContextEntry();
- // rc,cc1, full_name_nd_ro,dsg
- entry.setRc(values[0].toLowerCase());
- entry.setCc1(values[1].toLowerCase());
- entry.setFull_name_nd_ro(values[2].toLowerCase());
- entry.setDsg(values[3].toLowerCase());
- entries.add(entry);
- }
- reader.close();
- } catch (IOException e) {
- System.err.println(e);
- }
- return entries;
-
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
deleted file mode 100644
index a32642b..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.Objects;
-
-/**
- *Stores a tuple from mysql that is used to find country mentions in document text.
- *
- */
-public class CountryContextEntry {
- /*
- * rc,cc1, full_name_nd_ro,dsg
- */
-
- private String rc;
- private String cc1;
- private String full_name_nd_ro;
- private String dsg;
- private String provCode;
- public CountryContextEntry() {
- }
-
- public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {
- this.rc = rc;
- this.cc1 = cc1;
- this.full_name_nd_ro = full_name_nd_ro;
- this.dsg = dsg;
- }
-
- public String getProvCode() {
- return provCode;
- }
-
- public void setProvCode(String provCode) {
- this.provCode = provCode;
- }
-
- public String getRc() {
- return rc;
- }
-
- public void setRc(String rc) {
- this.rc = rc;
- }
-
- public String getCc1() {
- return cc1;
- }
-
- public void setCc1(String cc1) {
- this.cc1 = cc1;
- }
-
- public String getFull_name_nd_ro() {
- return full_name_nd_ro;
- }
-
- public void setFull_name_nd_ro(String full_name_nd_ro) {
- this.full_name_nd_ro = full_name_nd_ro;
- }
-
- public String getDsg() {
- return dsg;
- }
-
- public void setDsg(String dsg) {
- this.dsg = dsg;
- }
-
- @Override
- public int hashCode() {
- int hash = 7;
- hash = 17 * hash + Objects.hashCode(this.rc);
- hash = 17 * hash + Objects.hashCode(this.cc1);
- hash = 17 * hash + Objects.hashCode(this.full_name_nd_ro);
- hash = 17 * hash + Objects.hashCode(this.dsg);
- return hash;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null) {
- return false;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- final CountryContextEntry other = (CountryContextEntry) obj;
- if (!Objects.equals(this.rc, other.rc)) {
- return false;
- }
- if (!Objects.equals(this.cc1, other.cc1)) {
- return false;
- }
- if (!Objects.equals(this.full_name_nd_ro, other.full_name_nd_ro)) {
- return false;
- }
- if (!Objects.equals(this.dsg, other.dsg)) {
- return false;
- }
- return true;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
deleted file mode 100644
index 694cec6..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-/**
- *Stores a "hit" on a country and the start and end of the hit
-
- */
-public class CountryContextHit {
-
- private String countryCode;
- private int start;
- private int end;
-
- public CountryContextHit() {
- }
-
- public CountryContextHit(String countryCode, int start, int end) {
- this.countryCode = countryCode;
- this.start = start;
- this.end = end;
- }
-
- public String getCountryCode() {
- return countryCode;
- }
-
- public void setCountryCode(String countryCode) {
- this.countryCode = countryCode;
- }
-
- public int getStart() {
- return start;
- }
-
- public void setStart(int start) {
- this.start = start;
- }
-
- public int getEnd() {
- return end;
- }
-
- public void setEnd(int end) {
- this.end = end;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
deleted file mode 100644
index 36bfb86..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponyms based on country context as well as fuzzy string matching
- */
-public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
-
- private Map<String, Set<String>> nameCodesMap;
- String dominantCode = "";
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
-
- score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
-
- }
-
- /**
- * Assigns a score to each BaseLink in each linkedSpan's set of N best
- * matches. Currently the scoring indicates the probability that the toponym
- * is correct based on the country context in the document and fuzzy string
- * matching
- *
- * @param linkedData the linked spans, holds the Namefinder results, and
- * the list of BaseLink for each
- * @param countryHits all the country mentions in the document
- * @param nameCodesMap maps a country indicator name to a country code. Used
- * to determine if the namefinder found the same exact
- * toponym the country context did. If so the score is
- * boosted due to the high probability that the
- * NameFinder actually "rediscovered" a country
- * @param docText the full text of the document...not used in this
- * default implementation
- * @param sentences the sentences that correspond to the doc text.
- * @param maxAllowedDist a constant that is used to determine which country
- * mentions, based on proximity within the text, should
- * be used to score the Named Entity.
- * @return
- */
- public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
- this.nameCodesMap = nameCodesMap;
- setDominantCode(countryHits);
- for (LinkedSpan<BaseLink> linkedspan : linkedData) {
-
- linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
- }
- return linkedData;
- }
-
- /**
- * sets class level variable to a code based on the number of mentions
- *
- * @param countryHits
- */
- private void setDominantCode(Map<String, Set<Integer>> countryHits) {
- int hits = -1;
- for (String code : countryHits.keySet()) {
- if (countryHits.get(code).size() > hits) {
- hits = countryHits.get(code).size();
- dominantCode = code;
- }
- }
- }
-
- /**
- * Generates distances from each country mention to the span's location in the
- * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
- * are resolved to the correct country and coordinate.
- *
- * @param sentences
- * @param countryHits
- * @param span
- * @return
- */
- private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
- Double score = 0.0;
- //get the index of the actual span, begining of sentence
- //should generate tokens from sentence and create a char offset...
- //could have large sentences due to poor sentence detection or wonky doc text
- int sentenceIdx = span.getSentenceid();
- int sentIndexInDoc = sentences[sentenceIdx].getStart();
- /**
- * create a map of all the span's proximal country mentions in the document
- * Map< countrycode, set of <distances from this NamedEntity>>
- */
- Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
- //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
- for (String cCode : countryHits.keySet()) {
-//iterate over all the regex start values and calculate an offset
- for (Integer cHit : countryHits.get(cCode)) {
- Integer absDist = Math.abs(sentIndexInDoc - cHit);
- //only include near mentions based on a heuristic
- //TODO make this a property
- // if (absDist < maxAllowedDistance) {
- if (distancesFromCodeMap.containsKey(cCode)) {
- distancesFromCodeMap.get(cCode).add(absDist);
- } else {
- HashSet<Integer> newset = new HashSet<Integer>();
- newset.add(absDist);
- distancesFromCodeMap.put(cCode, newset);
- }
- }
-
- //}
- }
- //we now know how far this named entity is from every country mention in the document
-
- /**
- * the gaz matches that have a country code that have mentions in the doc
- * that are closest to the Named Entity should return the best score.
- * Analyzemap generates a likelihood score that the toponym from the gaz is
- * referring to one of the countries, i.e, Map<countrycode, prob that this
- * span is referring to the toponym form this code key>
- */
- Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
- for (BaseLink link : span.getLinkedEntries()) {
- //getItemParentId is the country code
- String spanCountryCode = link.getItemParentID();
- if (scoreMap.containsKey(spanCountryCode)) {
-
- score = scoreMap.get(spanCountryCode);
- ///does the name extracted match a country name?
- if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
- //if so, is it the correct country code for that name?
- if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
- //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
- //TODO: make this multiplier configurable
- score = (score + .75) > 1.0 ? 1d : (score + .75);
-
- if (link.getItemParentID().equals(dominantCode)) {
- score = (score + .25) > 1.0 ? 1d : (score + .25);
- }
- }
- }
- }
- link.getScoreMap().put("countrycontext", score);
- }
- return span;
- }
-
- /**
- * takes a map of distances from the NE to each country mention and generates
- * a map of scores for each country code. The map is then correlated to teh
- * correlated to the code of the BaseLink parentid for retrieval. Then the
- * score is added to the overall.
- *
- * @param distanceMap
- * @param sentences
- * @param span
- * @return
- */
- private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
-
- Map<String, Double> scoreMap = new HashMap<String, Double>();
- if (distanceMap.isEmpty()) {
- return scoreMap;
- }
- TreeSet<Integer> all = new TreeSet<Integer>();
- for (String key : distanceMap.keySet()) {
- all.addAll(distanceMap.get(key));
- }
- //get min max for normalization, this could be more efficient
-
- Integer min = all.first();
- Integer max = all.last();
- if (min == max) {
- min = 0;
- }
- for (String key : distanceMap.keySet()) {
-
- TreeSet<Double> normalizedDistances = new TreeSet<Double>();
- for (Integer i : distanceMap.get(key)) {
- Double norm = normalize(i, min, max);
- //reverse the normed distance so low numbers (closer) are better
- //this could be improved with a "decaying " function using an imcreaseing negative exponent
- Double reverse = Math.abs(norm - 1);
- normalizedDistances.add(reverse);
- }
-
-
- List<Double> doubles = new ArrayList<Double>(normalizedDistances);
- scoreMap.put(key, slidingDistanceAverage(doubles));
- }
- return scoreMap;
- }
-
- /**
- * this method is an attempt to make closer clusters of mentions group
- * together to smooth out the average, so one distant outlier does not kill
- * the score for an obviously good hit. More elegant solution is possible
- * using Math.pow, and making the score decay with distance by using an
- * increasing negative exponent
- *
- * @param normDis the normalized and sorted set of distances as a list
- * @return
- */
- private Double slidingDistanceAverage(List<Double> normDis) {
- List<Double> windowOfAverages = new ArrayList<Double>();
-
- if (normDis.size() < 3) {
- windowOfAverages.addAll(normDis);
- } else {
-
- for (int i = 0; i < normDis.size() - 1; i++) {
- double a = normDis.get(i);
- double b = normDis.get(i + 1);
- windowOfAverages.add((a + b) / 2);
-
- }
- }
- double sum = 0d;
- for (double d : windowOfAverages) {
- sum += d;
- }
- double result = sum / windowOfAverages.size();
- //TODO: ++ prob when large amounts of mentions for a code
- //System.out.println("avg of window:" + result);
- return result;
- }
-
- /**
- * transposes a value within one range to a relative value in a different
- * range. Used to normalize distances in this class.
- *
- * @param valueToNormalize the value to place within the new range
- * @param minimum the min of the set to be transposed
- * @param maximum the max of the set to be transposed
- * @return
- */
- private Double normalize(int valueToNormalize, int minimum, int maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
deleted file mode 100644
index af1aa1c..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.ngram.NGramGenerator;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Generates scores for string comparisons.
- */
-public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
- for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
- for (BaseLink link : linkedSpan.getLinkedEntries()) {
- Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
- link.getScoreMap().put("dice", dice);
- Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
- link.getScoreMap().put("levenshtein", ld);
- }
- }
-
-
- }
-
- /**
- * Generates a score based on an overlap of nGrams between two strings using
- * the DiceCoefficient technique.
- *
- * @param s1 first string
- * @param s2 second string
- * @param nGrams number of chars in each gram
- * @return
- */
- public double getDiceCoefficient(String s1, String s2, int nGrams) {
- if (s1.equals("") || s1.equals("")) {
- return 0d;
- }
- List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
- List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
-
- Set<String> overlap = new HashSet<String>(s1Grams);
- overlap.retainAll(s2Grams);
- double totcombigrams = overlap.size();
-
- return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
- }
-
- private int minimum(int a, int b, int c) {
- return Math.min(Math.min(a, b), c);
- }
-
- public int getLevenshteinDistance(CharSequence str1,
- CharSequence str2) {
- int[][] distance = new int[str1.length() + 1][str2.length() + 1];
-
- for (int i = 0; i <= str1.length(); i++) {
- distance[i][0] = i;
- }
- for (int j = 1; j <= str2.length(); j++) {
- distance[0][j] = j;
- }
-
- for (int i = 1; i <= str1.length(); i++) {
- for (int j = 1; j <= str2.length(); j++) {
- distance[i][j] = minimum(
- distance[i - 1][j] + 1,
- distance[i][j - 1] + 1,
- distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
- }
- }
-
- return distance[str1.length()][str2.length()];
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
deleted file mode 100644
index 2371333..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.HashMap;
-import java.util.Map;
-import opennlp.tools.entitylinker.domain.BaseLink;
-
-/**
- *
- * Stores a record from a geographic placenames gazateer
- */
-public class GazateerEntry extends BaseLink {
-
- private Double latitude;
- private Double longitude;
- private String source;
- private String indexID;
- private Map<String, String> indexData=new HashMap<>();
-
- public String getIndexID() {
- return indexID;
- }
-
- public void setIndexID(String indexID) {
- this.indexID = indexID;
- }
-
- public Double getLatitude() {
- return latitude;
- }
-
- public void setLatitude(Double latitude) {
- this.latitude = latitude;
- }
-
- public Double getLongitude() {
- return longitude;
- }
-
- public void setLongitude(Double longitude) {
- this.longitude = longitude;
- }
-
- public String getSource() {
- return source;
- }
-
- public void setSource(String source) {
- this.source = source;
- }
-
- public Map<String, String> getIndexData() {
- return indexData;
- }
-
- public void setIndexData(Map<String, String> indexData) {
- this.indexData = indexData;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
deleted file mode 100644
index d8be425..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.ru.RussianAnalyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.th.ThaiAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-
-/**
- *
- * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
- */
-public class GazateerIndexer {
-
- public GazateerIndexer() {
- loadAnalyzerMap();
- }
- Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
-
- public static interface Separable {
-
- String getSeparator();
- }
-
- public enum GazType implements Separable {
-
- GEONAMES {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_geonames_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\t";
- }
- },
- USGS {
- @Override
- public String toString() {
- return "/opennlp_geoentitylinker_usgsgaz_idx";
- }
-
- @Override
- public String getSeparator() {
- return "\\|";
- }
- }
- }
-
- public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
- if (!outputIndexDir.isDirectory()) {
- throw new IllegalArgumentException("outputIndexDir must be a directory.");
- }
-
- String indexloc = outputIndexDir + type.toString();
- Directory index = new MMapDirectory(new File(indexloc));
-
- Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
-
- IndexWriter w = new IndexWriter(index, config);
-
- readFile(gazateerInputData, w, type);
- w.commit();
- w.close();
-
- }
-
- public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<String>();
- int counter = 0;
- int langCodeIndex = 0;
- System.out.println("reading gazateer data from file...........");
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- // build fields
- for (int i = 0; i < values.length; i++) {
- String columnName = values[i];
- fields.add(columnName.replace("»¿", "").trim());
- if (columnName.toLowerCase().equals("lc")) {
- langCodeIndex = i;
- }
- }
-
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
- }
- if (type == GazType.GEONAMES) {
- /**
- * see if the map contains a language specific analyzer
- */
- if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
- /*
- * if so retrieve it from the map
- */
- Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
- /**
- * index the doc using the specified analyzer
- */
- w.addDocument(doc, analyzer);
- } else {
- w.addDocument(doc);
- }
- } else {
- w.addDocument(doc);
- }
- }
- counter++;
- if (counter % 10000 == 0) {
- w.commit();
- System.out.println(counter + " .........committed to index..............");
- }
-
- }
- w.commit();
- System.out.println("Completed indexing gaz! index name is: " + type.toString());
- }
-/**
- * TODO: make these analyzers configurable
- */
- private void loadAnalyzerMap() {
- languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
- languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java
deleted file mode 100644
index d4470d9..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearchCache.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- *
- * Caches gazateer query results statically
- */
-public class GazateerSearchCache {
-
- private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
-
-
- public static synchronized ArrayList<GazateerEntry> get(String searchString) {
- return gazCache.get(searchString);
- }
-
- public static synchronized void put(String searchString, ArrayList<GazateerEntry> hits) {
- if (gazCache.size() > 10000) {
- gazCache.clear();
- }
- if (!gazCache.containsKey(searchString)) {
- gazCache.put(searchString, hits);
- }
- }
-
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
deleted file mode 100644
index ca9b93f..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.queryparser.classic.ParseException;
-
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MMapDirectory;
-import org.apache.lucene.util.Version;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-
-/**
- *
- * Searches Gazateers stored in a MMapDirectory lucene index
- */
-public class GazateerSearcher {
-
- private double scoreCutoff = .75;
- private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
- private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
- private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer geonamesAnalyzer;
- //usgs US gazateer
- private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
- private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
- private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
- private Analyzer usgsAnalyzer;
-
- public GazateerSearcher() {
- }
-
- /**
- *
- * @param searchString the nameed entity to look up in the lucene index
- * @param rowsReturned how many rows to allow lucene to return
- * @param code the country code
- * @param properties properties file that states where the lucene indexes
- * are
- * @return
- */
- public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
- try {
- /**
- * build the search string
- */
- String luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
- : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
- /**
- * check the cache and go no further if the records already exist
- */
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
- if (get != null) {
- return get;
- }
- if (geonamesIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
- scoreCutoff = Double.valueOf(cutoff);
- geonamesIndex = new MMapDirectory(new File(indexloc));
- geonamesReader = DirectoryReader.open(geonamesIndex);
- geonamesSearcher = new IndexSearcher(geonamesReader);
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
-
- }
-
-
-
- QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
- Query q = parser.parse(luceneQueryString);
-
-
- TopDocs search = geonamesSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
-
- for (int i = 0; i < search.scoreDocs.length; ++i) {
- GazateerEntry entry = new GazateerEntry();
- int docId = search.scoreDocs[i].doc;
- double sc = search.scoreDocs[i].score;
-
- entry.getScoreMap().put("lucene", sc);
-
- entry.getScoreMap().put("rawlucene", sc);
- entry.setIndexID(docId + "");
- entry.setSource("geonames");
-
- Document d = geonamesSearcher.doc(docId);
- List<IndexableField> fields = d.getFields();
- for (int idx = 0; idx < fields.size(); idx++) {
- String value = d.get(fields.get(idx).name());
- value = value.toLowerCase();
- switch (idx) {
- case 1:
- entry.setItemID(value);
- break;
- case 3:
- entry.setLatitude(Double.valueOf(value));
- break;
- case 4:
- entry.setLongitude(Double.valueOf(value));
- break;
- case 10:
- entry.setItemType(value);
- break;
- case 12:
- entry.setItemParentID(value);
- break;
- case 23:
- entry.setItemName(value);
- break;
- }
- entry.getIndexData().put(fields.get(idx).name(), value);
- }
- //only keep it if the country code is a match
- if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
- linkedData.add(entry);
- }
- }
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- } catch (IOException | ParseException ex) {
- System.err.println(ex);
- }
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(searchString, linkedData);
- return linkedData;
- }
-
- /**
- * Looks up the name in the USGS gazateer, after checking the cache
- *
- * @param searchString the nameed entity to look up in the lucene index
- * @param rowsReturned how many rows to allow lucene to return
- *
- * @param properties properties file that states where the lucene indexes
- * @return
- */
- public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
- ArrayList<GazateerEntry> linkedData = new ArrayList<>();
- try {
-
- String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
- /**
- * hit the cache
- */
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
- if (get != null) {
- //if the name is already there, return the list of cavhed results
- return get;
- }
- if (usgsIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
- scoreCutoff = Double.valueOf(cutoff);
- usgsIndex = new MMapDirectory(new File(indexloc));
- usgsReader = DirectoryReader.open(usgsIndex);
- usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
- }
-
-
- QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
- Query q = parser.parse(luceneQueryString);
-
-
- TopDocs search = usgsSearcher.search(q, rowsReturned);
- double maxScore = (double) search.getMaxScore();
-
-
- for (int i = 0; i < search.scoreDocs.length; ++i) {
- GazateerEntry entry = new GazateerEntry();
- int docId = search.scoreDocs[i].doc;
- double sc = search.scoreDocs[i].score;
- //keep track of the min score for normalization
-
- entry.getScoreMap().put("lucene", sc);
- entry.getScoreMap().put("rawlucene", sc);
- entry.setIndexID(docId + "");
- entry.setSource("usgs");
- entry.setItemParentID("us");
-
-
- Document d = usgsSearcher.doc(docId);
- List<IndexableField> fields = d.getFields();
- for (int idx = 0; idx < fields.size(); idx++) {
- String value = d.get(fields.get(idx).name());
- value = value.toLowerCase();
- switch (idx) {
- case 0:
- entry.setItemID(value);
- break;
- case 1:
- entry.setItemName(value);
- break;
- case 2:
- entry.setItemType(value);
- break;
- case 9:
- entry.setLatitude(Double.valueOf(value));
- break;
- case 10:
- entry.setLongitude(Double.valueOf(value));
- break;
- }
- entry.getIndexData().put(fields.get(idx).name(), value);
- }
- linkedData.add(entry);
-
-
- }
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
- } catch (IOException | ParseException ex) {
- System.err.println(ex);
- }
- /**
- * add the records to the cache for this query
- */
- GazateerSearchCache.put(searchString, linkedData);
- return linkedData;
- }
-
- private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
- for (GazateerEntry gazateerEntry : linkedData) {
-
- double luceneScore = gazateerEntry.getScoreMap().get("lucene");
- luceneScore = normalize(luceneScore, minScore, maxScore);
- luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
- luceneScore = (luceneScore == Double.NaN) ? 0.001 : luceneScore;
- gazateerEntry.getScoreMap().put("lucene", luceneScore);
- }
- }
-
- private void prune(ArrayList<GazateerEntry> linkedData) {
- for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
- GazateerEntry ge = itr.next();
- if (ge.getScoreMap().get("lucene") < scoreCutoff) {
- itr.remove();
- }
- }
- }
-
- private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
deleted file mode 100644
index 05c63d7..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.EntityLinker;
-
-/**
- * Links location entities to gazatteers. Currently supports gazateers in a
- * MySql database (NGA and USGS)
- *
- *
- */
-public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
-
- private CountryContext countryContext;
- private Map<String, Set<Integer>> countryMentions;
- private EntityLinkerProperties linkerProperties;
- private GazateerSearcher gazateerSearcher = new GazateerSearcher();
- private List<LinkedEntityScorer> scorers = new ArrayList<>();
- /**
- * Flag for deciding whether to search gaz only for toponyms within countries
- * that are mentioned in the document
- */
- private Boolean filterCountryContext = true;
-
- public GeoEntityLinker() {
- countryContext = new CountryContext();
- }
-
- @Override
- public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
- ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
-
- if (linkerProperties == null) {
- throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
- }
- countryMentions = countryContext.regexfind(doctext, linkerProperties);
-
- for (int s = 0; s < sentences.length; s++) {
- Span[] names = namesBySentence[s];
- String[] tokens = tokensBySentence[s];
- String[] matches = Span.spansToStrings(names, tokens);
-
- for (int i = 0; i < matches.length; i++) {
-
-//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
- ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
- if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
- // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
- if (!countryMentions.keySet().isEmpty()) {
- for (String code : countryMentions.keySet()) {
- if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
- }
- }
- } else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
-
- }
-
- }
- ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
- if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
- //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
- usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
- }
- LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
-
- if (!usgsEntries.isEmpty()) {
- geoSpan.getLinkedEntries().addAll(usgsEntries);
- geoSpan.setSearchTerm(matches[i]);
- }
-
- if (!geoSpan.getLinkedEntries().isEmpty()) {
- geoSpan.setSearchTerm(matches[i]);
- geoSpan.setSentenceid(s);
- spans.add(geoSpan);
- }
- }
- }
-
- if (scorers.isEmpty()) {
- scorers.add(new FuzzyStringMatchScorer());
- scorers.add(new GeoHashBinningScorer());
- scorers.add(new CountryProximityScorer());
- scorers.add(new ModelBasedScorer());
- }
- for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
- }
- return spans;
- }
-
- @Override
- public void setEntityLinkerProperties(EntityLinkerProperties properties) {
- this.linkerProperties = properties;
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-
- @Override
- public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
- throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
- }
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
deleted file mode 100644
index b1b9d11..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-import static org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.ModelBasedScorer.RADIUS;
-
-
-/**
- *
- * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
- */
-public class GeoEntityLinkerSetupUtils {
- public static ModelBasedScorer scorer;
-
- static {
- scorer = new ModelBasedScorer();
- }
- public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
- GazateerIndexer indexer = new GazateerIndexer();
- try {
- indexer.index(outputIndexDir, gazateerInputData, type);
- } catch (Exception ex) {
- ex.printStackTrace();
- }
- }
- /**
- *
- * @param documents A list of document texts, for best results try to
- * ensure each country you care about will be
- * represented in the collection
- * @param annotationOutFile the location where the annotated doccat text file
- * will be stored
- * @param modelOutFile the location where the doccat model will be stored
- * @param properties the properties where the country context object
- * will find it's country data from this property:
- * opennlp.geoentitylinker.countrycontext.filepath
- * @throws IOException
- */
- public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
- CountryContext context = new CountryContext();
- FileWriter writer = new FileWriter(annotationOutFile, true);
- System.out.println("processing " + documents.size() + " documents");
- for (String docText : documents) {
- System.out.append(".");
- Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
- Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
- for (String key : modelCountryContext.keySet()) {
- for (String wordbag : modelCountryContext.get(key)) {
- writer.write(key + " " + wordbag + "\n");
- }
- }
- }
- System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
- writer.close();
- System.out.println("Building Doccat model...");
- DoccatModel model = null;
-
- InputStream dataIn = new FileInputStream(annotationOutFile);
- try {
-
- ObjectStream<String> lineStream =
- new PlainTextByLineStream(dataIn, "UTF-8");
- ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
-
- model = DocumentCategorizerME.train("en", sampleStream);
- OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
- model.serialize(modelOut);
- System.out.println("Model complete!");
- } catch (IOException e) {
- // Failed to read or parse training data, training failed
- e.printStackTrace();
- }
-
- }
-
- /**
- * generates proximal wordbags within the radius of a country mention within
- * the doctext based on the country context object
- *
- *
- * @param docText
- * @param additionalContext
- * @param radius
- * @return
- */
- private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
- Map<String, ArrayList< String>> featureBags = new HashMap<>();
- Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (String code : countryMentions.keySet()) {
- /**
- * for each mention, collect features from around each mention, then
- * consolidate the features into another map
- */
- for (int mentionIdx : countryMentions.get(code)) {
- String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
- // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
- if (featureBags.containsKey(code)) {
- featureBags.get(code).add(chunk);
- } else {
- ArrayList<String> newlist = new ArrayList<>();
- newlist.add(chunk);
- featureBags.put(code, newlist);
- }
- }
- }
- return featureBags;
- }
-
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
deleted file mode 100644
index 26b69c1..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *Scores toponymns based on geographic point binning (clustering). This classes output is highly dependant on the quality
- * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
- * Lucene search is set to an appropriate level so this class if not fed poor data.
- */
-public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
- score( linkedSpans);
- }
-
- private void score(List<LinkedSpan> geospans) {
- Map<Double, Double> latLongs = new HashMap<Double, Double>();
-
- /**
- * collect all the lat longs
- */
- for (LinkedSpan<BaseLink> ls : geospans) {
- for (BaseLink bl : ls.getLinkedEntries()) {
- if (bl instanceof GazateerEntry) {
- GazateerEntry entry = (GazateerEntry) bl;
- latLongs.put(entry.getLatitude(), entry.getLongitude());
-
- }
- }
- }
-
- /**
- * convert to geohash and add to sortedset
- */
- TreeSet<Long> geoHashes = new TreeSet<Long>();
- for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {
- geoHashes.add(geoHash(entry.getKey(), entry.getValue()));
- }
- /**
- * bin the points and generate a scoremap
- */
- Map<Long, Set<Long>> bins = bin(geoHashes);
- Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);
- /**
- * iterate over the data again and assign the score based on the bins
- */
- for (LinkedSpan<BaseLink> ls : geospans) {
- for (BaseLink bl : ls.getLinkedEntries()) {
- Long geohash = -1L;
- Double score = 0d;
- if (bl instanceof GazateerEntry) {
- GazateerEntry entry = (GazateerEntry) bl;
- geohash = geoHash(entry.getLatitude(), entry.getLongitude());
-
- }
- if (scores.containsKey(geohash)) {
- score = scores.get(geohash);
-
- } else {
- for (Long bin : bins.keySet()) {
- if (bin == geohash || bins.get(bin).contains(geohash)) {
- score = scores.get(bin);
- break;
- }
- }
- }
- bl.getScoreMap().put("geohashbin", score);
- }
- }
-
-
- }
-
- private Long normalize(Double coordpart, Boolean isLat) {
- Integer add = isLat ? 90 : 180;
- coordpart = Math.abs(coordpart + add);
- coordpart = coordpart * 1000000;
-
- Long l = Math.round(coordpart);
- String coord = String.valueOf(l);
- if (coord.length() < 8) {
- while (coord.length() < 8) {
- coord += "0";
- }
- }
- coord = coord.substring(0, 8);
- l = Long.valueOf(coord);
- return l;
- }
-
- /**
- * interleaves a lat and a long to place the coordinate in linear sortable
- * space for binning simplicity
- *
- * @param lat
- * @param lon
- * @return
- */
- private Long geoHash(double lat, double lon) {
- Long normLat = normalize(lat, Boolean.TRUE);
- Long normLon = normalize(lon, Boolean.FALSE);
- String sLat = String.valueOf(normLat);
- String sLon = String.valueOf(normLon);
- char[] latInts = sLat.toCharArray();
- char[] lonInts = sLon.toCharArray();
- String geoHash = "";
- int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;
- for (int i = 0; i < len - 1; i++) {
- String a = String.valueOf(latInts[i]);
- String b = String.valueOf(lonInts[i]);
- geoHash += a + b;
- }
-
- return Long.valueOf(geoHash);
- }
-
- private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {
- ArrayList<Long> list = new ArrayList<Long>(sets);
- ArrayList<Long> diffs = new ArrayList<Long>();
- /**
- * create a set of differences between the points
- */
- for (int i = 0; i < list.size() - 1; i++) {
- Long n = list.get(i + 1);
- Long v = list.get(i);
- diffs.add(Math.abs(n - v));
- }
- /**
- * generate an average "distance" between the normed points
- */
- Long sum = 0L;
- for (Long l : diffs) {
- sum += l;
- }
- Long avg=sum;
- if(!diffs.isEmpty()){
- avg = sum / diffs.size();
- }
-
-
- /**
- * generate break values where the disparity is greater than the average
- */
- TreeSet<Long> breaks = new TreeSet<Long>();
- for (int i = 0; i < list.size() - 1; i++) {
- Long n = list.get(i + 1);
- Long v = list.get(i);
- //Long percent = 100 - (v / n * 100);
- Long diff = n - v;
- if (diff > avg) {
- breaks.add(v);
- }
- }
- /**
- * based on the break values, place subsets of close points into bins
- */
- TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();
- Long lastBreak = -1L;
- for (Long br : breaks) {
- if (lastBreak == -1L) {
- binToAmount.put(br, sets.subSet(0L, true, br, true));
- } else {
- binToAmount.put(br, sets.subSet(lastBreak, false, br, true));
- }
- lastBreak = br;
- }
- lastBreak = sets.higher(lastBreak);
- if (lastBreak != null) {
- binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));
- if (binToAmount.get(lastBreak).isEmpty()) {
- binToAmount.get(lastBreak).add(lastBreak);
- }
- }
- /**
- * "binToAmount" is a map of the break value to all the points behind it
- * (it's sorted), so the key is the max value of its set of values
- */
- return binToAmount;
- }
-
- /**
- * returns a map of geohashes and their score
- *
- * @param binToAmount
- * @return Map< Geohash, score>
- */
- private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {
- TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();
- TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();
- /**
- * if there is only one bin return 1 as the rank for each item in the value
- */
- if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {
- for (Long bin : binToAmount.keySet()) {
- for (Long hash : binToAmount.get(bin)) {
- ranks.put(bin, 1d);
- }
- }
- return ranks;
- }
- int total = 0;
- /**
- * generate a total number of points
- */
- for (Set<Long> geohashes : binToAmount.values()) {
- total += geohashes.size();
- }
- /**
- * divide total by bin size, largest bin size gets best score, everything in
- * that bin gets that score because it is part of that primary cluster
- * TODO... do an extra iteration of clustering within the predominant
- * cluster to refine the scoring or make the basis of the binning more
- * granular than > avg
- */
- TreeSet<Double> rankSet = new TreeSet<Double>();
- for (Long key : binToAmount.keySet()) {
- int size = binToAmount.get(key).size();
- Double rank = (double) total / size;
- rankSet.add(rank);
- ranks.put(key, rank);
- }
- /**
- * load the final score map with normalized values
- */
- for (Map.Entry<Long, Double> rank : ranks.entrySet()) {
- double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);
- double reverse = Math.abs(norm - 1);
- double score = reverse > 1d ? 1.0 : reverse;
- normRanks.put(rank.getKey(), score);
- }
-
- return normRanks;
- }
-
- /**
- * transposes a number in a range to a double between 0 and 1
- *
- * @param valueToNormalize the value to be normalized (placed within a new
- * range of 0-1)
- * @param minimum the min of the current range
- * @param maximum the max of the current range
- * @return
- */
- private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
-
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
deleted file mode 100644
index 3f7d5fa..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.util.List;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Structure for scoring linked entities. The Map logically represents a pair :
- * "Score type" to the "actual Score."
- */
-public interface LinkedEntityScorer<T> {
-
-/**
- * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
- * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
- * @param docText the full text of the document.
- * @param sentenceSpans the sentence spans the correspond to the document text
- * @param additionalContext any additional data required to perform the scoring operation
- * @return void
- */
- void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
-}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
deleted file mode 100644
index e25ba07..0000000
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.domain.BaseLink;
-import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Utilizes a doccat model to score toponyms based on surrounding context
- */
-public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
-
-
- DocumentCategorizerME documentCategorizerME;
- DoccatModel doccatModel;
- public static final int RADIUS = 100;
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
- try {
- if (doccatModel == null) {
- String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
- if (path.equals("")) {
- return;
- }
- doccatModel = new DoccatModel(new File(path));
- documentCategorizerME = new DocumentCategorizerME(doccatModel);
- }
- Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
- for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
- Map<String, Double> scores = this.getScore(entry.getValue());
- for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
- double score = 0d;
- if (scores.containsKey(link.getItemParentID())) {
- score = scores.get(link.getItemParentID());
- }
- link.getScoreMap().put("countrymodel", score);
- }
- }
-
- } catch (FileNotFoundException ex) {
- System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
- } catch (IOException ex) {
- System.err.println(ex);
- } catch (Exception ex) {
- System.err.println(ex);
- }
- }
-
- /**
- * generates features using a BagOfWordsfeatureGenerator that are within the
- * radius of a mention within the doctext
- *
- * @param linkedSpans
- * @param docText
- * @param additionalContext
- * @param radius
- * @return a map of the index of the linked span to the string of surrounding
- * text: Map<indexofspan,surrounding text>
- */
- public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
- Map<Integer, String> featureBags = new HashMap<>();
- Map<Integer, Integer> nameMentionMap = new HashMap<>();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (int i = 0; i < linkedSpans.size(); i++) {
- LinkedSpan span = linkedSpans.get(i);
- if (span.getLinkedEntries().isEmpty()) {
- //don't care about spans that did not get linked to anything at all; nothing to work with
- continue;
- }
- /**
- * get the sentence the name span was found in, the beginning of the
- * sentence will suffice as a centroid for feature generation around the
- * named entity
- */
- Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
- nameMentionMap.put(i, mentionIdx);
- }
- /**
- * now associate each span to a string that will be used for categorization
- * against the model.
- */
- for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
- featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
- }
-
-
- return featureBags;
- }
-
- public String getTextChunk(int mentionIdx, String docText, int radius) {
- int docSize = docText.length();
- int left = 0, right = 0;
- left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
- right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
- String chunk = "";
- if (right <= left) {
- chunk = "";
- } else {
- /**
- * don't want to chop any words in half, so take fron the first space to
- * the last space in the chunk string
- */
- chunk = docText.substring(left, right);
- if (left != 0) {
- left = chunk.indexOf(" ");
- }
- right = chunk.lastIndexOf(" ");
- /**
- * now get the substring again with only whole words
- */
- if (left < right) {
- chunk = chunk.substring(left, right);
- }
- }
-
- return chunk;
- }
-
- private Map<String, Double> getScore(String text) throws Exception {
- Map<String, Double> scoreMap = new HashMap<>();
- double[] categorize = documentCategorizerME.categorize(text);
- int catSize = documentCategorizerME.getNumberOfCategories();
- for (int i = 0; i < catSize; i++) {
- String category = documentCategorizerME.getCategory(i);
- scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
- }
- return scoreMap;
- }
-
-
-}