blob: 166cfcad872c7b3a934a77ed3ef6aa14b1692756 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Finds instances of country mentions in a String, typically a document text.
* Used to boost or degrade scoring of linked geo entities
*
*/
public class AdminBoundaryContextGenerator {
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<>();
private final Map<String, Set<Integer>> countryMentions = new HashMap<>();
Map<String, String> countryRegexMap = new HashMap<>();
Map<String, String> provinceRegexMap = new HashMap<>();
Map<String, String> countyRegexMap = new HashMap<>();
private final Set<CountryContextEntry> countryHits = new HashSet<>();
private final EntityLinkerProperties properties;
private final List<AdminBoundary> adminBoundaryData= new ArrayList<>();
private final Set<AdminBoundary> adminBoundaryHits = new HashSet<>();
private AdminBoundaryContext context;
public AdminBoundaryContext getContext(String text) {
context = null;
nameCodesMap.clear();
context = process(text);
return context;
}
private final Set<String> countryHitSet = new HashSet<>();
private final Map<String, String> countryMap = new HashMap<>();
private final Map<String, Map<String, String>> provMap = new HashMap<>();
private final Map<String, Map<String, String>> countyMap = new HashMap<>();
private Map<String, Set<Integer>> provMentions = new HashMap<>();
private Map<String, Set<Integer>> countyMentions = new HashMap<>();
private final Set<String> provHits = new HashSet<>();
private final Set<String> countyHits = new HashSet<>();
public static void main(String[] args) {
try {
AdminBoundaryContextGenerator countryContext
= new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt")));
AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool.");
System.out.println(c);
} catch (Exception ex) {
java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);
}
}
public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException {
this.properties = properties;
if (countrydata == null) {
String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
if (path == null || path.trim().isEmpty()) {
throw new IOException("missing country context data configuration. Property opennlp.geoentitylinker.countrycontext.filepath must have a valid path value in entitylinker properties file");
}
File countryContextFile = new File(path);
if (countryContextFile == null || !countryContextFile.exists()) {
throw new IOException("missing country context file");
}
//countrydata = getCountryContextFromFile(countryContextFile);
getContextFromFile(countryContextFile);
if (adminBoundaryData.isEmpty()) {
throw new IOException("missing country context data");
}
}
}
public Map<String, Set<Integer>> getCountryMentions() {
return countryMentions;
}
/**
* @return returns the last set of hits after calling regexFind
*/
public Set<CountryContextEntry> getCountryHits() {
return countryHits;
}
/**
* @return returns the last name to codes map after calling regexFind
*/
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
this.nameCodesMap = nameCodesMap;
}
private void reset() {
this.nameCodesMap.clear();
this.countryHitSet.clear();
this.countryHits.clear();
this.countryMentions.clear();
this.provHits.clear();
this.provMentions.clear();
this.countyHits.clear();
this.countyMentions.clear();
this.adminBoundaryHits.clear();
}
/**
* Finds indicators of countries, provinces, and cities, as per the USGS and
* Geonames gazetteers. The results of this are used to score toponymns
* downstream. The full text of a document should be passed in here.
*
* @param text the full text of the document (block of text).
*/
private AdminBoundaryContext process(String text) {
try {
reset();
Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet, "country");
if (!countryhitMap.isEmpty()) {
for (String cc : countryhitMap.keySet()) {
Map<String, String> provsForCc = provMap.get(cc);
if (provsForCc != null) {
provMentions.putAll(regexfind(text, provsForCc, provHits, "province"));
if (provMentions != null) {
for (String prov : provMentions.keySet()) {
Map<String, String> get = countyMap.get(prov);
if (get != null) {
countyMentions.putAll(regexfind(text, get, countyHits, "province"));
}
}
}
}
}
} else {
for (Map<String, String> provsForCc : provMap.values()) {
if (provsForCc != null) {
provMentions = regexfind(text, provsForCc, provHits, "province");
if (provMentions != null) {
for (String prov : provMentions.keySet()) {
//fake a country hit based on a province hit... this gets fuzzy
String cc = prov.split("\\.")[0];
if (!countryhitMap.containsKey(cc)) {
countryhitMap.put(cc, provMentions.get(prov));
countryHitSet.add(cc);
} else {
countryhitMap.get(cc).addAll(provMentions.get(prov));
}
Map<String, String> get = countyMap.get(prov);
if (get != null) {
countyMentions = regexfind(text, get, countyHits, "oounty");
}
}
}
}
}
}
Map<String, String> countryRefMap = new HashMap<>();
for (String c : countryHitSet) {
String countryName = countryMap.get(c);
if (countryName != null) {
countryRefMap.put(c, countryName);
}
}
return new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits,
countryRefMap, provMap, countyMap, nameCodesMap, countryRegexMap, provinceRegexMap, countyRegexMap);
} catch (Exception e) {
LOG.error(e.getLocalizedMessage(), e);
}
return null;
}
/**
* discovers indicators of admin boundary data using regex.
*
* @param docText the full text
* @param lookupMap a map to use to find names. the key=a location code, the
* value is an actual name.
* @param hitsRef a reference to a set that stores the hits by id
*/
private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef, String locationType) {
Map<String, Set<Integer>> mentions = new HashMap<>();
if (lookupMap == null) {
return mentions;
}
try {
for (String entry : lookupMap.keySet()) {
String name = lookupMap.get(entry).toLowerCase();
if (name == null) {
continue;
}
switch (locationType) {
case "country":
if (this.countryRegexMap.containsKey(entry)) {
name = countryRegexMap.get(entry);
}
break;
case "province":
if (this.provinceRegexMap.containsKey(entry)) {
name = provinceRegexMap.get(entry);
}
break;
case "county":
if (this.countyRegexMap.containsKey(entry)) {
name = countyRegexMap.get(entry);
}
break;
}
name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)";
Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.toLowerCase();
code = code.trim().replace("", "");
boolean found = false;
while (rs.find()) {
found = true;
Integer start = rs.start();
String hit = rs.group().toLowerCase().trim();
hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", "");
if (mentions.containsKey(code)) {
mentions.get(code).add(start);
} else {
Set<Integer> newset = new HashSet<>();
newset.add(start);
mentions.put(code, newset);
}
if (!hit.equals("")) {
if (this.nameCodesMap.containsKey(hit)) {
nameCodesMap.get(hit).add(code);
} else {
HashSet<String> newset = new HashSet<>();
newset.add(code);
nameCodesMap.put(hit, newset);
}
}
}
if (found) {
hitsRef.add(code);
}
}
} catch (Exception ex) {
LOG.error(ex.getLocalizedMessage(), ex);
}
return mentions;
}
private void getContextFromFile(File countryContextFile) {
if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {
return;
}
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(countryContextFile));
String line;
int lineNum = 0;
while ((line = reader.readLine()) != null) {
String[] values = line.split("\t");
if (lineNum == 0) {
lineNum++;
continue;
//skip column name headers
}
if (values.length == 9) {
AdminBoundary entry = new AdminBoundary(
values[0].toLowerCase().trim().replace("", ""),
values[3].toLowerCase().trim(),
values[1].toLowerCase().trim(),
values[4].toLowerCase().trim(),
values[2].toLowerCase().trim(),
values[5].toLowerCase().trim(),
values[6].toLowerCase().trim(),
values[7].toLowerCase().trim(),
values[8].toLowerCase().trim());
this.adminBoundaryData.add(entry);
} else {
throw new IllegalArgumentException("Improperly formatted file");
}
}
reader.close();
} catch (IOException ex) {
LOG.error(ex.getLocalizedMessage(), ex);
}
loadMaps(this.adminBoundaryData);
}
private void loadMaps(List<AdminBoundary> boundaries) {
for (AdminBoundary adm : boundaries) {
if (!adm.getCountryCode().equals("null")) {
countryMap.put(adm.getCountryCode(), adm.getCountryName());
if (countryRegexMap.containsKey(adm.getCountryCode())) {
String currentRegex = countryRegexMap.get(adm.getCountryCode());
if (currentRegex.length() > adm.getCountryRegex().length()) {
// the longest one wins if they are not all the same for each entry in the file
countryRegexMap.put(adm.getCountryCode(), currentRegex);
}//else do nothing
} else {
countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex());
}
if (!adm.getProvCode().equals("null")) {
Map<String, String> provs = provMap.get(adm.getCountryCode());
if (provs == null) {
provs = new HashMap<>();
}
//if (!provs.containsKey(adm.getProvCode())) {
provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName());
provMap.put(adm.getCountryCode(), provs);
// }
if (!adm.getCountyCode().equalsIgnoreCase("no_data_found") && !adm.getCountyName().equalsIgnoreCase("no_data_found")) {
Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode());
if (counties == null) {
counties = new HashMap<>();
} // if (!counties.containsKey(adm.getCountyCode())) {
String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode();
counties.put(countyid, adm.getCountyName());
countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties);
// }
}
}
}
}
fillProvRegexMap();
fillCountyRegexMap();
}
private void fillProvRegexMap() {
this.provinceRegexMap = new HashMap<>();
// this.adminBoundaryData
for (AdminBoundary adm : adminBoundaryData) {
if (provinceRegexMap.containsKey(adm.getProvCode())) {
String currentRegex = provinceRegexMap.get(adm.getProvCode());
if (currentRegex.length() > adm.getProvinceRegex().length()) {
// the longest one wins if they are not all the same for each entry in the file
provinceRegexMap.put(adm.getProvCode(), currentRegex);
}//else do nothing
} else {
provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex());
}
}
}
private void fillCountyRegexMap() {
this.countyRegexMap = new HashMap<>();
// this.adminBoundaryData
for (AdminBoundary adm : adminBoundaryData) {
if (countyRegexMap.containsKey(adm.getCountyCode())) {
String currentRegex = countyRegexMap.get(adm.getCountyCode());
if (currentRegex.length() > adm.getCountyRegex().length()) {
// the longest one wins if they are not all the same for each entry in the file
countyRegexMap.put(adm.getCountyCode(), currentRegex);
}//else do nothing
} else {
countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex());
}
}
}
}