blob: 541d042bb84b4c3cfc857e643cc1ac7d9dc7d643 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.entitylinker.EntityLinkerProperties;
/**
* Finds instances of country mentions in a String, typically a document text.
* Used to boost or degrade scoring of linked geo entities
*
*/
public class CountryContext {
private Connection con;
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
private Set<CountryContextEntry> countryHits = new HashSet<>();
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
this.nameCodesMap = nameCodesMap;
}
public CountryContext() {
}
/**
* Finds mentions of countries based on a list from MySQL stored procedure
* called getCountryList. This method finds country mentions in documents,
* which is an essential element of the scoring that is done for geo
* linkedspans. Lazily loads the list from the database.
*
* @param docText the full text of the document
* @param properties EntityLinkerProperties for getting database connection
* @return
*/
public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
countryMentions = new HashMap<String, Set<Integer>>();
nameCodesMap.clear();
try {
// if (con == null) {
// con = getMySqlConnection(properties);
// }
if (countrydata == null) {
countrydata = getCountryContextFromFile(properties);
// countrydata = getCountryData(properties);
}
for (CountryContextEntry entry : countrydata) {
Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.getCc1().toLowerCase();
boolean found = false;
while (rs.find()) {
found = true;
Integer start = rs.start();
String hit = rs.group().toLowerCase();
if (countryMentions.containsKey(code)) {
countryMentions.get(code).add(start);
} else {
Set<Integer> newset = new HashSet<Integer>();
newset.add(start);
countryMentions.put(code, newset);
}
if (!hit.equals("")) {
if (this.nameCodesMap.containsKey(hit)) {
nameCodesMap.get(hit).add(code);
} else {
HashSet<String> newset = new HashSet<String>();
newset.add(code);
nameCodesMap.put(hit, newset);
}
}
}
if (found) {
countryHits.add(entry);
}
}
} catch (Exception ex) {
Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
}
return countryMentions;
}
/**
* returns a unique list of country codes
*
* @param countryMentions the countryMentions discovered
* @return
*/
public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
Set<String> ccs = new HashSet<String>();
for (CountryContextHit hit : hits) {
ccs.add(hit.getCountryCode().toLowerCase());
}
return ccs;
}
public static String getCountryCodeCSV(Set<String> hits) {
String csv = "";
if (hits.isEmpty()) {
return csv;
}
for (String code : hits) {
csv += "," + code;
}
return csv.substring(1);
}
private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {
String driver = properties.getProperty("db.driver", "org.gjt.mm.mysql.Driver");
String url = properties.getProperty("db.url", "jdbc:mysql://localhost:3306/world");
String username = properties.getProperty("db.username", "root");
String password = properties.getProperty("db.password", "?");
Class.forName(driver);
Connection conn = DriverManager.getConnection(url, username, password);
return conn;
}
/**
* reads the list from the database by calling a stored procedure
* getCountryList
*
* @param properties
* @return
* @throws SQLException
*/
private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {
List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
try {
if (con == null) {
con = getMySqlConnection(properties);
}
CallableStatement cs;
cs = con.prepareCall("CALL `getCountryList`()");
ResultSet rs;
rs = cs.executeQuery();
if (rs == null) {
return entries;
}
while (rs.next()) {
CountryContextEntry s = new CountryContextEntry();
//rc,cc1, full_name_nd_ro,dsg
s.setRc(rs.getString(1));
s.setCc1(rs.getString(2));
//a.district,
s.setFull_name_nd_ro(rs.getString(3));
//b.name as countryname,
s.setDsg(rs.getString(4));
entries.add(s);
}
} catch (SQLException ex) {
System.err.println(ex);
} catch (Exception e) {
System.err.println(e);
} finally {
con.close();
}
return entries;
}
public Map<String, Set<Integer>> getCountryMentions() {
return countryMentions;
}
public Set<CountryContextEntry> getCountryHits() {
return countryHits;
}
private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
List<CountryContextEntry> entries = new ArrayList<>();
String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
BufferedReader reader;
try {
path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
reader = new BufferedReader(new FileReader(path));
while (reader.read() != -1) {
String line = reader.readLine();
String[] values = line.split("\t");
if (values.length != 4) {
throw new IOException("improperly formatted country context file");
}
CountryContextEntry entry = new CountryContextEntry();
// rc,cc1, full_name_nd_ro,dsg
entry.setRc(values[0].toLowerCase());
entry.setCc1(values[1].toLowerCase());
entry.setFull_name_nd_ro(values[2].toLowerCase());
entry.setDsg(values[3].toLowerCase());
entries.add(entry);
}
reader.close();
} catch (IOException e) {
System.err.println(e);
}
return entries;
}
}