/*
 * Copyright 2014 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker.indexing; | |
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.addons.geoentitylinker.AdminBoundary;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
/** | |
* | |
* @author mgiaconia | |
*/ | |
public class GeonamesProcessor { | |
public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception { | |
Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile); | |
Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, countryCodes); | |
// List<AdminBoundary> adm2s = getCountryContextFromFile(new File("C:\\temp\\gazetteers\\geonamesdata\\admin2Codes.txt")); | |
//admin2Codes.txt | |
readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w); | |
//now append to the coutnry context file | |
writeCountryContextFile(outputCountryContextFile, adm1s); | |
} | |
public GeonamesProcessor() { | |
} | |
private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) { | |
System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath()); | |
Map<String, AdminBoundary> outmap = new HashMap<>(); | |
BufferedReader reader; | |
Set<String> nullcodes = new HashSet<>(); | |
try { | |
reader = new BufferedReader(new FileReader(adm1CodesLookupFile)); | |
int i = 0; | |
String line = ""; | |
while ((line = reader.readLine()) != null) { | |
// String line = reader.readLine(); | |
String[] values = line.split("\t"); | |
if (values.length != 4) { | |
throw new IOException("improperly formatted province lookup file"); | |
} | |
String ccode = values[0].toLowerCase(); | |
String[] split = ccode.split("\\."); | |
String pcode = ""; | |
if (split.length == 2) { | |
//System.out.println(split); | |
ccode = split[0]; | |
pcode = split[1]; | |
} | |
String pname = values[2]; | |
if (ccode.matches("[0-9].*")) { | |
String code = ccode; | |
ccode = pcode; | |
pcode = code; | |
} | |
String cname = ccodes.get(ccode); | |
if (cname == null) { | |
nullcodes.add(ccode); | |
} | |
AdminBoundary data = new AdminBoundary(ccode, pcode, pname, cname); | |
// System.out.println(data); | |
outmap.put(ccode + "." + pcode, data); | |
} | |
System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data."); | |
reader.close(); | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath()); | |
return outmap; | |
} | |
private static Map<String, String> getCountryCodes(File countryContextFile) { | |
Map<String, String> ccs = new HashMap<>(); | |
BufferedReader reader; | |
try { | |
reader = new BufferedReader(new FileReader(countryContextFile)); | |
int i = 0; | |
String line = ""; | |
boolean start = false; | |
while ((line = reader.readLine()) != null) { | |
if (!line.toLowerCase().startsWith("#iso\t") && !start) { | |
continue; | |
} else { | |
start = true; | |
} | |
String[] values = line.split("\t"); | |
String ccode = values[0].toLowerCase();//this is the 2 digit ISO code | |
String cname = values[4].toLowerCase(); | |
if (!ccode.equals("")) { | |
ccs.put(ccode, cname); | |
} | |
} | |
reader.close(); | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
ccs.put("SS", "South Sudan"); | |
ccs.put("CS", "Kosovo"); | |
return ccs; | |
} | |
public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) { | |
// FileWriter writer = null; | |
try (FileWriter writer = new FileWriter(outfile, true)) { | |
for (String admKey : adms.keySet()) { | |
AdminBoundary adm = adms.get(admKey); | |
if (adm == null) { | |
continue; | |
} | |
String province = adm.getProvinceName(); | |
String country = adm.getCountryName(); | |
String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n"; | |
writer.write(line); | |
// System.out.println(line); | |
} | |
writer.close(); | |
} catch (IOException ex) { | |
ex.printStackTrace(); | |
} | |
System.out.println("successfully wrote Geonames entries to country oontext file"); | |
} | |
/** | |
* | |
* @param gazateerInputData the Geonames allCounties.txt file | |
* @param type the types of gaz entry, usgs, geonames, or regions | |
* @param adms the province info | |
* @param countrycodes the country code info | |
* @param w the lucene index writer | |
* @throws Exception | |
*/ | |
public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception { | |
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); | |
String[] fieldStrings = new String[]{ | |
"geonameid", | |
"name", | |
"asciiname", | |
"alternatenames", | |
"latitude", | |
"longitude", | |
"feature_class", | |
"feature_code", | |
"country code", | |
"cc2", | |
"admin1_code", | |
"admin2_code", | |
"admin3_code", | |
"admin4_code", | |
"population", | |
"elevation", | |
"dem ", | |
"timezone", | |
"modification_date"}; | |
List<String> fields = Arrays.asList(fieldStrings); | |
int counter = 0; | |
System.out.println("reading gazetteer data from file..........."); | |
String line = ""; | |
while ((line = reader.readLine()) != null) { | |
String[] values = line.split(type.getSeparator()); | |
Document doc = new Document(); | |
String admincode = values[10].toLowerCase(); | |
String ccode = values[8].toLowerCase(); | |
if (ccode.contains(",")) { | |
String[] codes = ccode.split(","); | |
if (codes.length > 0) { | |
ccode = codes[0]; | |
} | |
} | |
AdminBoundary adm = adms.get(ccode + "." + admincode); | |
String placeName = values[2]; | |
String lat = values[4]; | |
String lon = values[5]; | |
String dsg = values[7]; | |
String id = values[0]; | |
String concatIndexEntry = ""; | |
if (adm != null) { | |
concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName; | |
} else { | |
//there is no admin info, but we can still use the countrycode to concat the country name | |
String n = countrycodes.get(ccode); | |
if (n != null) { | |
concatIndexEntry = n + ", " + placeName; | |
} else { | |
///don't want a single token hierarchy entry. | |
concatIndexEntry = ""; | |
} | |
} | |
if (ccode == null) { | |
System.out.println("naughty country code"); | |
} | |
for (int i = 0; i < fields.size() - 1; i++) { | |
doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); | |
} | |
/** | |
* add standard fields to the index | |
*/ | |
doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES)); | |
doc.add(new TextField("placename", placeName, Field.Store.YES)); | |
doc.add(new TextField("latitude", lat, Field.Store.YES)); | |
doc.add(new TextField("longitude", lon, Field.Store.YES)); | |
doc.add(new TextField("loctype", dsg, Field.Store.YES)); | |
doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES)); | |
doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES)); | |
doc.add(new TextField("countycode", "", Field.Store.YES)); | |
doc.add(new TextField("locid", id, Field.Store.YES)); | |
doc.add(new TextField("gazsource", "geonames", Field.Store.YES)); | |
w.addDocument(doc); | |
counter++; | |
if (counter % 100000 == 0) { | |
w.commit(); | |
System.out.println(counter + " .........Geonames entries committed to index.............."); | |
} | |
} | |
System.out.println("Completed indexing gaz! index name is: " + type.toString()); | |
} | |
} |