// blob: bd73bb9ff04f9a3e67d54e89ed69a2cd854ec694 [file] [log] [blame]
/*
* Copyright 2014 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker.indexing;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import opennlp.addons.geoentitylinker.AdminBoundary;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
/**
 * Indexes a Geonames gazetteer file into a Lucene index and appends the
 * province (ADM1) entries it knows about to a country context file.
 *
 * <p>All input files are read, and the country context file is written, as
 * UTF-8 regardless of the platform default charset.
 *
 * @author mgiaconia
 */
public class GeonamesProcessor {

    /**
     * Index field names for the columns of a gazetteer row, in column order.
     * NOTE(review): "dem " carries a trailing space; it is kept as-is because it
     * is the field name written to the index and readers may depend on it.
     */
    private static final String[] GEONAMES_FIELDS = {
        "geonameid",
        "name",
        "asciiname",
        "alternatenames",
        "latitude",
        "longitude",
        "feature_class",
        "feature_code",
        "country code",
        "cc2",
        "admin1_code",
        "admin2_code",
        "admin3_code",
        "admin4_code",
        "population",
        "elevation",
        "dem ",
        "timezone",
        "modification_date"};

    // Column positions of the values that readFile picks out of a gazetteer row.
    private static final int COL_ID = 0;
    private static final int COL_PLACE_NAME = 2;
    private static final int COL_LATITUDE = 4;
    private static final int COL_LONGITUDE = 5;
    private static final int COL_DESIGNATION = 7;
    private static final int COL_COUNTRY_CODE = 8;
    private static final int COL_ADMIN1_CODE = 10;

    /**
     * Runs the whole Geonames import: loads the country and province lookups,
     * indexes the gazetteer, then appends the province entries to the country
     * context file.
     *
     * @param countryCodesLookupFile tab separated country lookup file
     * @param adm1CodesLookupFile tab separated province (ADM1) lookup file
     * @param geonamesGazetteerFile the Geonames gazetteer file to index
     * @param outputCountryContextFile the country context file to append to
     * @param w the lucene index writer
     * @throws Exception if reading the gazetteer or writing to the index fails
     */
    public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception {
        Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile);
        Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, countryCodes);
        // ADM2 data (admin2Codes.txt) is not processed.
        readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w);
        // now append to the country context file
        writeCountryContextFile(outputCountryContextFile, adm1s);
    }

    public GeonamesProcessor() {
    }

    /**
     * Opens a file for buffered reading with an explicit UTF-8 decoder, so the
     * result does not depend on the platform default charset.
     */
    private static BufferedReader openUtf8Reader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
    }

    /**
     * Reads the province lookup file into a map keyed by
     * {@code countrycode.provincecode} (lower case).
     *
     * <p>Every row must have exactly four tab separated columns. The first column
     * is split on '.' into country code and province code, and the third column
     * is used as the province name. Reading is best effort: on an I/O or format
     * error the stack trace is printed and the entries read so far are returned.
     *
     * @param adm1CodesLookupFile the province lookup file
     * @param ccodes lower case country code to country name lookup
     * @return the provinces read; never null, possibly empty or partial
     */
    private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) {
        System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath());
        Map<String, AdminBoundary> outmap = new HashMap<>();
        Set<String> nullcodes = new HashSet<>();
        try (BufferedReader reader = openUtf8Reader(adm1CodesLookupFile)) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] values = line.split("\t");
                if (values.length != 4) {
                    throw new IOException("improperly formatted province lookup file");
                }
                // Locale.ROOT keeps codes such as "IT" stable under e.g. a Turkish default locale.
                String ccode = values[0].toLowerCase(Locale.ROOT);
                String pcode = "";
                String[] split = ccode.split("\\.");
                if (split.length == 2) {
                    ccode = split[0];
                    pcode = split[1];
                }
                String pname = values[2];
                // A leading digit means the two halves are in the opposite order: swap them.
                if (ccode.matches("[0-9].*")) {
                    String code = ccode;
                    ccode = pcode;
                    pcode = code;
                }
                String cname = ccodes.get(ccode);
                if (cname == null) {
                    nullcodes.add(ccode);
                }
                outmap.put(ccode + "." + pcode, new AdminBoundary(ccode, pcode, pname, cname));
            }
            System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data.");
            // Only report success when the whole file was actually read.
            System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return outmap;
    }

    /**
     * Reads the country lookup file into a map of lower case country code to
     * lower case country name.
     *
     * <p>Everything up to and including the header line starting with
     * {@code #ISO<TAB>} is ignored. For the rows after it, the first column is
     * the code and the fifth column is the name. Reading is best effort: on an
     * I/O error the stack trace is printed and the entries read so far are kept.
     *
     * @param countryContextFile the country lookup file
     * @return the country lookup; never null
     */
    private static Map<String, String> getCountryCodes(File countryContextFile) {
        Map<String, String> ccs = new HashMap<>();
        try (BufferedReader reader = openUtf8Reader(countryContextFile)) {
            boolean start = false;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!start) {
                    // The header row marks where the data begins; it is not data itself.
                    start = line.toLowerCase(Locale.ROOT).startsWith("#iso\t");
                    continue;
                }
                String[] values = line.split("\t");
                if (values.length < 5) {
                    // No country name column on this row; nothing to record.
                    continue;
                }
                String ccode = values[0].toLowerCase(Locale.ROOT);//this is the 2 digit ISO code
                String cname = values[4].toLowerCase(Locale.ROOT);
                if (!ccode.equals("")) {
                    ccs.put(ccode, cname);
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        // Fallbacks for codes the lookup file may lack. Keys and names are lower
        // case like every other entry, because all lookups use lower case codes.
        if (!ccs.containsKey("ss")) {
            ccs.put("ss", "south sudan");
        }
        if (!ccs.containsKey("cs")) {
            ccs.put("cs", "kosovo");
        }
        return ccs;
    }

    /**
     * Appends one tab separated line per province to the country context file:
     * country code, province code, an empty column, country name, province name
     * and a trailing empty column.
     *
     * <p>Writing is best effort: on an I/O error the stack trace is printed and
     * the success message is not.
     *
     * @param outfile the country context file; it is opened in append mode
     * @param adms the province info; null values are skipped
     */
    public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
        try {
            try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outfile, true), StandardCharsets.UTF_8)) {
                for (AdminBoundary adm : adms.values()) {
                    if (adm == null) {
                        continue;
                    }
                    String province = adm.getProvinceName();
                    String country = adm.getCountryName();
                    String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n";
                    writer.write(line);
                }
            }
            // Reached only after the writer was flushed and closed without error.
            System.out.println("successfully wrote Geonames entries to country oontext file");
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Indexes every row of the gazetteer file as one lucene document and commits
     * the index every 100000 documents. Documents added after the last full
     * batch are not committed here.
     *
     * <p>Rows that have too few columns to contain an admin1 code are skipped and
     * counted.
     *
     * @param gazateerInputData the Geonames gazetteer file (e.g. allCountries.txt)
     * @param type the types of gaz entry, usgs, geonames, or regions
     * @param adms the province info
     * @param countrycodes the country code info
     * @param w the lucene index writer
     * @throws Exception if reading the file or writing to the index fails
     */
    public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {
        int counter = 0;
        int skipped = 0;
        try (BufferedReader reader = openUtf8Reader(gazateerInputData)) {
            System.out.println("reading gazetteer data from file...........");
            String line;
            while ((line = reader.readLine()) != null) {
                String[] values = line.split(type.getSeparator());
                if (values.length <= COL_ADMIN1_CODE) {
                    // Blank or truncated row: the columns needed below are missing.
                    skipped++;
                    continue;
                }
                String admincode = values[COL_ADMIN1_CODE].toLowerCase(Locale.ROOT);
                String ccode = values[COL_COUNTRY_CODE].toLowerCase(Locale.ROOT);
                if (ccode.contains(",")) {
                    // Several country codes on one row: use the first one.
                    String[] codes = ccode.split(",");
                    if (codes.length > 0) {
                        ccode = codes[0];
                    }
                }
                String adminKey = ccode + "." + admincode;
                String placeName = values[COL_PLACE_NAME];

                Document doc = new Document();
                // NOTE(review): the last column (modification_date) is deliberately
                // left out here to match the existing index layout - confirm intent.
                int rawFieldCount = Math.min(values.length, GEONAMES_FIELDS.length - 1);
                for (int i = 0; i < rawFieldCount; i++) {
                    doc.add(new TextField(GEONAMES_FIELDS[i], values[i].trim(), Field.Store.YES));
                }
                /**
                 * add standard fields to the index
                 */
                // "latitude" and "longitude" are also added by the raw column loop
                // above, so each document carries them twice.
                doc.add(new TextField("hierarchy", buildHierarchy(adms.get(adminKey), countrycodes.get(ccode), placeName), Field.Store.YES));
                doc.add(new TextField("placename", placeName, Field.Store.YES));
                doc.add(new TextField("latitude", values[COL_LATITUDE], Field.Store.YES));
                doc.add(new TextField("longitude", values[COL_LONGITUDE], Field.Store.YES));
                doc.add(new TextField("loctype", values[COL_DESIGNATION], Field.Store.YES));
                doc.add(new TextField("admincode", adminKey, Field.Store.YES));
                doc.add(new TextField("countrycode", ccode, Field.Store.YES));
                doc.add(new TextField("countycode", "", Field.Store.YES));
                doc.add(new TextField("locid", values[COL_ID], Field.Store.YES));
                doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
                w.addDocument(doc);
                counter++;
                if (counter % 100000 == 0) {
                    w.commit();
                    System.out.println(counter + " .........Geonames entries committed to index..............");
                }
            }
        }
        if (skipped > 0) {
            System.out.println("INFO: skipped " + skipped + " gazetteer rows with too few columns.");
        }
        System.out.println("Completed indexing gaz! index name is: " + type.toString());
    }

    /**
     * Builds the "hierarchy" index entry for a place: country, province and
     * place name when the province is known, country and place name when only
     * the country is known, and an empty string otherwise so that no single
     * token hierarchy entry is produced.
     */
    private static String buildHierarchy(AdminBoundary adm, String countryName, String placeName) {
        if (adm != null) {
            return adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;
        }
        if (countryName != null) {
            return countryName + ", " + placeName;
        }
        return "";
    }
}