// blob: d41b6ff186a61a6c0d6816fd045766f6411fb559 [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.addons.geoentitylinker.indexing;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
/**
 * Builds the Lucene gazetteer index and the country context file for use in GeoEntityLinker.
 * <p>
 * USGS, Geonames and region data are all written through a single {@link IndexWriter} into
 * one index located in the {@code opennlp_geoentitylinker_gazetteer} sub-directory of the
 * output directory passed to
 * {@link #index(File, File, File, File, File, File, File, File)}.
 */
public class GazetteerIndexer {

  /** Path suffix appended to the output directory to obtain the index location. */
  private static final String INDEX_DIR_SUFFIX = "/opennlp_geoentitylinker_gazetteer";

  /** Column header line written as the first line of the country context file. */
  private static final String COUNTRY_CONTEXT_HEADER =
      "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";

  /** Fields that are analyzed with a {@link KeywordAnalyzer} instead of the default analyzer. */
  private static final String[] KEYWORD_FIELDS = {
      "countrycode", "admincode", "loctype", "countycode", "gazsource"
  };

  /**
   * Command line entry point. Expects exactly eight arguments, in the order of the parameters
   * of {@link #index(File, File, File, File, File, File, File, File)}. With any other number
   * of arguments a usage message is printed and nothing is indexed.
   * <p>
   * Any exception raised while indexing is reported by printing its stack trace; it is not
   * propagated to the caller.
   *
   * @param args the eight file/directory paths described in the usage message
   */
  public static void main(String[] args) {
    if (args.length != 8) {
      System.out.println("Usage: GazetteerIndexer geonamesData geoNamesCountryInfo geonamesAdmin1CodesASCII "
          + "usgsDataFile usgsGovUnitsFile outputIndexDir outputCountryContextFile regionsFile");
      System.out.println();
      System.out.println("The GazetteerIndexer.index methods javadoc explains how to retrieve the data files.");
      return;
    }

    File geonamesData = new File(args[0]);
    File geoNamesCountryInfo = new File(args[1]);
    File geonamesAdmin1CodesASCII = new File(args[2]);
    File usgsDataFile = new File(args[3]);
    File usgsGovUnitsFile = new File(args[4]);
    File outputIndexDir = new File(args[5]);
    File outputCountryContextFile = new File(args[6]);
    File regionsFile = new File(args[7]);

    try {
      GazetteerIndexer i = new GazetteerIndexer();
      i.index(geonamesData,
          geoNamesCountryInfo,
          geonamesAdmin1CodesASCII,
          usgsDataFile,
          usgsGovUnitsFile,
          outputIndexDir,
          outputCountryContextFile,
          regionsFile);
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }

  /** Creates an indexer. The indexer itself holds no state. */
  public GazetteerIndexer() {
  }

  /** Something that exposes the column separator of a delimited data format. */
  public interface Separable {

    /**
     * @return the separator string of the data format
     */
    String getSeparator();
  }

  /**
   * The gazetteer sources, each with its separator and a path-like {@code toString()} value.
   * <p>
   * NOTE(review): nothing in this class reads these values; they are presumably consumed by
   * the processors or the linker. The USGS separator is an escaped pipe, which suggests it is
   * meant to be used as a regular expression (e.g. with {@code String.split}) - confirm
   * against the callers.
   */
  public enum GazType implements Separable {
    GEONAMES {
      @Override
      public String toString() {
        return "/opennlp_geoentitylinker_geonames_idx";
      }

      @Override
      public String getSeparator() {
        return "\t";
      }
    },
    USGS {
      @Override
      public String toString() {
        return "/opennlp_geoentitylinker_usgsgaz_idx";
      }

      @Override
      public String getSeparator() {
        return "\\|";
      }
    }
  }

  /**
   * Validates the inputs, then writes the country context file header and runs the USGS,
   * Geonames and region processors (in that order) against a single Lucene index located at
   * {@code outputIndexDir + "/opennlp_geoentitylinker_gazetteer"}.
   * <p>
   * The index writer, the analyzer and the index directory are all closed before this method
   * returns, whether indexing succeeds or fails. An existing country context file is
   * overwritten.
   *
   * @param geonamesData the actual Geonames gazetteer data downloaded from
   * <a href="https://download.geonames.org/export/dump/">here</a>, then click on this link
   * 'allCountries.zip'
   *
   * @param geoNamesCountryInfo the countryinfo lookup table that can be
   * downloaded from
   * <a href="https://download.geonames.org/export/dump/countryInfo.txt">here</a>. You'll need to
   * copy the page into a file or scrape it
   *
   * @param geonamesAdmin1CodesASCII The lookup data for the province names for
   * each place found
   * <a href="https://download.geonames.org/export/dump/admin1CodesASCII.txt">here</a>, highlight the
   * table view, and copy results into a text file. Make sure the tab delimited
   * format is maintained.
   *
   * @param usgsDataFile the actual USGS gazetteer downloaded from
   * <a href="https://geonames.usgs.gov/domestic/download_data.htm">here</a> click on the
   * national_file####.zip link to get all the most recent features
   *
   * @param usgsGovUnitsFile go to
   * <a href="https://geonames.usgs.gov/domestic/download_data.htm">here</a> in the section titled
   * "Topical Gazetteers -- File Format" click on the dropdown list and select
   * "Government Units". The downloaded file is what you need for this param.
   *
   * @param outputIndexDir where you want the final index. Must be an existing directory, not
   * an actual file.
   *
   * @param outputCountryContextFile The output countrycontext file. This is a
   * very important file used inside the GeoEntityLinker to assist in toponym
   * resolution.
   *
   * @param regionsFile this file contains a list of regions in the following
   * format: tab delimited text with index 0 as the name of the region, index 1
   * as the longitude, and index 2 as the latitude
   *
   * @throws IllegalArgumentException if {@code outputIndexDir} is not an existing directory
   * @throws FileNotFoundException if any of the input data files does not exist
   * @throws Exception if opening the index, writing the country context file or any of the
   * processors fails
   */
  public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII, File usgsDataFile,
      File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile)
      throws Exception {
    // File.isDirectory() is false for paths that do not exist, so this single check also
    // covers a missing output directory.
    if (!outputIndexDir.isDirectory()) {
      throw new IllegalArgumentException("outputIndexDir must be a directory.");
    }
    requireExists(geonamesData, "geonames data file does not exist");
    requireExists(geoNamesCountryInfo, "geoNamesCountryCodes data file does not exist");
    requireExists(geonamesAdmin1CodesASCII, "geonamesAdmin1CodesASCII data file does not exist");
    requireExists(usgsDataFile, "usgsDataFile data file does not exist");
    requireExists(usgsGovUnitsFile, "usgsGovUnitsFile data file does not exist");
    requireExists(regionsFile, "regionsFile data file does not exist");

    String indexloc = outputIndexDir.getPath() + INDEX_DIR_SUFFIX;

    // Resources are closed in reverse order: writer first, then analyzer, then directory.
    // Closing the IndexWriter does not close the Directory, so it is managed here as well.
    try (Directory index = new MMapDirectory(Paths.get(indexloc));
         Analyzer analyzer = createAnalyzer();
         IndexWriter w = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
      // The header must be in place before the processors run; they receive the same file.
      writeCountryContextHeader(outputCountryContextFile);

      USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
      GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);
      RegionProcessor.process(regionsFile, outputCountryContextFile, w);
      w.commit();
    }

    System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" +
        outputCountryContextFile.getPath() + "' to entitylinker.properties file");
  }

  /** Throws a {@link FileNotFoundException} carrying {@code message} if {@code file} is missing. */
  private static void requireExists(File file, String message) throws FileNotFoundException {
    if (!file.exists()) {
      throw new FileNotFoundException(message);
    }
  }

  /**
   * Creates the per-field analyzer: a {@link StandardAnalyzer} with an empty stop word set by
   * default, and a separate {@link KeywordAnalyzer} for each of the {@link #KEYWORD_FIELDS}.
   */
  private static Analyzer createAnalyzer() {
    Analyzer defaultAnalyzer = new StandardAnalyzer(new CharArraySet(new ArrayList<>(), true));
    Map<String, Analyzer> perField = new HashMap<>();
    for (String field : KEYWORD_FIELDS) {
      perField.put(field, new KeywordAnalyzer());
    }
    return new PerFieldAnalyzerWrapper(defaultAnalyzer, perField);
  }

  /** Replaces the content of the country context file with the column header line. */
  private static void writeCountryContextHeader(File outputCountryContextFile) throws IOException {
    // append == false: any previous content of the file is discarded.
    try (FileWriter countryContextFileWriter = new FileWriter(outputCountryContextFile, false)) {
      countryContextFileWriter.write(COUNTRY_CONTEXT_HEADER);
      countryContextFileWriter.flush();
    }
  }
}