/*
 * Copyright 2014 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker.indexing; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.FileWriter; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.index.IndexWriter; | |
public class RegionProcessor { | |
public static void main(String[] args) { | |
RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null); | |
} | |
/** | |
* | |
* @param regionsFile the file that stores Region references. the format of | |
* this file is tab delimitted text with index 0 as the name of the region, | |
* index 1 as the longitude, and index 2 as the latitude | |
* @param outputCountryContextfile this is the country context files shared by | |
* all indexing processors | |
* @param w | |
*/ | |
public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) { | |
try { | |
readFile(regionsFile, outputCountryContextfile, w); | |
} catch (Exception ex) { | |
ex.printStackTrace(); | |
} | |
} | |
public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception { | |
List<String> ccfileentries = new ArrayList<>(); | |
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData)); | |
List<String> fields = new ArrayList<>(); | |
int counter = 0; | |
System.out.println("reading gazetteer data from Regions file..........."); | |
String line = ""; | |
while ((line = reader.readLine()) != null) { | |
String[] values = line.split("\t"); | |
if (counter == 0) { | |
} else { | |
Document doc = new Document(); | |
for (int i = 0; i < fields.size() - 1; i++) { | |
doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES)); | |
} | |
String placeName = values[0]; | |
String lat = values[2]; | |
String lon = values[1]; | |
String dsg = "region"; | |
String id = "rg" + counter; | |
String hierarchy = placeName; | |
doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES)); | |
doc.add(new TextField("placename", placeName, Field.Store.YES)); | |
doc.add(new StringField("latitude", lat, Field.Store.YES)); | |
doc.add(new StringField("longitude", lon, Field.Store.YES)); | |
doc.add(new StringField("loctype", dsg, Field.Store.YES)); | |
doc.add(new StringField("admincode", "", Field.Store.YES)); | |
doc.add(new StringField("countrycode", id, Field.Store.YES)); | |
doc.add(new StringField("countycode", "", Field.Store.YES)); | |
doc.add(new StringField("locid", id, Field.Store.YES)); | |
doc.add(new StringField("gazsource", "region", Field.Store.YES)); | |
//countrycontext file format | |
// US KY 131 United States Kentucky Leslie | |
ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n"); | |
if (w != null) { | |
w.addDocument(doc); | |
} | |
} | |
counter++; | |
} | |
if (w != null) { | |
w.commit(); | |
} | |
FileWriter writer = new FileWriter(outputCountryContextfile, true); | |
for (String string : ccfileentries) { | |
writer.write(string); | |
} | |
System.out.println("successfully wrote Region entries to country oontext file"); | |
writer.close(); | |
System.out.println("Completed indexing regions!"); | |
} | |
} |