// blob: cbe8a0d4894070097f7b2f84251e409a73f33749 [file] [log] [blame]
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index from a pipe-delimited gazetteer file.
 *
 * <p>The first non-empty line of the input is treated as the header; its column
 * names become the Lucene field names. Every following line becomes one
 * {@link Document} whose values are stored as {@link TextField}s.
 */
public class GazateerIndexer {

  /**
   * Supported gazetteer sources. {@link #toString()} of each constant returns
   * the index sub-directory name (with a leading slash) that is appended to the
   * output directory passed to
   * {@link GazateerIndexer#index(File, File, GazType)}.
   */
  public enum GazType {

    GEONAMES {
      @Override
      public String toString() {
        // Previously returned the USGS directory name, which made both
        // gazetteers write into the same index.
        return "/opennlp_geoentitylinker_geonames_idx";
      }
    },
    USGS {
      @Override
      public String toString() {
        return "/opennlp_geoentitylinker_usgsgaz_idx";
      }
    }
  }

  /**
   * Indexes the given gazetteer file into a Lucene index located in a
   * type-specific sub-directory of {@code outputIndexDir}.
   *
   * @param outputIndexDir existing directory under which the index is created
   * @param gazateerInputData the pipe-delimited gazetteer file to read
   * @param type the gazetteer type; selects the index sub-directory name
   * @throws IllegalArgumentException if {@code outputIndexDir} is not a directory
   * @throws Exception if reading the input or writing the index fails
   */
  public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
    if (!outputIndexDir.isDirectory()) {
      throw new IllegalArgumentException("outputIndexDir must be a directory.");
    }
    String indexloc = outputIndexDir + type.toString();
    Directory index = new MMapDirectory(new File(indexloc));
    try {
      Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
      IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
      IndexWriter w = new IndexWriter(index, config);
      try {
        readFile(gazateerInputData, w);
        w.commit();
      } finally {
        // Always release the writer, even when reading or indexing failed.
        w.close();
      }
    } finally {
      index.close();
    }
  }

  /**
   * Reads the gazetteer file line by line and adds one document per data row
   * to the given writer, committing after every 10000 lines.
   *
   * <p>Empty lines are skipped. Rows with fewer values than header columns are
   * indexed with the values that are present.
   *
   * @param gazateerInputData the pipe-delimited gazetteer file to read
   * @param w the writer that receives the documents; it is not closed here
   * @throws Exception if reading the input or writing to the index fails
   */
  public void readFile(File gazateerInputData, IndexWriter w) throws Exception {
    // NOTE(review): FileReader decodes with the platform default charset;
    // confirm whether the gazetteer files should be read as UTF-8 instead.
    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
    try {
      List<String> fields = new ArrayList<String>();
      int counter = 0;
      System.out.println("reading gazateer data from file...........");
      String line;
      // readLine() alone drives the loop: probing with read() would swallow
      // the first character of every line and yield a null line at end of file.
      while ((line = reader.readLine()) != null) {
        if (line.length() == 0) {
          continue;
        }
        String[] values = line.split("\\|");//nga format
        if (counter == 0) {
          // build fields from the header row
          for (String columnName : values) {
            fields.add(stripByteOrderMark(columnName));
          }
        } else {
          Document doc = new Document();
          // NOTE(review): the last header column is deliberately left out, as
          // in the original loop bound (fields.size() - 1) -- confirm intent.
          // split() drops trailing empty values, so also bound by the number
          // of values actually present on this row.
          int limit = Math.min(fields.size() - 1, values.length);
          for (int i = 0; i < limit; i++) {
            doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
          }
          w.addDocument(doc);
        }
        counter++;
        if (counter % 10000 == 0) {
          w.commit();
          System.out.println(counter + " .........committed to index..............");
        }
      }
    } finally {
      reader.close();
    }
  }

  /**
   * Removes a byte order mark from a header column name, both in its decoded
   * form (U+FEFF) and in the form its three UTF-8 bytes take when decoded with
   * a single-byte Latin charset.
   */
  private static String stripByteOrderMark(String columnName) {
    return columnName
            .replace("\uFEFF", "")
            .replace("\u00EF\u00BB\u00BF", "")
            .replace("\u00BB\u00BF", "");
  }
}