blob: d8be42565e3f6e5d1b3688dd5ec91ceb6311348d [file] [log] [blame]
/*
* Copyright 2013 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
/**
*
* Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker
*/
public class GazateerIndexer {
public GazateerIndexer() {
loadAnalyzerMap();
}
Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
public static interface Separable {
String getSeparator();
}
public enum GazType implements Separable {
GEONAMES {
@Override
public String toString() {
return "/opennlp_geoentitylinker_geonames_idx";
}
@Override
public String getSeparator() {
return "\t";
}
},
USGS {
@Override
public String toString() {
return "/opennlp_geoentitylinker_usgsgaz_idx";
}
@Override
public String getSeparator() {
return "\\|";
}
}
}
public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
if (!outputIndexDir.isDirectory()) {
throw new IllegalArgumentException("outputIndexDir must be a directory.");
}
String indexloc = outputIndexDir + type.toString();
Directory index = new MMapDirectory(new File(indexloc));
Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
IndexWriter w = new IndexWriter(index, config);
readFile(gazateerInputData, w, type);
w.commit();
w.close();
}
public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
List<String> fields = new ArrayList<String>();
int counter = 0;
int langCodeIndex = 0;
System.out.println("reading gazateer data from file...........");
while (reader.read() != -1) {
String line = reader.readLine();
String[] values = line.split(type.getSeparator());
if (counter == 0) {
// build fields
for (int i = 0; i < values.length; i++) {
String columnName = values[i];
fields.add(columnName.replace("»¿", "").trim());
if (columnName.toLowerCase().equals("lc")) {
langCodeIndex = i;
}
}
} else {
Document doc = new Document();
for (int i = 0; i < fields.size() - 1; i++) {
doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
}
if (type == GazType.GEONAMES) {
/**
* see if the map contains a language specific analyzer
*/
if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
/*
* if so retrieve it from the map
*/
Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
/**
* index the doc using the specified analyzer
*/
w.addDocument(doc, analyzer);
} else {
w.addDocument(doc);
}
} else {
w.addDocument(doc);
}
}
counter++;
if (counter % 10000 == 0) {
w.commit();
System.out.println(counter + " .........committed to index..............");
}
}
w.commit();
System.out.println("Completed indexing gaz! index name is: " + type.toString());
}
/**
* TODO: make these analyzers configurable
*/
private void loadAnalyzerMap() {
languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
}
}