geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java - opennlp-addons - Git at Google

 /*
  * Copyright 2013 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.addons.geoentitylinker;

 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Level;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.queryparser.classic.ParseException;

 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.util.CharArraySet;

 /**
  *
  * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
  * these indices are based on loading the indexes using the
  * GeoEntityLinkerSetupUtils
  *
  */
 public class GazetteerSearcher {

   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
   private double scoreCutoff = .70;
   private boolean doubleQuoteAllSearchTerms = false;
   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
   private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
   private Analyzer geonamesAnalyzer;
   //usgs US gazateer
   private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
   private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
   private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
   private Analyzer usgsAnalyzer;
   private EntityLinkerProperties properties;

   public static void main(String[] args) {
     try {
       boolean b = Boolean.valueOf("true");

       new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
     } catch (IOException ex) {
       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
     } catch (Exception ex) {
       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
     }
   }

   public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
     this.properties = properties;
     init();
   }

   /**
    *
    * @param searchString the named entity to look up in the lucene index
    * @param rowsReturned how many rows to allow lucene to return
    * @param code the country code
    *
    * @return
    */
   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
     if (searchString.isEmpty()) {
       return linkedData;
     }
     try {
       /**
        * build the search string Sometimes no country context is found. In this
        * case the code variable will be an empty string
        */
       String luceneQueryString = !code.equals("")
               ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
               : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
       /**
        * check the cache and go no further if the records already exist
        */
       ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
       if (get != null) {

         return get;
       }

       QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);

       TopDocs search = geonamesSearcher.search(q, rowsReturned);

       for (int i = 0; i < search.scoreDocs.length; ++i) {
         GazetteerEntry entry = new GazetteerEntry();
         int docId = search.scoreDocs[i].doc;
         double sc = search.scoreDocs[i].score;

         entry.getScoreMap().put("lucene", sc);
         entry.setIndexID(docId + "");
         entry.setSource("geonames");

         Document d = geonamesSearcher.doc(docId);
         List<IndexableField> fields = d.getFields();
         for (int idx = 0; idx < fields.size(); idx++) {
           String value = d.get(fields.get(idx).name());
           value = value.toLowerCase();
           /**
            * these positions map to the required fields in the gaz TODO: allow a
            * configurable list of columns that map to the GazateerEntry fields,
            * then users would be able to plug in any gazateer they have (if they
            * build a lucene index out of it)
            */
           switch (idx) {
             case 1:
               entry.setItemID(value);
               break;
             case 3:
               entry.setLatitude(Double.valueOf(value));
               break;
             case 4:
               entry.setLongitude(Double.valueOf(value));
               break;
             case 10:
               entry.setItemType(value);
               break;
             case 12:
               entry.setItemParentID(value);
               if (!value.toLowerCase().equals(code.toLowerCase())) {
                 continue;
               }
               break;
             case 23:
               entry.setItemName(value);
               break;
           }
           entry.getIndexData().put(fields.get(idx).name(), value);
         }
         /**
          * norm the levenstein distance
          */
         Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
         /**
          * only want hits above the levenstein thresh
          */
         if (normLev.compareTo(scoreCutoff) >= 0) {
           if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase()) || code.toLowerCase().equals("")) {
             entry.getScoreMap().put("normlucene", normLev);
             //make sure we don't produce a duplicate
             if (!linkedData.contains(entry)) {
               linkedData.add(entry);
               /**
                * add the records to the cache for this query
                */
               GazetteerSearchCache.put(luceneQueryString, linkedData);
             }
           }
         }
       }

     } catch (IOException | ParseException ex) {
       LOGGER.error(ex);
     }

     return linkedData;
   }

   /**
    * Looks up the name in the USGS gazateer, after checking the cache
    *
    * @param searchString the nameed entity to look up in the lucene index
    * @param rowsReturned how many rows to allow lucene to return
    *
    * @return
    */
   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
     if (searchString.isEmpty()) {
       return linkedData;
     }
     String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
     try {

       /**
        * hit the cache
        */
       ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
       if (get != null) {
         //if the name is already there, return the list of cavhed results
         return get;
       }
       QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, usgsAnalyzer);
       Query q = parser.parse(luceneQueryString);

       TopDocs search = usgsSearcher.search(q, rowsReturned);
       for (int i = 0; i < search.scoreDocs.length; i++) {
         GazetteerEntry entry = new GazetteerEntry();
         int docId = search.scoreDocs[i].doc;
         double sc = search.scoreDocs[i].score;
         //keep track of the min score for normalization

         entry.getScoreMap().put("lucene", sc);
         entry.setIndexID(docId + "");
         entry.setSource("usgs");
         entry.setItemParentID("us");
         Document d = usgsSearcher.doc(docId);
         List<IndexableField> fields = d.getFields();
         for (int idx = 0; idx < fields.size(); idx++) {
           String value = d.get(fields.get(idx).name());
           value = value.toLowerCase();
           switch (idx) {
             case 0:
               entry.setItemID(value);
               break;
             case 1:
               entry.setItemName(value);
               break;
             case 2:
               entry.setItemType(value);
               break;
             case 9:
               entry.setLatitude(Double.valueOf(value));
               break;
             case 10:
               entry.setLongitude(Double.valueOf(value));
               break;
           }
           entry.getIndexData().put(fields.get(idx).name(), value);
         }
         /**
          * norm the levenstein distance
          */
         Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
         /**
          * only want hits above the levenstein thresh
          */
         if (normLev.compareTo(scoreCutoff) >= 0) {
           //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

           entry.getScoreMap().put("normlucene", normLev);
           //make sure we don't produce a duplicate
           if (!linkedData.contains(entry)) {
             linkedData.add(entry);
             /**
              * add the records to the cache for this query
              */
             GazetteerSearchCache.put(luceneQueryString, linkedData);
           }
         }

       }

     } catch (IOException | ParseException ex) {
       LOGGER.error(ex);
     }

     return linkedData;
   }

   /**
    * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
    *
    * @param input
    * @return
    */
   private String cleanInput(String input) {
     String output = input.replaceAll(REGEX_CLEAN, " ").trim();
     if (doubleQuoteAllSearchTerms) {
       return "\"" + output + "\"";
     } else {
       return output;
     }

   }

   private void init() throws Exception {
     if (usgsIndex == null) {
       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
       if (indexloc.equals("")) {
         // System.out.println("USGS Gaz location not found");
         LOGGER.error(new Exception("USGS Gaz location not found"));
       }
       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       scoreCutoff = Double.valueOf(cutoff);
       String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
       doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
       usgsIndex = new MMapDirectory(new File(indexloc));
       usgsReader = DirectoryReader.open(usgsIndex);
       usgsSearcher = new IndexSearcher(usgsReader);
       usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
     }
     if (geonamesIndex == null) {
       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
       if (indexloc.equals("")) {
         LOGGER.error(new Exception("Geonames Gaz location not found"));

       }
       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
       scoreCutoff = Double.valueOf(cutoff);
       geonamesIndex = new MMapDirectory(new File(indexloc));
       geonamesReader = DirectoryReader.open(geonamesIndex);
       geonamesSearcher = new IndexSearcher(geonamesReader);
       //TODO: a language code switch statement should be employed here at some point
       geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

     }
   }
 }
	/*
	* Copyright 2013 The Apache Software Foundation.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.addons.geoentitylinker;

	import java.io.File;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.logging.Level;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.IndexableField;
	import org.apache.lucene.queryparser.classic.ParseException;

	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.MMapDirectory;
	import org.apache.lucene.util.Version;
	import opennlp.tools.entitylinker.EntityLinkerProperties;
	import org.apache.log4j.Logger;
	import org.apache.lucene.analysis.util.CharArraySet;

	/**
	*
	* Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
	* these indices are based on loading the indexes using the
	* GeoEntityLinkerSetupUtils
	*
	*/
	public class GazetteerSearcher {

	private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
	private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
	private double scoreCutoff = .70;
	private boolean doubleQuoteAllSearchTerms = false;
	private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
	private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
	private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
	private Analyzer geonamesAnalyzer;
	//usgs US gazateer
	private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
	private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
	private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
	private Analyzer usgsAnalyzer;
	private EntityLinkerProperties properties;

	public static void main(String[] args) {
	try {
	boolean b = Boolean.valueOf("true");

	new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
	} catch (IOException ex) {
	java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
	} catch (Exception ex) {
	java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
	}
	}

	public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
	this.properties = properties;
	init();
	}

	/**
	*
	* @param searchString the named entity to look up in the lucene index
	* @param rowsReturned how many rows to allow lucene to return
	* @param code the country code
	*
	* @return
	*/
	public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
	ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
	searchString = cleanInput(searchString);
	if (searchString.isEmpty()) {
	return linkedData;
	}
	try {
	/**
	* build the search string Sometimes no country context is found. In this
	* case the code variable will be an empty string
	*/
	String luceneQueryString = !code.equals("")
	? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
	: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
	/**
	* check the cache and go no further if the records already exist
	*/
	ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
	if (get != null) {

	return get;
	}

	QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, geonamesAnalyzer);
	Query q = parser.parse(luceneQueryString);

	TopDocs search = geonamesSearcher.search(q, rowsReturned);

	for (int i = 0; i < search.scoreDocs.length; ++i) {
	GazetteerEntry entry = new GazetteerEntry();
	int docId = search.scoreDocs[i].doc;
	double sc = search.scoreDocs[i].score;

	entry.getScoreMap().put("lucene", sc);
	entry.setIndexID(docId + "");
	entry.setSource("geonames");

	Document d = geonamesSearcher.doc(docId);
	List<IndexableField> fields = d.getFields();
	for (int idx = 0; idx < fields.size(); idx++) {
	String value = d.get(fields.get(idx).name());
	value = value.toLowerCase();
	/**
	* these positions map to the required fields in the gaz TODO: allow a
	* configurable list of columns that map to the GazateerEntry fields,
	* then users would be able to plug in any gazateer they have (if they
	* build a lucene index out of it)
	*/
	switch (idx) {
	case 1:
	entry.setItemID(value);
	break;
	case 3:
	entry.setLatitude(Double.valueOf(value));
	break;
	case 4:
	entry.setLongitude(Double.valueOf(value));
	break;
	case 10:
	entry.setItemType(value);
	break;
	case 12:
	entry.setItemParentID(value);
	if (!value.toLowerCase().equals(code.toLowerCase())) {
	continue;
	}
	break;
	case 23:
	entry.setItemName(value);
	break;
	}
	entry.getIndexData().put(fields.get(idx).name(), value);
	}
	/**
	* norm the levenstein distance
	*/
	Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
	/**
	* only want hits above the levenstein thresh
	*/
	if (normLev.compareTo(scoreCutoff) >= 0) {
	if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase()) \|\| code.toLowerCase().equals("")) {
	entry.getScoreMap().put("normlucene", normLev);
	//make sure we don't produce a duplicate
	if (!linkedData.contains(entry)) {
	linkedData.add(entry);
	/**
	* add the records to the cache for this query
	*/
	GazetteerSearchCache.put(luceneQueryString, linkedData);
	}
	}
	}
	}

	} catch (IOException \| ParseException ex) {
	LOGGER.error(ex);
	}

	return linkedData;
	}

	/**
	* Looks up the name in the USGS gazateer, after checking the cache
	*
	* @param searchString the nameed entity to look up in the lucene index
	* @param rowsReturned how many rows to allow lucene to return
	*
	* @return
	*/
	public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
	ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
	searchString = cleanInput(searchString);
	if (searchString.isEmpty()) {
	return linkedData;
	}
	String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
	try {

	/**
	* hit the cache
	*/
	ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);
	if (get != null) {
	//if the name is already there, return the list of cavhed results
	return get;
	}
	QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, usgsAnalyzer);
	Query q = parser.parse(luceneQueryString);

	TopDocs search = usgsSearcher.search(q, rowsReturned);
	for (int i = 0; i < search.scoreDocs.length; i++) {
	GazetteerEntry entry = new GazetteerEntry();
	int docId = search.scoreDocs[i].doc;
	double sc = search.scoreDocs[i].score;
	//keep track of the min score for normalization

	entry.getScoreMap().put("lucene", sc);
	entry.setIndexID(docId + "");
	entry.setSource("usgs");
	entry.setItemParentID("us");
	Document d = usgsSearcher.doc(docId);
	List<IndexableField> fields = d.getFields();
	for (int idx = 0; idx < fields.size(); idx++) {
	String value = d.get(fields.get(idx).name());
	value = value.toLowerCase();
	switch (idx) {
	case 0:
	entry.setItemID(value);
	break;
	case 1:
	entry.setItemName(value);
	break;
	case 2:
	entry.setItemType(value);
	break;
	case 9:
	entry.setLatitude(Double.valueOf(value));
	break;
	case 10:
	entry.setLongitude(Double.valueOf(value));
	break;
	}
	entry.getIndexData().put(fields.get(idx).name(), value);
	}
	/**
	* norm the levenstein distance
	*/
	Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());
	/**
	* only want hits above the levenstein thresh
	*/
	if (normLev.compareTo(scoreCutoff) >= 0) {
	//only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

	entry.getScoreMap().put("normlucene", normLev);
	//make sure we don't produce a duplicate
	if (!linkedData.contains(entry)) {
	linkedData.add(entry);
	/**
	* add the records to the cache for this query
	*/
	GazetteerSearchCache.put(luceneQueryString, linkedData);
	}
	}

	}

	} catch (IOException \| ParseException ex) {
	LOGGER.error(ex);
	}

	return linkedData;
	}

	/**
	* Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
	*
	* @param input
	* @return
	*/
	private String cleanInput(String input) {
	String output = input.replaceAll(REGEX_CLEAN, " ").trim();
	if (doubleQuoteAllSearchTerms) {
	return "\"" + output + "\"";
	} else {
	return output;
	}

	}

	private void init() throws Exception {
	if (usgsIndex == null) {
	String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
	if (indexloc.equals("")) {
	// System.out.println("USGS Gaz location not found");
	LOGGER.error(new Exception("USGS Gaz location not found"));
	}
	String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

	scoreCutoff = Double.valueOf(cutoff);
	String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
	doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
	usgsIndex = new MMapDirectory(new File(indexloc));
	usgsReader = DirectoryReader.open(usgsIndex);
	usgsSearcher = new IndexSearcher(usgsReader);
	usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
	}
	if (geonamesIndex == null) {
	String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
	if (indexloc.equals("")) {
	LOGGER.error(new Exception("Geonames Gaz location not found"));

	}
	String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
	scoreCutoff = Double.valueOf(cutoff);
	geonamesIndex = new MMapDirectory(new File(indexloc));
	geonamesReader = DirectoryReader.open(geonamesIndex);
	geonamesSearcher = new IndexSearcher(geonamesReader);
	//TODO: a language code switch statement should be employed here at some point
	geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

	}
	}
	}