OPENNLP-693
OPENNLP-694
OPENNLP-692
Added log4j logging. Added Lucene spatial. Removed the optional tags from the pom for the Lucene dependencies. Also added string sanitizing to the Gazetteer searcher so Lucene will stop logging syntax problems on noisy NER results.
diff --git a/geoentitylinker-addon/pom.xml b/geoentitylinker-addon/pom.xml
index 6fd5059..7bed7e9 100644
--- a/geoentitylinker-addon/pom.xml
+++ b/geoentitylinker-addon/pom.xml
@@ -1,66 +1,73 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

-  <modelVersion>4.0.0</modelVersion>

-  <parent>

-    <groupId>org.apache.opennlp</groupId>

-    <artifactId>opennlp</artifactId>

-    <version>1.6.0-SNAPSHOT</version>

-    <relativePath>../opennlp/pom.xml</relativePath>

-  </parent>

+    <modelVersion>4.0.0</modelVersion>

+    <parent>

+        <groupId>org.apache.opennlp</groupId>

+        <artifactId>opennlp</artifactId>

+        <version>1.6.0-SNAPSHOT</version>

+        <relativePath>../opennlp/pom.xml</relativePath>

+    </parent>

 

-  <artifactId>geoentitylinker-addon</artifactId>

-  <version>1.0-SNAPSHOT</version>

-  <packaging>jar</packaging>

-  <name>geoentitylinker-addon</name>

+    <artifactId>geoentitylinker-addon</artifactId>

+    <version>1.0-SNAPSHOT</version>

+    <packaging>jar</packaging>

+    <name>geoentitylinker-addon</name>

 

-  <url>http://maven.apache.org</url>

-  <build>

-    <plugins>

-      <plugin>

-        <groupId>org.apache.maven.plugins</groupId>

-        <artifactId>maven-compiler-plugin</artifactId>

-        <version>2.3.2</version>

-        <configuration>

-          <source>1.7</source>

-          <target>1.7</target>

-        </configuration>

-      </plugin>

-    </plugins>

-  </build>

-  <properties>

-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

-  </properties>

+    <url>http://maven.apache.org</url>

+    <build>

+        <plugins>

+            <plugin>

+                <groupId>org.apache.maven.plugins</groupId>

+                <artifactId>maven-compiler-plugin</artifactId>

+                <version>2.3.2</version>

+                <configuration>

+                    <source>1.7</source>

+                    <target>1.7</target>

+                </configuration>

+            </plugin>

+        </plugins>

+    </build>

+    <properties>

+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

+    </properties>

 

-  <dependencies>

-    <dependency>

-      <groupId>junit</groupId>

-      <artifactId>junit</artifactId>

-      <version>3.8.1</version>

-      <scope>test</scope>

-    </dependency>

-    <dependency>

-      <groupId>org.apache.lucene</groupId>

-      <artifactId>lucene-core</artifactId>

-      <version>4.5.0</version>

-      <optional>true</optional>

-    </dependency>

-    <dependency>

-      <groupId>org.apache.lucene</groupId>

-      <artifactId>lucene-analyzers-common</artifactId>

-      <version>4.5.0</version>

-      <optional>true</optional>

-    </dependency>

-    <dependency>

-      <groupId>org.apache.lucene</groupId>

-      <artifactId>lucene-queryparser</artifactId>

-      <version>4.5.0</version>

-      <optional>true</optional>

-    </dependency>

-    <dependency>

-      <groupId>org.apache.opennlp</groupId>

-      <artifactId>opennlp-tools</artifactId>

-      <version>1.6.0-SNAPSHOT</version>

-      <optional>true</optional>

-    </dependency>

-  </dependencies>

+    <dependencies>

+        <dependency>

+            <groupId>junit</groupId>

+            <artifactId>junit</artifactId>

+            <version>3.8.1</version>

+            <scope>test</scope>

+        </dependency>

+        <dependency>

+            <groupId>log4j</groupId>

+            <artifactId>log4j</artifactId>

+            <version>1.2.16</version>

+        </dependency>

+        <dependency>

+            <groupId>org.apache.lucene</groupId>

+            <artifactId>lucene-spatial</artifactId>

+            <version>4.8.0</version>

+        </dependency>

+            

+        <dependency>

+            <groupId>org.apache.lucene</groupId>

+            <artifactId>lucene-core</artifactId>

+            <version>4.8.0</version>

+        </dependency>

+        <dependency>

+            <groupId>org.apache.lucene</groupId>

+            <artifactId>lucene-analyzers-common</artifactId>

+            <version>4.8.0</version>

+        </dependency>

+        <dependency>

+            <groupId>org.apache.lucene</groupId>

+            <artifactId>lucene-queryparser</artifactId>

+            <version>4.8.0</version>

+        </dependency>

+        <dependency>

+            <groupId>org.apache.opennlp</groupId>

+            <artifactId>opennlp-tools</artifactId>

+            <version>1.6.0-SNAPSHOT</version>

+        </dependency>

+    </dependencies>

 </project>

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
index e089e7b..4aa9e16 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
@@ -25,11 +25,11 @@
 import java.util.List;

 import java.util.Map;

 import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

+

 import java.util.regex.Matcher;

 import java.util.regex.Pattern;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

+import org.apache.log4j.Logger;

 

 /**

  * Finds instances of country mentions in a String, typically a document text.

@@ -38,22 +38,23 @@
  */

 public class CountryContext {

 

+  private static final Logger LOGGER = Logger.getLogger(CountryContext.class);

   private List<CountryContextEntry> countrydata;

   private Map<String, Set<String>> nameCodesMap = new HashMap<>();

   private Map<String, Set<Integer>> countryMentions = new HashMap<>();

   private Set<CountryContextEntry> countryHits = new HashSet<>();

   private EntityLinkerProperties properties;

-

+  

   public CountryContext(EntityLinkerProperties properties) throws Exception {

     this.properties = properties;

     if (countrydata == null) {

       String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

-

+      

       File countryContextFile = new File(path);

       countrydata = getCountryContextFromFile(countryContextFile);

     }

   }

-

+  

   public Map<String, Set<Integer>> getCountryMentions() {

     return countryMentions;

   }

@@ -75,7 +76,7 @@
   public Map<String, Set<String>> getNameCodesMap() {

     return nameCodesMap;

   }

-

+  

   public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {

     this.nameCodesMap = nameCodesMap;

   }

@@ -90,7 +91,7 @@
    * Finding mentions in documents is very helpful for scoring. Lazily loads the

    * list from the file.

    *

-   * @param docText    the full text of the document

+   * @param docText the full text of the document

    * @param properties EntityLinkerProperties for getting database connection

    * @return

    */

@@ -98,13 +99,12 @@
     countryMentions = new HashMap<>();

     nameCodesMap.clear();

     try {

-

-

+      

       for (CountryContextEntry entry : countrydata) {

         Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

         Matcher rs = regex.matcher(docText);

         String code = entry.getCc1().toLowerCase();

-

+        

         boolean found = false;

         while (rs.find()) {

           found = true;

@@ -130,27 +130,26 @@
         if (found) {

           countryHits.add(entry);

         }

-

+        

       }

-

+      

     } catch (Exception ex) {

-      Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);

+      LOGGER.error(ex);

     }

-

-

+    

     return countryMentions;

   }

-

+  

   private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {

     List<CountryContextEntry> entries = new ArrayList<>();

     String path = countryContextFile.getPath();

     BufferedReader reader;

-

+    

     try {

       path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

-

+      

       reader = new BufferedReader(new FileReader(path));

-

+      

       while (reader.read() != -1) {

         String line = reader.readLine();

         String[] values = line.split("\t");

@@ -166,10 +165,10 @@
         entries.add(entry);

       }

       reader.close();

-    } catch (IOException e) {

-      System.err.println(e);

+    } catch (IOException ex) {

+      LOGGER.error(ex);

     }

     return entries;

-

+    

   }

 }

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
index 81e31ad..dd65ec7 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
@@ -94,8 +94,8 @@
     String indexloc = outputIndexDir + type.toString();

     Directory index = new MMapDirectory(new File(indexloc));

 

-    Analyzer a = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));

-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);

+    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

 

     IndexWriter w = new IndexWriter(index, config);

 

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 44d7e7d..9409f70 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -35,6 +35,7 @@
 import org.apache.lucene.store.MMapDirectory;

 import org.apache.lucene.util.Version;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

+import org.apache.log4j.Logger;

 import org.apache.lucene.analysis.util.CharArraySet;

 

 /**

@@ -46,6 +47,8 @@
  */

 public class GazetteerSearcher {

 

+  private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";

+  private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);

   private double scoreCutoff = .90;

   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));

   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);

@@ -67,13 +70,16 @@
    *

    * @param searchString the named entity to look up in the lucene index

    * @param rowsReturned how many rows to allow lucene to return

-   * @param code         the country code

+   * @param code the country code

    *

    * @return

    */

   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

-

+    searchString = cleanInput(searchString);

+    if (searchString.isEmpty()) {

+      return linkedData;

+    }

     try {

       /**

        * build the search string Sometimes no country context is found. In this

@@ -91,7 +97,7 @@
         return get;

       }

 

-      QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);

+      QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, geonamesAnalyzer);

       Query q = parser.parse(luceneQueryString);

 

       TopDocs search = geonamesSearcher.search(q, rowsReturned);

@@ -164,7 +170,7 @@
       }

 

     } catch (IOException | ParseException ex) {

-      System.err.println(ex);

+      LOGGER.error(ex);

     }

 

     return linkedData;

@@ -180,6 +186,10 @@
    */

   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

+     searchString = cleanInput(searchString);

+    if (searchString.isEmpty()) {

+      return linkedData;

+    }

     String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();

     try {

 

@@ -191,7 +201,7 @@
         //if the name is already there, return the list of cavhed results

         return get;

       }

-      QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);

+      QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, usgsAnalyzer);

       Query q = parser.parse(luceneQueryString);

 

       TopDocs search = usgsSearcher.search(q, rowsReturned);

@@ -253,30 +263,34 @@
       }

 

     } catch (IOException | ParseException ex) {

-      System.err.println(ex);

+      LOGGER.error(ex);

     }

 

     return linkedData;

   }

 

+  private String cleanInput(String input) {

+    return input.replaceAll(REGEX_CLEAN, "").trim();

+  }

+

   private void init() throws Exception {

     if (usgsIndex == null) {

       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

       if (indexloc.equals("")) {

-        System.out.println("USGS Gaz location not found");

-

+        // System.out.println("USGS Gaz location not found");

+        LOGGER.error(new Exception("USGS Gaz location not found"));

       }

       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       scoreCutoff = Double.valueOf(cutoff);

       usgsIndex = new MMapDirectory(new File(indexloc));

       usgsReader = DirectoryReader.open(usgsIndex);

       usgsSearcher = new IndexSearcher(usgsReader);

-      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));

+      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

     }

     if (geonamesIndex == null) {

       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");

       if (indexloc.equals("")) {

-        System.out.println("Geonames Gaz location not found");

+        LOGGER.error(new Exception("Geonames Gaz location not found"));

 

       }

       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

@@ -285,7 +299,7 @@
       geonamesReader = DirectoryReader.open(geonamesIndex);

       geonamesSearcher = new IndexSearcher(geonamesReader);

       //TODO: a language code switch statement should be employed here at some point

-      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));

+      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

 

     }

   }

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 1804020..b147d27 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -39,7 +39,6 @@
   private GazetteerSearcher gazateerSearcher;

   private List<LinkedEntityScorer> scorers = new ArrayList<>();

 

-

   @Override

   public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {

     ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();

@@ -64,7 +63,7 @@
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();

         if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)

                 || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {

-        

+

           if (!countryMentions.keySet().isEmpty()) {

             for (String code : countryMentions.keySet()) {

               if (!code.equals("us")) {

@@ -82,7 +81,8 @@
           //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);

           usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));

         }

-        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd());

+        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd(), "location",names[i].getProb());

+    

 

         if (!usgsEntries.isEmpty()) {

           geoSpan.getLinkedEntries().addAll(usgsEntries);

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
index 8a1564f..35b423a 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
@@ -27,6 +27,7 @@
 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

 import opennlp.tools.util.Span;

+import org.apache.log4j.Logger;

 

 /**

  *

@@ -34,6 +35,7 @@
  */

 public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {

 

+  private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);

   DocumentCategorizerME documentCategorizerME;

   DoccatModel doccatModel;

   public static final int RADIUS = 200;

@@ -45,12 +47,9 @@
       if (doccatModel == null) {

         String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");

         if (path.equals("")) {

-          if (!modelexists) {

-            System.err.println(this.getClass().getSimpleName() + ": could not find property \"opennlp.geoentitylinker.modelbasedscorer.modelpath\" : no ModelBasedScoring will be performed");

-          }

-          modelexists = true;

           return;

         }

+        modelexists = true;

         doccatModel = new DoccatModel(new File(path));

         documentCategorizerME = new DocumentCategorizerME(doccatModel);

       }

@@ -67,11 +66,11 @@
       }

 

     } catch (FileNotFoundException ex) {

-      System.err.println(this.getClass().getSimpleName() + ": could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");

+      LOGGER.error(ex);

     } catch (IOException ex) {

-      System.err.println(ex);

+      LOGGER.error(ex);

     } catch (Exception ex) {

-      System.err.println(ex);

+      LOGGER.error(ex);

     }

   }

 

@@ -80,11 +79,11 @@
    * radius of a mention within the doctext

    *

    * @param linkedSpans

+   * @param sentenceSpans

    * @param docText

-   * @param additionalContext

    * @param radius

    * @return a map of the index of the linked span to the string of surrounding

-   *         text: Map<indexofspan,surrounding text>

+   * text: Map<indexofspan,surrounding text>

    */

   public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {

     Map<Integer, String> featureBags = new HashMap<>();

@@ -115,7 +114,6 @@
       featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));

     }

 

-

     return featureBags;

   }

 

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
index 81ca1eb..bf7f701 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
@@ -15,6 +15,9 @@
  */

 package opennlp.addons.geoentitylinker;

 

+import com.spatial4j.core.context.SpatialContext;

+import com.spatial4j.core.io.GeohashUtils;

+import com.spatial4j.core.shape.Point;

 import java.util.ArrayList;

 import java.util.HashMap;

 import java.util.List;

@@ -42,8 +45,7 @@
       Double latw = entry.getLatitude();

       Double lonw = entry.getLongitude();

 

-

-      String key = simpleGeohash(latw, lonw).substring(0, precision);

+      String key = geoHash(latw, lonw).substring(0, precision);

       if (map.containsKey(key)) {

         map.get(key).add(entry);

       } else {

@@ -72,7 +74,44 @@
       }

     }

 

+  }

 

+  /**

+   * Returns a geohash based on Lucene Spatial

+   *

+   * @param lat the input latitude Y

+   * @param lon the input longitude X

+   * @return

+   */

+  public String geoHash(Double lat, Double lon) {

+    String encodeLatLon = GeohashUtils.encodeLatLon(lat, lon);

+    return encodeLatLon;

+  }

+

+  /**

+   * Returns the X and Y point for the geohash. Element 0 is the X (longitude)

+   * element 1 is the Y (latitude)

+   *

+   * @param geohash

+   * @return

+   */

+  public double[] geoHashToPoint(String geohash) {

+    Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO);

+    double[] coords = new double[]{decode.getX(), decode.getY()};

+    return coords;

+  }

+

+  /**

+   * Returns the point decoded from the geohash as the string "X,Y", that is,

+   * the X (longitude) followed by the Y (latitude), separated by a comma.

+   *

+   * @param geohash

+   * @return

+   */

+  public String geoHashToPointStr(String geohash) {

+    Point decode = GeohashUtils.decode(geohash, SpatialContext.GEO);

+    String point = decode.getX() + "," + decode.getY();

+    return point;

   }

 

   /**