OPENNLP-756
Many small changes across a few classes to add regex support to the country context file: entries in the country context file can now be regular expressions. A bug in AdminBoundaryContextGenerator was also fixed, which improved the performance of the ProvinceProximityScorer.
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
index c0d2645..556caa1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
@@ -74,14 +74,14 @@
       GeoEntityLinker linker = new GeoEntityLinker();

       linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));

 

-      countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a shithole. Eastern Africa people are cool.");

-

+      AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool.");

+      System.out.println(c);

     } catch (Exception ex) {

       java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);

     }

   }

 

-  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException{

+  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException {

     this.properties = properties;

     if (countrydata == null) {

       String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");

@@ -155,12 +155,12 @@
         for (String cc : countryhitMap.keySet()) {

           Map<String, String> provsForCc = provMap.get(cc);

           if (provsForCc != null) {

-            provMentions = regexfind(text, provsForCc, provHits);

+            provMentions.putAll(regexfind(text, provsForCc, provHits));

             if (provMentions != null) {

               for (String prov : provMentions.keySet()) {

                 Map<String, String> get = countyMap.get(prov);

                 if (get != null) {

-                  countyMentions = regexfind(text, get, countyHits);

+                  countyMentions.putAll(regexfind(text, get, countyHits));

                 }

               }

             }

@@ -208,64 +208,6 @@
     return null;

   }

 

-  /**

-   * Finds mentions of countries to assist in toponym resolution. Countries are

-   * discovered via regex based on a configured file called

-   * opennlp.geoentitylinker.countrycontext.txt. the file is configured using

-   * the entitylinker.properties file as such:

-   * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt

-   *

-   * Finding mentions in documents is very helpful for scoring. Lazily loads the

-   * list from the file.

-   *

-   * @param docText the full text of the document

-   * @return

-   */

-  @Deprecated

-  public Map<String, Set<Integer>> regexfind(String docText) {

-    countryMentions = new HashMap<>();

-    nameCodesMap.clear();

-    try {

-

-      for (CountryContextEntry entry : countrydata) {

-        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

-        Matcher rs = regex.matcher(docText);

-        String code = entry.getCc1().toLowerCase();

-

-        boolean found = false;

-        while (rs.find()) {

-          found = true;

-          Integer start = rs.start();

-          String hit = rs.group().toLowerCase();

-          if (countryMentions.containsKey(code)) {

-            countryMentions.get(code).add(start);

-          } else {

-            Set<Integer> newset = new HashSet<Integer>();

-            newset.add(start);

-            countryMentions.put(code, newset);

-          }

-          if (!hit.equals("")) {

-            if (this.nameCodesMap.containsKey(hit)) {

-              nameCodesMap.get(hit).add(code);

-            } else {

-              HashSet<String> newset = new HashSet<String>();

-              newset.add(code);

-              nameCodesMap.put(hit, newset);

-            }

-          }

-        }

-        if (found) {

-          countryHits.add(entry);

-        }

-

-      }

-

-    } catch (Exception ex) {

-      LOGGER.error(ex);

-    }

-

-    return countryMentions;

-  }

 

   /**

    * discovers indicators of admin boundary data using regex.

@@ -292,7 +234,7 @@
         Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

         Matcher rs = regex.matcher(docText);

         String code = entry.toLowerCase();

-

+        code = code.trim().replace("", "");

         boolean found = false;

         while (rs.find()) {

           found = true;

@@ -349,7 +291,7 @@
         }

         if (values.length == 6) {

           AdminBoundary entry = new AdminBoundary(

-                  values[0].toLowerCase().trim(),

+                  values[0].toLowerCase().trim().replace("", ""),

                   values[3].toLowerCase().trim(),

                   values[1].toLowerCase().trim(),

                   values[4].toLowerCase().trim(),

@@ -358,7 +300,7 @@
           entries.add(entry);

         } else {

           AdminBoundary entry = new AdminBoundary(

-                  values[0].toLowerCase().trim(),

+                values[0].toLowerCase().trim().replace("", ""),

                   values[3].toLowerCase().trim(),

                   values[1].toLowerCase().trim(),

                   values[4].toLowerCase().trim(),

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index d7c6dbf..3a5eeb0 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
@@ -76,7 +76,7 @@
     }

   }

 

-  public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {

+  public GazetteerSearcher(EntityLinkerProperties properties) throws IOException {

     this.properties = properties;

     init();

   }

@@ -206,7 +206,7 @@
 

   }

 

-  private void init() throws Exception {

+  private void init() throws IOException {

 

     if (opennlpIndex == null) {

       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index b6a8c1c..4d01718 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
@@ -186,7 +186,7 @@
 

   @Override

   public void init(EntityLinkerProperties properties) throws IOException {

-    try {

+   

       this.linkerProperties = properties;

       countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

@@ -199,9 +199,7 @@
       }

       topN = rws;

       loadScorers();

-    } catch (Exception ex) {

-      throw new IOException(ex);

-    }

+    

   }

 

   @Override

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
index 1c4a936..90a6e99 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
@@ -44,7 +44,7 @@
       i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),

               new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),

               new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),

-              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),

+              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20141202.txt.txt"),

               new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),

               new File("C:\\temp\\gazetteers\\"),

               new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index cc34b1a..ff4219f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
@@ -168,6 +168,7 @@
           }

         }

       }

+      

       link.getScoreMap().put("countrycontext", score);

     }

     return span;

diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index afdb4b1..e245a00 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
@@ -156,9 +156,12 @@
      * span is referring to the toponym form this code key>

      */

     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);

+    if (scoreMap.isEmpty()) {

+      return span;

+    }

     for (BaseLink link : span.getLinkedEntries()) {

       //getItemParentId is the country code

-    GazetteerEntry entry = (GazetteerEntry)link;

+      GazetteerEntry entry = (GazetteerEntry) link;

       String spanCountryCode = entry.getProvinceCode();

       if (scoreMap.containsKey(spanCountryCode)) {

 

@@ -184,9 +187,9 @@
 

   /**

    * takes a map of distances from the toponym to each province mention and

-   * generates a map of scores for each province code. The map is then correlated

-   * to the code of the BaseLink parentid for retrieval. Then the score is added

-   * to the overall list.

+   * generates a map of scores for each province code. The map is then

+   * correlated to the code of the BaseLink parentid for retrieval. Then the

+   * score is added to the overall list.

    *

    * @param distanceMap

    * @param sentences