OPENNLP-756 OPENNLP-750 Improved regex handling in the scorers and the country context generator. Upgraded the Lucene dependency to 6.0.0. Fixed ProvinceProximityScorer and CountryProximityScorer. Fixed the number-of-rows-returned bug. Added regex support for Country and Province in the country context file, and added column headers for easier editing in tools such as Excel. Cleaned up some other code; a new CountryContext file will be posted to OPENNLP-756. All indexes should be rebuilt because of the new country context file format written by the GazetteerIndexer class.

commit: 60f958e3cb3570c30b315caa9c406f21a9394d62 [log] [tgz]
author: Mark Giaconia <markg@apache.org> Thu Jun 09 20:09:01 2016 +0000
committer: Mark Giaconia <markg@apache.org> Thu Jun 09 20:09:01 2016 +0000
tree: 7569a2c18e20124454ef7ec2ddb777450ea4679e
parent: 3be3a5153a21a37ef4b6ae700c7f52b7a37bc7b8 [diff]
diff --git a/geoentitylinker-addon/pom.xml b/geoentitylinker-addon/pom.xml
index b4f727b..07d03cc 100644
--- a/geoentitylinker-addon/pom.xml
+++ b/geoentitylinker-addon/pom.xml

@@ -23,7 +23,7 @@
     <parent>

         <groupId>org.apache.opennlp</groupId>

         <artifactId>opennlp</artifactId>

-        <version>1.6.0-SNAPSHOT</version>

+        <version>1.6.0</version>

         <relativePath>../opennlp/pom.xml</relativePath>

     </parent>

 

@@ -62,31 +62,33 @@
             <artifactId>log4j</artifactId>

             <version>1.2.16</version>

         </dependency>

-        <dependency>

-            <groupId>org.apache.lucene</groupId>

-            <artifactId>lucene-spatial</artifactId>

-            <version>4.8.0</version>

-        </dependency>

+      

             

         <dependency>

             <groupId>org.apache.lucene</groupId>

             <artifactId>lucene-core</artifactId>

-            <version>4.8.0</version>

+            <version>6.0.0</version>

         </dependency>

         <dependency>

             <groupId>org.apache.lucene</groupId>

             <artifactId>lucene-analyzers-common</artifactId>

-            <version>4.8.0</version>

+            <version>6.0.0</version>

         </dependency>

         <dependency>

             <groupId>org.apache.lucene</groupId>

             <artifactId>lucene-queryparser</artifactId>

-            <version>4.8.0</version>

+            <version>6.0.0</version>

         </dependency>

         <dependency>

             <groupId>org.apache.opennlp</groupId>

             <artifactId>opennlp-tools</artifactId>

             <version>1.6.0</version>

         </dependency>

+        <dependency>

+            <groupId>com.spatial4j</groupId>

+            <artifactId>spatial4j</artifactId>

+            <version>0.4.1</version>

+            <type>jar</type>

+        </dependency>

     </dependencies>

 </project>


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
index 7961f1d..a01b0bb 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java

@@ -30,23 +30,22 @@
   private final String countryName;

   private final String countyName;

   private final String countyCode;

+  private final String countryRegex;

+  private final String provinceRegex;

+  private final String countyRegex;

 

-  public AdminBoundary(String cc, String ac, String pname, String countryName) {

-    this.countryCode = cc;

-    this.provinceCode = ac;

-    this.provinceName = pname;

-    this.countryName = countryName;

-    this.countyCode = NO_DATA_FOUND_VALUE;

-    this.countyName = NO_DATA_FOUND_VALUE;

-  }

 

-  public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName) {

+  public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName,

+      String countryRegex, String provinceRegex, String countyRegex) {

     this.countryCode = countryCode;

     this.provinceCode = provinceCode;

     this.provinceName = provinceName;

     this.countryName = countryName;

     this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName;

     this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode;

+    this.countryRegex = countryRegex;

+    this.provinceRegex = provinceRegex;

+    this.countyRegex = countyRegex;

   }

 

   public String getCountryCode() {

@@ -120,4 +119,20 @@
     return true;

   }

 

+  public String getProvinceCode() {

+    return provinceCode;

+  }

+

+  public String getCountryRegex() {

+    return countryRegex;

+  }

+

+  public String getProvinceRegex() {

+    return provinceRegex;

+  }

+

+  public String getCountyRegex() {

+    return countyRegex;

+  }

+

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
index ba0407e..4fccffb 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java

@@ -29,20 +29,24 @@
   private final Set<String> provHits;

   private final Set<String> countyHits;

   private final Map<String, String> countryRefMap;

+  private final Map<String, String> countryRegexMap;

+  private final Map<String, String> countyRegexMap;

+  private final Map<String, String> provinceRegexMap;

   private final Map<String, Map<String, String>> provRefMap;

   private final Map<String, Map<String, String>> countyRefMap;

   private final Set<String> whereClauses;

   private final Map<String, Set<String>> nameCodesMap;

 

   public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,

-          Map<String, Set<Integer>> provMentions,

-          Map<String, Set<Integer>> countyMentions,

-          Set<String> countryHits,

-          Set<String> provHits,

-          Set<String> countyHits,

-          Map<String, String> countryRefMap,

-          Map<String, Map<String, String>> provRefMap,

-          Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap) {

+      Map<String, Set<Integer>> provMentions,

+      Map<String, Set<Integer>> countyMentions,

+      Set<String> countryHits,

+      Set<String> provHits,

+      Set<String> countyHits,

+      Map<String, String> countryRefMap,

+      Map<String, Map<String, String>> provRefMap,

+      Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap, Map<String, String> countryRegexMap, Map<String, String> provinceRegexMap,

+      Map<String, String> countyRegexMap) {

     this.countryMentions = countryMentions;

     this.provMentions = provMentions;

     this.countyMentions = countyMentions;

@@ -54,6 +58,9 @@
     this.countyRefMap = countyRefMap;

     this.whereClauses = setWhereClauses();

     this.nameCodesMap = nameCodesMap;

+    this.countryRegexMap = countryRegexMap;

+    this.provinceRegexMap = provinceRegexMap;

+    this.countyRegexMap = countyRegexMap;

   }

 

   public Map<String, Set<String>> getNameCodesMap() {

@@ -131,4 +138,16 @@
     return clauses;

   }

 

+  public Map<String, String> getCountryRegexMap() {

+    return countryRegexMap;

+  }

+

+  public Map<String, String> getCountyRegexMap() {

+    return countyRegexMap;

+  }

+

+  public Map<String, String> getProvinceRegexMap() {

+    return provinceRegexMap;

+  }

+

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
index 556caa1..b645156 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java

@@ -43,9 +43,14 @@
   private List<CountryContextEntry> countrydata;

   private Map<String, Set<String>> nameCodesMap = new HashMap<>();

   private Map<String, Set<Integer>> countryMentions = new HashMap<>();

+

+  Map<String, String> countryRegexMap = new HashMap<>();

+  Map<String, String> provinceRegexMap = new HashMap<>();

+  Map<String, String> countyRegexMap = new HashMap<>();

+

   private Set<CountryContextEntry> countryHits = new HashSet<>();

   private EntityLinkerProperties properties;

-  private List<AdminBoundary> adminBoundaryData;

+  private List<AdminBoundary> adminBoundaryData= new ArrayList<>();

   private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();

   private AdminBoundaryContext context;

 

@@ -70,9 +75,8 @@
 

   public static void main(String[] args) {

     try {

-      AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));

-      GeoEntityLinker linker = new GeoEntityLinker();

-      linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));

+      AdminBoundaryContextGenerator countryContext

+          = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("C:\\Temp\\gaz_data\\newCountryContextfile.txt")));

 

       AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool.");

       System.out.println(c);

@@ -93,7 +97,7 @@
         throw new IOException("missing country context file");

       }

       //countrydata = getCountryContextFromFile(countryContextFile);

-      adminBoundaryData = getContextFromFile(countryContextFile);

+      getContextFromFile(countryContextFile);

       if (adminBoundaryData.isEmpty()) {

         throw new IOException("missing country context data");

       }

@@ -150,17 +154,17 @@
     try {

 

       reset();

-      Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);

+      Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet, "country");

       if (!countryhitMap.isEmpty()) {

         for (String cc : countryhitMap.keySet()) {

           Map<String, String> provsForCc = provMap.get(cc);

           if (provsForCc != null) {

-            provMentions.putAll(regexfind(text, provsForCc, provHits));

+            provMentions.putAll(regexfind(text, provsForCc, provHits, "province"));

             if (provMentions != null) {

               for (String prov : provMentions.keySet()) {

                 Map<String, String> get = countyMap.get(prov);

                 if (get != null) {

-                  countyMentions.putAll(regexfind(text, get, countyHits));

+                  countyMentions.putAll(regexfind(text, get, countyHits, "province"));

                 }

               }

             }

@@ -169,7 +173,7 @@
       } else {

         for (Map<String, String> provsForCc : provMap.values()) {

           if (provsForCc != null) {

-            provMentions = regexfind(text, provsForCc, provHits);

+            provMentions = regexfind(text, provsForCc, provHits, "province");

             if (provMentions != null) {

               for (String prov : provMentions.keySet()) {

                 //fake a country hit based on a province hit... this gets fuzzy

@@ -182,7 +186,7 @@
                 }

                 Map<String, String> get = countyMap.get(prov);

                 if (get != null) {

-                  countyMentions = regexfind(text, get, countyHits);

+                  countyMentions = regexfind(text, get, countyHits, "oounty");

                 }

               }

             }

@@ -199,7 +203,9 @@
         }

       }

 

-      AdminBoundaryContext context = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits, countryRefMap, provMap, countyMap, nameCodesMap);

+      AdminBoundaryContext context

+          = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits,

+              countryRefMap, provMap, countyMap, nameCodesMap, countryRegexMap, provinceRegexMap, countyRegexMap);

 

       return context;

     } catch (Exception e) {

@@ -208,7 +214,6 @@
     return null;

   }

 

-

   /**

    * discovers indicators of admin boundary data using regex.

    *

@@ -218,7 +223,7 @@
    * @param hitsRef a reference to a set that stores the hits by id

    * @return

    */

-  private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef) {

+  private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef, String locationType) {

     Map<String, Set<Integer>> mentions = new HashMap<>();

     if (lookupMap == null) {

       return mentions;

@@ -226,10 +231,29 @@
     try {

 

       for (String entry : lookupMap.keySet()) {

+

         String name = lookupMap.get(entry).toLowerCase();

         if (name == null) {

           continue;

         }

+        switch (locationType) {

+          case "country":

+            if (this.countryRegexMap.containsKey(entry)) {

+              name = countryRegexMap.get(entry);

+            }

+            break;

+

+          case "province":

+            if (this.provinceRegexMap.containsKey(entry)) {

+              name = provinceRegexMap.get(entry);

+            }

+            break;

+          case "county":

+            if (this.countyRegexMap.containsKey(entry)) {

+              name = countyRegexMap.get(entry);

+            }

+            break;

+        }

         name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)";

         Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

         Matcher rs = regex.matcher(docText);

@@ -274,39 +298,37 @@
     return mentions;

   }

 

-  private List<AdminBoundary> getContextFromFile(File countryContextFile) {

+  private void getContextFromFile(File countryContextFile) {

     if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {

-      return adminBoundaryData;

+      return;

     }

-    List<AdminBoundary> entries = new ArrayList<>();

+

     BufferedReader reader;

     try {

       reader = new BufferedReader(new FileReader(countryContextFile));

       String line = "";

+      int lineNum = 0;

       while ((line = reader.readLine()) != null) {

         String[] values = line.split("\t");

-        int len = values.length;

-        if (len < 5 || len > 6) {

-          throw new IllegalArgumentException("Improperly formatted file");

+        if (lineNum == 0) {

+          lineNum++;

+          continue;

+          //skip column name headers

         }

-        if (values.length == 6) {

+        if (values.length == 9) {

           AdminBoundary entry = new AdminBoundary(

-                  values[0].toLowerCase().trim().replace("ï»¿", ""),

-                  values[3].toLowerCase().trim(),

-                  values[1].toLowerCase().trim(),

-                  values[4].toLowerCase().trim(),

-                  values[2].toLowerCase().trim(),

-                  values[5].toLowerCase().trim());

-          entries.add(entry);

+              values[0].toLowerCase().trim().replace("ï»¿", ""),

+              values[3].toLowerCase().trim(),

+              values[1].toLowerCase().trim(),

+              values[4].toLowerCase().trim(),

+              values[2].toLowerCase().trim(),

+              values[5].toLowerCase().trim(),

+              values[6].toLowerCase().trim(),

+              values[7].toLowerCase().trim(),

+              values[8].toLowerCase().trim());

+          this.adminBoundaryData.add(entry);

         } else {

-          AdminBoundary entry = new AdminBoundary(

-                values[0].toLowerCase().trim().replace("ï»¿", ""),

-                  values[3].toLowerCase().trim(),

-                  values[1].toLowerCase().trim(),

-                  values[4].toLowerCase().trim(),

-                  values[2].toLowerCase().trim(),

-                  "");

-          entries.add(entry);

+          throw new IllegalArgumentException("Improperly formatted file");

         }

 

       }

@@ -314,8 +336,8 @@
     } catch (IOException ex) {

       LOGGER.error(ex);

     }

-    loadMaps(entries);

-    return entries;

+

+    loadMaps(this.adminBoundaryData);

 

   }

 

@@ -323,6 +345,15 @@
     for (AdminBoundary adm : boundaries) {

       if (!adm.getCountryCode().equals("null")) {

         countryMap.put(adm.getCountryCode(), adm.getCountryName());

+        if (countryRegexMap.containsKey(adm.getCountryCode())) {

+          String currentRegex = countryRegexMap.get(adm.getCountryCode());

+          if (currentRegex.length() > adm.getCountryRegex().length()) {

+            // the longest one wins if they are not all the same for each entry in the file

+            countryRegexMap.put(adm.getCountryCode(), currentRegex);

+          }//else do nothing

+        } else {

+          countryRegexMap.put(adm.getCountryCode(), adm.getCountryRegex());

+        }

 

         if (!adm.getProvCode().equals("null")) {

           Map<String, String> provs = provMap.get(adm.getCountryCode());

@@ -349,6 +380,43 @@
         }

       }

     }

+    fillProvRegexMap();

+    fillCountyRegexMap();

+  }

+

+  private void fillProvRegexMap() {

+    this.provinceRegexMap = new HashMap<>();

+    // this.adminBoundaryData

+    for (AdminBoundary adm : adminBoundaryData) {

+

+      if (provinceRegexMap.containsKey(adm.getProvCode())) {

+        String currentRegex = provinceRegexMap.get(adm.getProvCode());

+        if (currentRegex.length() > adm.getProvinceRegex().length()) {

+          // the longest one wins if they are not all the same for each entry in the file

+          provinceRegexMap.put(adm.getProvCode(), currentRegex);

+        }//else do nothing

+      } else {

+        provinceRegexMap.put(adm.getProvCode(), adm.getProvinceRegex());

+      }

+    }

+  }

+

+  private void fillCountyRegexMap() {

+    this.countyRegexMap = new HashMap<>();

+    // this.adminBoundaryData

+    for (AdminBoundary adm : adminBoundaryData) {

+

+      if (countyRegexMap.containsKey(adm.getCountyCode())) {

+        String currentRegex = countyRegexMap.get(adm.getCountyCode());

+        if (currentRegex.length() > adm.getCountyRegex().length()) {

+          // the longest one wins if they are not all the same for each entry in the file

+          countyRegexMap.put(adm.getCountyCode(), currentRegex);

+        }//else do nothing

+      } else {

+        countyRegexMap.put(adm.getCountyCode(), adm.getCountyRegex());

+      }

+    }

+

   }

 

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 3a5eeb0..e18253d 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java

@@ -17,6 +17,7 @@
 

 import java.io.File;

 import java.io.IOException;

+import java.nio.file.Paths;

 import java.util.ArrayList;

 import java.util.HashMap;

 import java.util.List;

@@ -101,10 +102,10 @@
        * build the search string Sometimes no country context is found. In this

        * case the code variables will be empty strings

        */

-      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ")" + "AND " + whereClause;

+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") " + "AND " + whereClause;

       if (searchString.trim().contains(" ") && useHierarchyField) {

         placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"

-                + " AND " + whereClause;

+            + " AND " + whereClause;

       }

 

       /**

@@ -118,7 +119,7 @@
       /**

        * search the placename

        */

-      QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);

+      QueryParser parser = new QueryParser(placeNameQueryString, opennlpAnalyzer);

       Query q = parser.parse(placeNameQueryString);

       //Filter filter = new QueryWrapperFilter(new QueryParser(Version.LUCENE_48, whereClause, opennlpAnalyzer).parse(whereClause));      

 

@@ -160,7 +161,7 @@
         for (int idx = 0; idx < fields.size(); idx++) {

           entry.getIndexData().put(fields.get(idx).name(), d.get(fields.get(idx).name()));

         }

-       

+

         /**

          * only want hits above the levenstein thresh. This should be a low

          * thresh due to the use of the hierarchy field in the index

@@ -178,7 +179,7 @@
         }

         //}

       }

-      

+

     } catch (IOException | ParseException ex) {

       LOGGER.error(ex);

     }

@@ -186,8 +187,6 @@
     return linkedData;

   }

 

- 

-

   /**

    * Replaces any noise chars with a space, and depending on configuration adds

    * double quotes to the string

@@ -215,12 +214,12 @@
 

       }

 

-      opennlpIndex = new MMapDirectory(new File(indexloc));

+      opennlpIndex = new MMapDirectory(Paths.get(indexloc));

       opennlpReader = DirectoryReader.open(opennlpIndex);

       opennlpSearcher = new IndexSearcher(opennlpReader);

       opennlpAnalyzer

-              = //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-              new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+          = //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+          new StandardAnalyzer(new CharArraySet(new ArrayList(), true));

       Map<String, Analyzer> analyMap = new HashMap<>();

 

       analyMap.put("countrycode", new KeywordAnalyzer());

@@ -230,7 +229,7 @@
       analyMap.put("gazsource", new KeywordAnalyzer());

 

       opennlpAnalyzer

-              = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);

+          = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);

 

       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 4d01718..43be5d5 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

@@ -25,6 +25,7 @@
 import java.util.Collections;

 import java.util.Comparator;

 import java.util.HashMap;

+import java.util.Iterator;

 import java.util.List;

 import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;

 import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;

@@ -59,10 +60,10 @@
     AdminBoundaryContext context = countryContext.getContext(doctext);

     for (int s = 0; s < sentences.length; s++) {

       Span[] names = namesBySentence[s];

-      

+

       Span[] tokenSpans = tokensBySentence[s];

       String[] tokens = Span.spansToStrings(tokenSpans, sentences[s].getCoveredText(doctext));

-      

+

       String[] matches = Span.spansToStrings(names, tokens);

 

       for (int i = 0; i < matches.length; i++) {

@@ -140,19 +141,30 @@
           double sumo2 = 0d;

           for (String object : o1scoreMap.keySet()) {

             if (object.equals("typescore")

-                    || object.equals("countrycontext")

-                    || object.equals("placenamedicecoef")

-                    || object.equals("geohashbin")

-                    || object.equals("normlucene")) {

+                || object.equals("countrycontext")

+                || object.equals("placenamedicecoef")

+                || object.equals("provincecontext")

+                || object.equals("geohashbin")

+                || object.equals("normlucene")) {

               sumo1 += o1scoreMap.get(object);

               sumo2 += o2scoreMap.get(object);

             }

           }

 

           return Double.compare(sumo1,

-                  sumo2);

+              sumo2);

         }

       }));

+      //prune the list to topN

+      Iterator iterator = linkedData.iterator();

+      int n = 0;

+      while (iterator.hasNext()) {

+        if (n >= topN) {

+          iterator.remove();

+        }

+        iterator.next();

+        n++;

+      }

     }

 

     return spans;

@@ -186,26 +198,26 @@
 

   @Override

   public void init(EntityLinkerProperties properties) throws IOException {

-   

-      this.linkerProperties = properties;

-      countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

-      gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

-      String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2");

-      Integer rws = 2;

-      try {

-        rws = Integer.valueOf(rowsRetStr);

-      } catch (NumberFormatException e) {

-        rws = 2;

-      }

-      topN = rws;

-      loadScorers();

-    

+

+    this.linkerProperties = properties;

+    countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

+    gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

+    String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2");

+    Integer rws = 2;

+    try {

+      rws = Integer.valueOf(rowsRetStr);

+    } catch (NumberFormatException e) {

+      rws = 2;

+    }

+    topN = rws;

+    loadScorers();

+

   }

 

   @Override

-  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence, 

-		  Span[][] namesBySentence, int sentenceIndex) {

+  public List<LinkedSpan> find(String doctext, Span[] sentences, Span[][] tokensBySentence,

+      Span[][] namesBySentence, int sentenceIndex) {

     throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document "

-    		+ "for proper scoring. This method is unsupported");

+        + "for proper scoring. This method is unsupported");

   }

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
index c0d9642..88ca56f 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java

@@ -17,6 +17,8 @@
 

 import java.io.File;

 import java.io.FileNotFoundException;

+import java.io.FileWriter;

+import java.nio.file.Paths;

 import java.util.ArrayList;

 import java.util.HashMap;

 import java.util.Map;

@@ -30,7 +32,6 @@
 import org.apache.lucene.index.IndexWriterConfig;

 import org.apache.lucene.store.Directory;

 import org.apache.lucene.store.MMapDirectory;

-import org.apache.lucene.util.Version;

 

 /**

  * Creates two lucene indexes, geonames and usgs for use in GeoEntityLinker.

@@ -47,15 +48,15 @@
       return;

     }

 

-    File geonamesData = new File(args[0]); 

-    File geoNamesCountryInfo = new File(args[1]); 

+    File geonamesData = new File(args[0]);

+    File geoNamesCountryInfo = new File(args[1]);

     File geonamesAdmin1CodesASCII = new File(args[2]);

-    File usgsDataFile = new File(args[3]); 

-    File usgsGovUnitsFile = new File(args[4]); 

-    File outputIndexDir = new File(args[5]); 

-    File outputCountryContextFile = new File(args[6]); 

+    File usgsDataFile = new File(args[3]);

+    File usgsGovUnitsFile = new File(args[4]);

+    File outputIndexDir = new File(args[5]);

+    File outputCountryContextFile = new File(args[6]);

     File regionsFile = new File(args[7]);

-    

+

     try {

       GazetteerIndexer i = new GazetteerIndexer();

       i.index(geonamesData,

@@ -83,27 +84,27 @@
   public enum GazType implements Separable {

 

     GEONAMES {

-              @Override

-              public String toString() {

-                return "/opennlp_geoentitylinker_geonames_idx";

-              }

+          @Override

+          public String toString() {

+            return "/opennlp_geoentitylinker_geonames_idx";

+          }

 

-              @Override

-              public String getSeparator() {

-                return "\t";

-              }

-            },

+          @Override

+          public String getSeparator() {

+            return "\t";

+          }

+        },

     USGS {

-              @Override

-              public String toString() {

-                return "/opennlp_geoentitylinker_usgsgaz_idx";

-              }

+          @Override

+          public String toString() {

+            return "/opennlp_geoentitylinker_usgsgaz_idx";

+          }

 

-              @Override

-              public String getSeparator() {

-                return "\\|";

-              }

-            }

+          @Override

+          public String getSeparator() {

+            return "\\|";

+          }

+        }

   }

 

   /**

@@ -113,7 +114,8 @@
    * 'allCountries.zip'

    * @param geoNamesCountryInfo the countryinfo lookup table that can be

    * downloaded from here

-   * http://download.geonames.org/export/dump/countryinfo.txt

+   * http://download.geonames.org/export/dump/countryInfo.txt You'll need to

+   * copy the page into a file or scrape it

    * @param geonamesAdmin1CodesASCII The lookup data for the province names for

    * each place found here:

    * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the

@@ -138,7 +140,7 @@
    * @throws Exception

    */

   public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,

-          File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {

+      File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {

     if (!outputIndexDir.isDirectory()) {

       throw new IllegalArgumentException("outputIndexDir must be a directory.");

     }

@@ -166,8 +168,8 @@
     }

 

     String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";

-    Directory index = new MMapDirectory(new File(indexloc));

-    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+    Directory index = new MMapDirectory(Paths.get(indexloc));

+    Analyzer a = new StandardAnalyzer(new CharArraySet(new ArrayList(), true));

     Map<String, Analyzer> analyMap = new HashMap<>();

 

     analyMap.put("countrycode", new KeywordAnalyzer());

@@ -175,13 +177,22 @@
     analyMap.put("loctype", new KeywordAnalyzer());

     analyMap.put("countycode", new KeywordAnalyzer());

     analyMap.put("gazsource", new KeywordAnalyzer());

-    

-    PerFieldAnalyzerWrapper aWrapper

-            = new PerFieldAnalyzerWrapper(a, analyMap);

 

-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, aWrapper);

+    PerFieldAnalyzerWrapper aWrapper

+        = new PerFieldAnalyzerWrapper(a, analyMap);

+

+    IndexWriterConfig config = new IndexWriterConfig(aWrapper);

 

     IndexWriter w = new IndexWriter(index, config);

+    

+    //write the column headers for the countryContextFile 

+    FileWriter countryContextFileWriter = new FileWriter(outputCountryContextFile, false);

+    String colNamesForCountryContextFile = "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";

+    countryContextFileWriter.write(colNamesForCountryContextFile);

+    countryContextFileWriter.flush();

+    countryContextFileWriter.close();

+    

+    

     USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);

 

     GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
index 40344ed..8b57aaa 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java

@@ -92,7 +92,7 @@
         if (cname == null) {

           nullcodes.add(ccode);

         }

-        AdminBoundary data = new AdminBoundary(ccode, pcode, pname, cname);

+        AdminBoundary data = new AdminBoundary(ccode, cname, pcode, pname, "NO_DATA_FOUND", "NO_DATA_FOUND", cname, pname, "NO_DATA_FOUND");

         //  System.out.println(data);

         outmap.put(ccode + "." + pcode, data);

 

@@ -155,7 +155,8 @@
         String province = adm.getProvinceName();

         String country = adm.getCountryName();

 

-        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n";

+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\t" + "(" + country + ")" + "\t"

+            + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";

         writer.write(line);

         // System.out.println(line);

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
index 93b51ac..f457822 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java

@@ -89,7 +89,8 @@
         //countrycontext file format

         // US	KY	131	United States	Kentucky	Leslie

 

-        ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");

+        ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("

+            + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n");

         if (w != null) {

           w.addDocument(doc);

         }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index 26f2c2b..fcd61c1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java

@@ -82,7 +82,13 @@
         String admincode = values[3];

         AdminBoundary get = lookupMap.get(admincode + "." + ccode);

         String countyname = "";

+        if (get == null) {

+          System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);

+          continue;

+

+        }

         String countyCode = get.getCountyCode();

+

         if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {

           countyname = get.getCountyName();

         }

@@ -125,8 +131,7 @@
       }

 

     }

-   

-  

+

     for (String state : states.keySet()) {

       StateCentroid get = states.get(state);

       Document doc = new Document();

@@ -143,8 +148,8 @@
       doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));

       doc.add(new StringField("gazsource", "usgs", Field.Store.YES));

       w.addDocument(doc);

-      

-     // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));

+

+      // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));

     }

     Document doc = new Document();

     doc.add(new TextField("hierarchy", "united states", Field.Store.YES));

@@ -202,7 +207,7 @@
         String stateName = values[6];

         String countryCode = values[7];

         String countryName = values[8];

-        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName);

+        AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null);

         outmap.put(stateCode + "." + countyCode, adminBoundary);

         //  System.out.println(adminBoundary);

 

@@ -232,7 +237,8 @@
          * this is the standard format of the country context file... Geonames

          * data will have an empty string for the county

          */

-        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\n";

+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t"

+            + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";

         writer.write(line);

         ///  System.out.println(line);

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index ff4219f..aea8f9b 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java

@@ -22,6 +22,7 @@
 import java.util.Map;

 import java.util.Set;

 import java.util.TreeSet;

+import java.util.regex.Pattern;

 import opennlp.addons.geoentitylinker.AdminBoundaryContext;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.BaseLink;

@@ -41,10 +42,12 @@
 

   private Map<String, Set<String>> nameCodesMap;

   String dominantCode = "";

+  private Map<String, String> regexMap = new HashMap<>();

 

   @Override

   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

 

+    regexMap = additionalContext.getCountryRegexMap();

     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

 

   }

@@ -54,20 +57,19 @@
    * matches. Currently the scoring indicates the probability that the toponym

    * is correct based on the country context in the document

    *

-   * @param linkedData     the linked spans, holds the Namefinder results, and

-   *                       the list of BaseLink for each

-   * @param countryHits    all the country mentions in the document

-   * @param nameCodesMap   maps a country indicator name to a country code. Used

-   *                       to determine if the namefinder found the same exact

-   *                       toponym the country context did. If so the score is

-   *                       boosted due to the high probability that the

-   *                       NameFinder actually "rediscovered" a country

-   * @param docText        the full text of the document...not used in this

-   *                       default implementation

-   * @param sentences      the sentences that correspond to the doc text.

+   * @param linkedData the linked spans, holds the Namefinder results, and the

+   * list of BaseLink for each

+   * @param countryHits all the country mentions in the document

+   * @param nameCodesMap maps a country indicator name to a country code. Used

+   * to determine if the namefinder found the same exact toponym the country

+   * context did. If so the score is boosted due to the high probability that

+   * the NameFinder actually "rediscovered" a country

+   * @param docText the full text of the document...not used in this default

+   * implementation

+   * @param sentences the sentences that correspond to the doc text.

    * @param maxAllowedDist a constant that is used to determine which country

-   *                       mentions, based on proximity within the text, should

-   *                       be used to score the Named Entity.

+   * mentions, based on proximity within the text, should be used to score the

+   * Named Entity.

    * @return

    */

   public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {

@@ -155,11 +157,10 @@
 

         score = scoreMap.get(spanCountryCode);

         ///does the name extracted match a country name?

-        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {

+        if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) {

           //if so, is it the correct country code for that name?

           if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {

             //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1

-            //TODO: make this smarter, and utilize province/state info in the future to be even more specific

             score = (score + .75) > 1.0 ? 1d : (score + .75);

 

             if (link.getItemParentID().equals(dominantCode)) {

@@ -168,17 +169,17 @@
           }

         }

       }

-      

+

       link.getScoreMap().put("countrycontext", score);

     }

     return span;

   }

 

   /**

-   * takes a map of distances from the toponym to each country mention and generates

-   * a map of scores for each country code. The map is then correlated to the

-   * code of the BaseLink parentid for retrieval. Then the

-   * score is added to the overall list.

+   * takes a map of distances from the toponym to each country mention and

+   * generates a map of scores for each country code. The map is then correlated

+   * to the code of the BaseLink parentid for retrieval. Then the score is added

+   * to the overall list.

    *

    * @param distanceMap

    * @param sentences

@@ -213,13 +214,22 @@
         normalizedDistances.add(reverse);

       }

 

-

       List<Double> doubles = new ArrayList<Double>(normalizedDistances);

       scoreMap.put(key, slidingDistanceAverage(doubles));

     }

     return scoreMap;

   }

 

+  private boolean regexMatch(String placeName, String countryCode) {

+    // true only when the regex on record for countryCode matches the whole trimmed placeName, ignoring case

+    String regexForCountry = regexMap == null ? null : regexMap.get(countryCode);

+    if (regexForCountry == null || placeName == null) {

+      return false; // no regex on record, or nothing to test: report "no match" instead of throwing

+    }

+    Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

+    return p.matcher(placeName.trim()).matches();

+  }

+

   /**

    * this method is an attempt to make closer clusters of mentions group

    * together to smooth out the average, so one distant outlier does not kill

@@ -259,8 +269,8 @@
    * range. Used to normalize distances in this class.

    *

    * @param valueToNormalize the value to place within the new range

-   * @param minimum          the min of the set to be transposed

-   * @param maximum          the max of the set to be transposed

+   * @param minimum the min of the set to be transposed

+   * @param maximum the max of the set to be transposed

    * @return

    */

   private Double normalize(int valueToNormalize, int minimum, int maximum) {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index e245a00..d9e7d19 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java

@@ -22,6 +22,7 @@
 import java.util.Map;

 import java.util.Set;

 import java.util.TreeSet;

+import java.util.regex.Pattern;

 import opennlp.addons.geoentitylinker.AdminBoundaryContext;

 import opennlp.addons.geoentitylinker.GazetteerEntry;

 import opennlp.tools.entitylinker.BaseLink;

@@ -43,10 +44,12 @@
 

   private Map<String, Set<String>> nameCodesMap;

   String dominantCode = "";

+  private Map<String, String> regexMap = new HashMap<>();

 

   @Override

   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

     if (!additionalContext.getProvHits().isEmpty()) {

+      regexMap = additionalContext.getProvinceRegexMap();

       score(linkedSpans, additionalContext.getProvMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);

     } else {

       for (LinkedSpan<BaseLink> span : linkedSpans) {

@@ -167,11 +170,11 @@
 

         score = scoreMap.get(spanCountryCode);

         ///does the name extracted match a province name?

-        if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {

+        if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) {

           //if so, is it the correct country code for that name?

           if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) {

             //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1

-            //TODO: make this smarter, and utilize province/state info in the future to be even more specific

+            //TODO: make this smarter

             score = (score + .75) > 1.0 ? 1d : (score + .75);

 

             if (entry.getProvinceCode().equals(dominantCode)) {

@@ -185,6 +188,16 @@
     return span;

   }

 

+  private boolean regexMatch(String placeName, String countryCode) {

+    // true only when the regex stored under the given code matches the entire trimmed placeName, case-insensitively

+    String regexForCountry = regexMap == null ? null : regexMap.get(countryCode);

+    if (regexForCountry == null || placeName == null) {

+      return false; // missing map, missing/null regex, or null name: treat as "no match" rather than an NPE

+    }

+    Pattern p = Pattern.compile(regexForCountry, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

+    return p.matcher(placeName.trim()).matches();

+  }

+

   /**

    * takes a map of distances from the toponym to each province mention and

    * generates a map of scores for each province code. The map is then
commit	60f958e3cb3570c30b315caa9c406f21a9394d62	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Thu Jun 09 20:09:01 2016 +0000
committer	Mark Giaconia <markg@apache.org>	Thu Jun 09 20:09:01 2016 +0000
tree	7569a2c18e20124454ef7ec2ddb777450ea4679e
parent	3be3a5153a21a37ef4b6ae700c7f52b7a37bc7b8 [diff]