OPENNLP-706 Significant fix to the indexing so that country names are properly discovered. Added a type-boosting scorer (PlacetypeScorer), and added a descending sort to the output of each call to the GeoEntityLinker. Also did some general cleanup. Made the number of matches returned from the gazetteer configurable via a property (opennlp.geoentitylinker.gaz.rowsreturned).

commit: 81a91cde31904a86429e795d45324cf5adaee82c [log] [tgz]
author: Mark Giaconia <markg@apache.org> Fri Aug 15 18:10:51 2014 +0000
committer: Mark Giaconia <markg@apache.org> Fri Aug 15 18:10:51 2014 +0000
tree: fbca7f6375944ff46390ac980a041a4e2d7ee9b4
parent: 36759584cec4a4ed6126c038dbb48101d9fb3de3 [diff]
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
index 323aabb..af7f704 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java

@@ -117,16 +117,16 @@
       }

       Map<String, String> provs = new HashMap<>();

 

-      if (!provsForCountry.isEmpty()) {

-        for (String pcode : provsForCountry.keySet()) {

-          if (this.getProvHits().contains(pcode)) {

-            provs.put(pcode, provsForCountry.get(pcode));

-

-            clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType);

-

-          }

-        }

-      }

+//      if (!provsForCountry.isEmpty()) {

+//        for (String pcode : provsForCountry.keySet()) {

+//          if (this.getProvHits().contains(pcode)) {

+//            provs.put(pcode, provsForCountry.get(pcode));

+//

+//            clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType);

+//

+//          }

+//        }

+//      }

       if (provs.isEmpty()) {

         //got a country with no mentioned provs

         clauses.add(" countrycode:" + countryCode + gazType);


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 22211d7..a416136 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java

@@ -18,7 +18,9 @@
 import java.io.File;

 import java.io.IOException;

 import java.util.ArrayList;

+import java.util.HashMap;

 import java.util.List;

+import java.util.Map;

 import java.util.logging.Level;

 import org.apache.lucene.analysis.Analyzer;

 import org.apache.lucene.analysis.standard.StandardAnalyzer;

@@ -37,31 +39,28 @@
 import org.apache.lucene.util.Version;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import org.apache.log4j.Logger;

+import org.apache.lucene.analysis.core.KeywordAnalyzer;

 import org.apache.lucene.analysis.util.CharArraySet;

 

 /**

  *

  * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of

- * these indices are based on loading the indexes using the

- * GeoEntityLinkerSetupUtils

+ * these indices are based on loading the indexes using the GazetteerIndexer

  *

  */

 public class GazetteerSearcher {

 

   //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";

-

   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";

   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);

   private double scoreCutoff = .70;

-  private boolean doubleQuoteAllSearchTerms = true;

+  private boolean doubleQuoteAllSearchTerms = false;

   private boolean useHierarchyField = false;

-  private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));

-  private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);

+

   private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);

   private Analyzer geonamesAnalyzer;

   //usgs US gazateer

-  private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));

-  private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);

+

   private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);

   private Analyzer usgsAnalyzer;

   private EntityLinkerProperties properties;

@@ -75,7 +74,7 @@
     try {

       boolean b = Boolean.valueOf("true");

 

-      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");

+      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it AND gazsource:geonames");

     } catch (IOException ex) {

       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);

     } catch (Exception ex) {

@@ -108,12 +107,12 @@
        * build the search string Sometimes no country context is found. In this

        * case the code variables will be empty strings

        */

-      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;

+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ")" + "AND " + whereClause;

       if (searchString.trim().contains(" ") && useHierarchyField) {

         placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"

                 + " AND " + whereClause;

       }

-       

+

       /**

        * check the cache and go no further if the records already exist

        */

@@ -127,6 +126,7 @@
        */

       QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);

       Query q = parser.parse(placeNameQueryString);

+      //Filter filter = new QueryWrapperFilter(new QueryParser(Version.LUCENE_48, whereClause, opennlpAnalyzer).parse(whereClause));      

 

       TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);

 

@@ -173,213 +173,8 @@
          * only want hits above the levenstein thresh. This should be a low

          * thresh due to the use of the hierarchy field in the index

          */

-        if (normLev > scoreCutoff) {

-          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {

-            entry.getScoreMap().put("normlucene", normLev);

-            //make sure we don't produce a duplicate

-            if (!linkedData.contains(entry)) {

-              linkedData.add(entry);

-              /**

-               * add the records to the cache for this query

-               */

-              GazetteerSearchCache.put(placeNameQueryString, linkedData);

-            }

-          }

-        }

-      }

-

-    } catch (IOException | ParseException ex) {

-      LOGGER.error(ex);

-    }

-

-    return linkedData;

-  }

-

-  /**

-   *

-   * @param searchString the named entity to look up in the lucene index

-   * @param rowsReturned how many rows to allow lucene to return

-   * @param code the country code

-   *

-   * @return

-   */

-  @Deprecated

-  public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {

-    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

-    searchString = cleanInput(searchString);

-    if (searchString.isEmpty()) {

-      return linkedData;

-    }

-    try {

-      /**

-       * build the search string Sometimes no country context is found. In this

-       * case the code variable will be an empty string

-       */

-      String luceneQueryString = !code.equals("")

-              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"

-              : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();

-      /**

-       * check the cache and go no further if the records already exist

-       */

-      ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);

-      if (get != null) {

-

-        return get;

-      }

-

-      QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, geonamesAnalyzer);

-      Query q = parser.parse(luceneQueryString);

-

-      TopDocs search = geonamesSearcher.search(q, rowsReturned);

-

-      for (int i = 0; i < search.scoreDocs.length; ++i) {

-        GazetteerEntry entry = new GazetteerEntry();

-        int docId = search.scoreDocs[i].doc;

-        double sc = search.scoreDocs[i].score;

-

-        entry.getScoreMap().put("lucene", sc);

-        entry.setIndexID(docId + "");

-        entry.setSource("geonames");

-

-        Document d = geonamesSearcher.doc(docId);

-        List<IndexableField> fields = d.getFields();

-        for (int idx = 0; idx < fields.size(); idx++) {

-          String value = d.get(fields.get(idx).name());

-          value = value.toLowerCase();

-          /**

-           * these positions map to the required fields in the gaz TODO: allow a

-           * configurable list of columns that map to the GazateerEntry fields,

-           * then users would be able to plug in any gazateer they have (if they

-           * build a lucene index out of it)

-           */

-          switch (idx) {

-            case 1:

-              entry.setItemID(value);

-              break;

-            case 3:

-              entry.setLatitude(Double.valueOf(value));

-              break;

-            case 4:

-              entry.setLongitude(Double.valueOf(value));

-              break;

-            case 10:

-              entry.setItemType(value);

-              break;

-            case 12:

-              entry.setItemParentID(value);

-              if (!value.toLowerCase().equals(code.toLowerCase())) {

-                continue;

-              }

-              break;

-            case 23:

-              entry.setItemName(value);

-              break;

-          }

-          entry.getIndexData().put(fields.get(idx).name(), value);

-        }

-        /**

-         * norm the levenstein distance

-         */

-        Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());

-        /**

-         * only want hits above the levenstein thresh

-         */

-        if (normLev.compareTo(scoreCutoff) >= 0) {

-          if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase()) || code.toLowerCase().equals("")) {

-            entry.getScoreMap().put("normlucene", normLev);

-            //make sure we don't produce a duplicate

-            if (!linkedData.contains(entry)) {

-              linkedData.add(entry);

-              /**

-               * add the records to the cache for this query

-               */

-              GazetteerSearchCache.put(luceneQueryString, linkedData);

-            }

-          }

-        }

-      }

-

-    } catch (IOException | ParseException ex) {

-      LOGGER.error(ex);

-    }

-

-    return linkedData;

-  }

-

-  /**

-   * Looks up the name in the USGS gazateer, after checking the cache

-   *

-   * @param searchString the nameed entity to look up in the lucene index

-   * @param rowsReturned how many rows to allow lucene to return

-   *

-   * @return

-   */

-  @Deprecated

-  public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {

-    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

-    searchString = cleanInput(searchString);

-    if (searchString.isEmpty()) {

-      return linkedData;

-    }

-    String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();

-    try {

-

-      /**

-       * hit the cache

-       */

-      ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(luceneQueryString);

-      if (get != null) {

-        //if the name is already there, return the list of cavhed results

-        return get;

-      }

-      QueryParser parser = new QueryParser(Version.LUCENE_48, luceneQueryString, usgsAnalyzer);

-      Query q = parser.parse(luceneQueryString);

-

-      TopDocs search = usgsSearcher.search(q, rowsReturned);

-      for (int i = 0; i < search.scoreDocs.length; i++) {

-        GazetteerEntry entry = new GazetteerEntry();

-        int docId = search.scoreDocs[i].doc;

-        double sc = search.scoreDocs[i].score;

-        //keep track of the min score for normalization

-

-        entry.getScoreMap().put("lucene", sc);

-        entry.setIndexID(docId + "");

-        entry.setSource("usgs");

-        entry.setItemParentID("us");

-        Document d = usgsSearcher.doc(docId);

-        List<IndexableField> fields = d.getFields();

-        for (int idx = 0; idx < fields.size(); idx++) {

-          String value = d.get(fields.get(idx).name());

-          value = value.toLowerCase();

-          switch (idx) {

-            case 0:

-              entry.setItemID(value);

-              break;

-            case 1:

-              entry.setItemName(value);

-              break;

-            case 2:

-              entry.setItemType(value);

-              break;

-            case 9:

-              entry.setLatitude(Double.valueOf(value));

-              break;

-            case 10:

-              entry.setLongitude(Double.valueOf(value));

-              break;

-          }

-          entry.getIndexData().put(fields.get(idx).name(), value);

-        }

-        /**

-         * norm the levenstein distance

-         */

-        Double normLev = Double.valueOf(searchString.length()) / Double.valueOf(entry.getItemName().length());

-        /**

-         * only want hits above the levenstein thresh

-         */

-        if (normLev.compareTo(scoreCutoff) >= 0) {

-          //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene

-

+        // if (normLev > scoreCutoff) {

+        if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {

           entry.getScoreMap().put("normlucene", normLev);

           //make sure we don't produce a duplicate

           if (!linkedData.contains(entry)) {

@@ -387,16 +182,18 @@
             /**

              * add the records to the cache for this query

              */

-            GazetteerSearchCache.put(luceneQueryString, linkedData);

+            GazetteerSearchCache.put(placeNameQueryString, linkedData);

           }

         }

-

+        //}

       }

 

     } catch (IOException | ParseException ex) {

       LOGGER.error(ex);

     }

 

+  

+

     return linkedData;

   }

 

@@ -431,7 +228,17 @@
       opennlpReader = DirectoryReader.open(opennlpIndex);

       opennlpSearcher = new IndexSearcher(opennlpReader);

       //TODO: a language code switch statement should be employed here at some point

-      opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+      opennlpAnalyzer

+              = //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+              new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+      Map<String, Analyzer> analyMap = new HashMap<>();

+

+      analyMap.put("countrycode", new KeywordAnalyzer());

+      analyMap.put("admincode", new KeywordAnalyzer());

+      analyMap.put("loctype", new KeywordAnalyzer());

+      analyMap.put("countycode", new KeywordAnalyzer());

+      analyMap.put("gazsource", new KeywordAnalyzer());

+

       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));

       if (cutoff != null && !cutoff.isEmpty()) {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index f608780..e6ffea8 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

@@ -22,9 +22,11 @@
 import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;

 import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;

 import java.util.ArrayList;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashMap;

 import java.util.List;

-import java.util.Map;

-import java.util.Set;

+import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;

 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

 import opennlp.tools.util.Span;

@@ -39,8 +41,8 @@
  */

 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

 

+  private static Integer topN = 2;

   private AdminBoundaryContextGenerator countryContext;

-  private Map<String, Set<Integer>> countryMentions;

   private EntityLinkerProperties linkerProperties;

   private GazetteerSearcher gazateerSearcher;

   private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();

@@ -64,10 +66,10 @@
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();

         if (!context.getWhereClauses().isEmpty()) {

           for (String whereclause : context.getWhereClauses()) {

-            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));

+            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], topN, whereclause));

           }

         } else {//this means there were no where clauses generated so the where clause will default to look at the entire index

-          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:usgs geonames regions "));

+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], topN, " gaztype:usgs geonames regions "));

         }

         if (geoNamesEntries.isEmpty()) {

           continue;

@@ -86,6 +88,36 @@
         scorer.score(spans, doctext, sentences, linkerProperties, context);

       }

     }

+    /**

+     * sort the data with the best score on top based on the sum of the scores below from the score map for each baselink object

+     */

+    for (LinkedSpan<BaseLink> s : spans) {

+      ArrayList<BaseLink> linkedData = s.getLinkedEntries();

+      Collections.sort(linkedData, Collections.reverseOrder(new Comparator<BaseLink>() {

+        @Override

+        public int compare(BaseLink o1, BaseLink o2) {

+          HashMap<String, Double> o1scoreMap = o1.getScoreMap();

+          HashMap<String, Double> o2scoreMap = o2.getScoreMap();

+          if (o1scoreMap.size() != o2scoreMap.size()) {

+            return 0;

+          }

+          double sumo1 = 0d;

+          double sumo2 = 0d;

+          for (String object : o1scoreMap.keySet()) {

+            if (object.equals("typescore")

+                    || object.equals("countrycontext")

+                    || object.equals("normlucene")

+                    || object.equals("geohashbin")) {

+              sumo1 += o1scoreMap.get(object);

+              sumo2 += o2scoreMap.get(object);

+            }

+          }

+

+          return Double.compare(sumo1,

+                  sumo2);

+        }

+      }));

+    }

 

     return spans;

   }

@@ -96,16 +128,25 @@
       scorers.add(new CountryProximityScorer());

       scorers.add(new ModelBasedScorer());

       scorers.add(new FuzzyStringMatchScorer());

-      // scorers.add(new ProvinceProximityScorer());

+      scorers.add(new PlacetypeScorer());

     }

   }

 

+  

   @Override

   public void init(EntityLinkerProperties properties) throws IOException {

     try {

       this.linkerProperties = properties;

       countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

+      String rowsRetStr = this.linkerProperties.getProperty("opennlp.geoentitylinker.gaz.rowsreturned", "2");

+      Integer rws = 2;

+      try {

+        rws = Integer.valueOf(rowsRetStr);

+      } catch (NumberFormatException e) {

+        rws = 2;

+      }

+      topN = rws;

       loadScorers();

     } catch (Exception ex) {

       throw new IOException(ex);


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
index 98dd7b5..1c4a936 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java

@@ -18,8 +18,12 @@
 import java.io.File;

 import java.io.FileNotFoundException;

 import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.Map;

 

 import org.apache.lucene.analysis.Analyzer;

+import org.apache.lucene.analysis.core.KeywordAnalyzer;

+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;

 import org.apache.lucene.analysis.standard.StandardAnalyzer;

 import org.apache.lucene.analysis.util.CharArraySet;

 import org.apache.lucene.index.IndexWriter;

@@ -88,8 +92,8 @@
   /**

    *

    * @param geonamesData the actual Geonames gazetteer data downloaded from

-   * here: http://download.geonames.org/export/dump/ then click on this

-   * link 'allCountries.zip'

+   * here: http://download.geonames.org/export/dump/ then click on this link

+   * 'allCountries.zip'

    * @param geoNamesCountryInfo the countryinfo lookup table that can be

    * downloaded from here

    * http://download.geonames.org/export/dump/countryinfo.txt

@@ -146,9 +150,19 @@
 

     String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";

     Directory index = new MMapDirectory(new File(indexloc));

-

     Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

+    Map<String, Analyzer> analyMap = new HashMap<>();

+

+    analyMap.put("countrycode", new KeywordAnalyzer());

+    analyMap.put("admincode", new KeywordAnalyzer());

+    analyMap.put("loctype", new KeywordAnalyzer());

+    analyMap.put("countycode", new KeywordAnalyzer());

+    analyMap.put("gazsource", new KeywordAnalyzer());

+    

+    PerFieldAnalyzerWrapper aWrapper

+            = new PerFieldAnalyzerWrapper(a, analyMap);

+

+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, aWrapper);

 

     IndexWriter w = new IndexWriter(index, config);

     USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);

@@ -161,67 +175,4 @@
     System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" + outputCountryContextFile.getPath() + "' to entitylinker.properties file");

   }

 

-  /**

-   * indexes the USGS or Geonames gazateers.

-   *

-   * @param outputIndexDir a DIRECTORY path where you would like to store the

-   * output lucene indexes

-   * @param gazetteerInputData the file, "as is" that was downloaded from the

-   * USGS and GEONAMES website

-   * @param type indicates whether the data is USGS or GEONAMES format

-   * @throws Exception

-   */

-  @Deprecated

-  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {

-    if (!outputIndexDir.isDirectory()) {

-      throw new IllegalArgumentException("outputIndexDir must be a directory.");

-

-    }

-

-    String indexloc = outputIndexDir + type.toString();

-    Directory index = new MMapDirectory(new File(indexloc));

-

-    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);

-

-    IndexWriter w = new IndexWriter(index, config);

-    //  GeonamesProcessor.process(new File("C:\\temp\\gazetteers\\geonamesdata\\countrycodes.txt"), new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"), gazetteerInputData, null, w);

-    // USGSProcessor.process(gazetteerInputData, outputIndexDir, w);

-    //  readFile(gazetteerInputData, w, type);

-    w.commit();

-    w.close();

-

-  }

-//

-//  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {

-//    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

-//    List<String> fields = new ArrayList<>();

-//    int counter = 0;

-//    System.out.println("reading gazetteer data from file...........");

-//    while (reader.read() != -1) {

-//      String line = reader.readLine();

-//      String[] values = line.split(type.getSeparator());

-//      if (counter == 0) {

-//        for (String columnName : values) {

-//          fields.add(columnName.replace("»¿", "").trim());

-//        }

-//

-//      } else {

-//        Document doc = new Document();

-//        for (int i = 0; i < fields.size() - 1; i++) {

-//          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));

-//        }

-//        w.addDocument(doc);

-//      }

-//      counter++;

-//      if (counter % 100000 == 0) {

-//        w.commit();

-//        System.out.println(counter + " .........committed to index..............");

-//      }

-//

-//    }

-//    w.commit();

-//    System.out.println("Completed indexing gaz! index name is: " + type.toString());

-//  }

-

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
index c529676..3392245 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java

@@ -29,6 +29,7 @@
 import opennlp.addons.geoentitylinker.AdminBoundary;

 import org.apache.lucene.document.Document;

 import org.apache.lucene.document.Field;

+import org.apache.lucene.document.StringField;

 import org.apache.lucene.document.TextField;

 import org.apache.lucene.index.IndexWriter;

 

@@ -230,13 +231,17 @@
       String lat = values[4];

       String lon = values[5];

       String dsg = values[7].toLowerCase();

+

       String id = values[0];

       String concatIndexEntry = "";

+      String countryname = "";

       if (adm != null) {

         concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;

+        countryname = adm.getCountryName();

       } else {

         //there is no admin info, but we can still use the countrycode to concat the country name

         String n = countrycodes.get(ccode);

+        countryname = n;

         if (n != null) {

           concatIndexEntry = n + ", " + placeName;

         } else {

@@ -251,27 +256,30 @@
         doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));

 

       }

-

+      if (dsg.equals("pcli")) {

+        System.out.println("placename: " + placeName + " RESET TO: " + countryname);

+        placeName = countryname;

+      }

       /**

        * add standard fields to the index

        */

       doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));

       doc.add(new TextField("placename", placeName, Field.Store.YES));

+      // doc.add(new TextField("countryname", countryname, Field.Store.YES));

+      //System.out.println(placeName);

+

       doc.add(new TextField("latitude", lat, Field.Store.YES));

       doc.add(new TextField("longitude", lon, Field.Store.YES));

-      if (boostMap.containsKey(dsg)) {

-        TextField f = new TextField("loctype", dsg, Field.Store.YES);

-        f.setBoost(boostMap.get(dsg));

-        doc.add(f);

-      } else {

-        doc.add(new TextField("loctype", dsg, Field.Store.YES));

+      doc.add(new StringField("loctype", dsg, Field.Store.YES));

+      doc.add(new StringField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));

+      doc.add(new StringField("countrycode", ccode.toLowerCase(), Field.Store.YES));

+      doc.add(new StringField("countycode", "", Field.Store.YES));

+      doc.add(new StringField("locid", id, Field.Store.YES));

+      placeName = placeName.replace("republic of", "").replace("federative", "");

+      if (id.equals("3175395")) {

+        System.out.println(placeName);

       }

-      doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));

-      doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));

-      doc.add(new TextField("countycode", "", Field.Store.YES));

-

-      doc.add(new TextField("locid", id, Field.Store.YES));

-      doc.add(new TextField("gazsource", "geonames", Field.Store.YES));

+      doc.add(new StringField("gazsource", "geonames", Field.Store.YES));

 

       w.addDocument(doc);

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
index 6e5b974..7df001b 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java

@@ -23,6 +23,7 @@
 import java.util.List;

 import org.apache.lucene.document.Document;

 import org.apache.lucene.document.Field;

+import org.apache.lucene.document.StringField;

 import org.apache.lucene.document.TextField;

 import org.apache.lucene.index.IndexWriter;

 

@@ -80,15 +81,15 @@
 

         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));

         doc.add(new TextField("placename", placeName, Field.Store.YES));

-        doc.add(new TextField("latitude", lat, Field.Store.YES));

-        doc.add(new TextField("longitude", lon, Field.Store.YES));

-        doc.add(new TextField("loctype", dsg, Field.Store.YES));

-        doc.add(new TextField("admincode", "", Field.Store.YES));

-        doc.add(new TextField("countrycode", id, Field.Store.YES));

-        doc.add(new TextField("countycode", "", Field.Store.YES));

+        doc.add(new StringField("latitude", lat, Field.Store.YES));

+        doc.add(new StringField("longitude", lon, Field.Store.YES));

+        doc.add(new StringField("loctype", dsg, Field.Store.YES));

+        doc.add(new StringField("admincode", "", Field.Store.YES));

+        doc.add(new StringField("countrycode", id, Field.Store.YES));

+        doc.add(new StringField("countycode", "", Field.Store.YES));

 

-        doc.add(new TextField("locid", id, Field.Store.YES));

-        doc.add(new TextField("gazsource", "region", Field.Store.YES));

+        doc.add(new StringField("locid", id, Field.Store.YES));

+        doc.add(new StringField("gazsource", "region", Field.Store.YES));

         //countrycontext file format

         // US	KY	131	United States	Kentucky	Leslie

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index 8240bfd..274e2e2 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java

@@ -29,6 +29,7 @@
 import opennlp.addons.geoentitylinker.AdminBoundary;

 import org.apache.lucene.document.Document;

 import org.apache.lucene.document.Field;

+import org.apache.lucene.document.StringField;

 import org.apache.lucene.document.TextField;

 

 import org.apache.lucene.index.IndexWriter;

@@ -86,24 +87,24 @@
         String countyname = "";

         String countyCode = get.getCountyCode();

         if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {

-          countyname =  get.getCountyName();

+          countyname = get.getCountyName();

         }

         if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {

           countyCode = get.getCountyCode();

         }

-        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() +", "+ countyname + ", " + placeName;

-

+        String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;

+       // doc.add(new TextField("countryname", "united states", Field.Store.YES));

         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));

         doc.add(new TextField("placename", placeName, Field.Store.YES));

         doc.add(new TextField("latitude", lat, Field.Store.YES));

         doc.add(new TextField("longitude", lon, Field.Store.YES));

-        doc.add(new TextField("loctype", dsg, Field.Store.YES));

-        doc.add(new TextField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));

-        doc.add(new TextField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));

-        doc.add(new TextField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));

+        doc.add(new StringField("loctype", dsg, Field.Store.YES));

+        doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));

+        doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));

+        doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));

 

-        doc.add(new TextField("locid", id, Field.Store.YES));

-        doc.add(new TextField("gazsource", "usgs", Field.Store.YES));

+        doc.add(new StringField("locid", id, Field.Store.YES));

+        doc.add(new StringField("gazsource", "usgs", Field.Store.YES));

         w.addDocument(doc);

       }

       counter++;

@@ -118,7 +119,7 @@
   }

 

   private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {

- System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());

+    System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());

     Map<String, AdminBoundary> outmap = new HashMap<>();

     BufferedReader reader;

 

@@ -153,7 +154,7 @@
     } catch (IOException ex) {

       ex.printStackTrace();

     }

-  System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());

+    System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());

 

     return outmap;

 

@@ -176,7 +177,7 @@
          */

         String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\n";

         writer.write(line);

-      ///  System.out.println(line);

+        ///  System.out.println(line);

 

       }

     } catch (IOException ex) {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
new file mode 100644
index 0000000..6a30f18
--- /dev/null
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java

@@ -0,0 +1,75 @@
+/*

+ * Copyright 2014 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.addons.geoentitylinker.scoring;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import opennlp.addons.geoentitylinker.AdminBoundaryContext;

+import opennlp.addons.geoentitylinker.GazetteerEntry;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+import opennlp.tools.entitylinker.LinkedSpan;

+import opennlp.tools.util.Span;

+

+/**
+ * Scores every gazetteer entry of every linked span by its place type and
+ * stores the result in the entry's score map under the key "typescore".
+ * <p>
+ * The type codes look like GeoNames-style feature designations (PCLI, ADM1,
+ * PPL, ...). The weights are:
+ * <ul>
+ * <li>PCLI: 1.0</li>
+ * <li>any other listed code starting with PCL: 0.5</li>
+ * <li>any listed code starting with ADM: 0.75</li>
+ * <li>everything else, including an entry without a type: 0.0</li>
+ * </ul>
+ * Matching ignores the case of the entry's type.
+ *
+ * @author mgiaconia
+ */
+public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+  /** Key under which this scorer writes its result into an entry's score map. */
+  private static final String SCORE_KEY = "typescore";
+  /**
+   * Candidate type codes. Only those matching a rule in
+   * {@link #buildBoostTable()} end up with a weight; the remaining ones
+   * (PPL*, TERR, STLMT) are not added to the table and therefore score 0.
+   */
+  private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");
+  /** Normalized (lower-case) type code -> weight. Built once, never modified afterwards. */
+  private static final Map<String, Double> boostedTypes = buildBoostTable();
+
+  public PlacetypeScorer() {
+  }
+
+  /**
+   * Writes a "typescore" entry into the score map of each gazetteer entry
+   * attached to the given spans.
+   *
+   * @param linkedSpans       spans whose linked gazetteer entries are scored
+   * @param docText           not used by this scorer
+   * @param sentenceSpans     not used by this scorer
+   * @param properties        not used by this scorer
+   * @param additionalContext not used by this scorer
+   */
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+    // The interface hands over a raw list; the elements are treated as spans of GazetteerEntry.
+    for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) {
+      List<GazetteerEntry> linkedEntries = geospan.getLinkedEntries();
+      for (GazetteerEntry gazetteerEntry : linkedEntries) {
+        gazetteerEntry.getScoreMap().put(SCORE_KEY, getScore(gazetteerEntry.getItemType()));
+      }
+    }
+  }
+
+  /**
+   * Looks up the weight of a place type.
+   *
+   * @param type the entry's type code in any case; may be null
+   * @return the configured weight, or 0 when the type is null or has no weight
+   */
+  private Double getScore(String type) {
+    if (type == null) {
+      // An entry without a type cannot be boosted; score it like any unlisted type.
+      return 0d;
+    }
+    Double ret = boostedTypes.get(normalize(type));
+    return ret == null ? 0d : ret;
+  }
+
+  /**
+   * Lower-cases a type code independently of the JVM default locale, so that
+   * table keys and lookups agree everywhere (e.g. "PCLI" under a Turkish
+   * default locale would otherwise become "pcl\u0131").
+   */
+  private static String normalize(String type) {
+    return type.toLowerCase(java.util.Locale.ROOT);
+  }
+
+  /**
+   * Builds the weight table from {@link #boosts}: PCLI gets 1.0, the other
+   * PCL* codes 0.5 and the ADM* codes 0.75. Codes matching none of these
+   * rules are skipped.
+   */
+  private static Map<String, Double> buildBoostTable() {
+    Map<String, Double> table = new HashMap<>();
+    for (String type : boosts) {
+      if (type.equals("PCLI")) {
+        table.put(normalize(type), 1d);
+      } else if (type.startsWith("PCL")) {
+        table.put(normalize(type), .5d);
+      } else if (type.startsWith("ADM")) {
+        table.put(normalize(type), .75d);
+      }
+    }
+    return table;
+  }
+
+}
commit	81a91cde31904a86429e795d45324cf5adaee82c	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Fri Aug 15 18:10:51 2014 +0000
committer	Mark Giaconia <markg@apache.org>	Fri Aug 15 18:10:51 2014 +0000
tree	fbca7f6375944ff46390ac980a041a4e2d7ee9b4
parent	36759584cec4a4ed6126c038dbb48101d9fb3de3 [diff]