OPENNLP-706: Fixed caching, ensured indexing and searching use the same analyzer wrapper, and added the ProvinceProximityScorer to the scorers

commit: 8594dae3913553df8dae329bda59d0ab8bceb901 [log] [tgz]
author: Mark Giaconia <markg@apache.org> Mon Aug 18 14:49:42 2014 +0000
committer: Mark Giaconia <markg@apache.org> Mon Aug 18 14:49:42 2014 +0000
tree: 138b26f1b3feb3bb7bd75d0dc542a69e24655477
parent: ff90b7c68a88dcd0c9c9ee4d84283f90090c5276 [diff]
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index f62951d..abe2550 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java

@@ -40,6 +40,7 @@
 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import org.apache.log4j.Logger;

 import org.apache.lucene.analysis.core.KeywordAnalyzer;

+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;

 import org.apache.lucene.analysis.util.CharArraySet;

 

 /**

@@ -56,12 +57,7 @@
   private boolean doubleQuoteAllSearchTerms = false;

   private boolean useHierarchyField = false;

 

-  private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);

-  private Analyzer geonamesAnalyzer;

-  //usgs US gazateer

 

-  private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);

-  private Analyzer usgsAnalyzer;

   private EntityLinkerProperties properties;

 

   private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));

@@ -167,7 +163,7 @@
          */

         int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();

 

-        Double normLev = Math.abs(1 - (sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();

+        Double normLev = Math.abs(1 - (sc / (double) maxLen));

         /**

          * only want hits above the levenstein thresh. This should be a low

          * thresh due to the use of the hierarchy field in the index

@@ -226,7 +222,6 @@
       opennlpIndex = new MMapDirectory(new File(indexloc));

       opennlpReader = DirectoryReader.open(opennlpIndex);

       opennlpSearcher = new IndexSearcher(opennlpReader);

-      //TODO: a language code switch statement should be employed here at some point

       opennlpAnalyzer

               = //new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

               new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

@@ -237,6 +232,11 @@
       analyMap.put("loctype", new KeywordAnalyzer());

       analyMap.put("countycode", new KeywordAnalyzer());

       analyMap.put("gazsource", new KeywordAnalyzer());

+      

+      

+    opennlpAnalyzer

+            = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);

+

 

       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

       String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 2e2cb0c..0b2fff6 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

@@ -27,6 +27,7 @@
 import java.util.HashMap;

 import java.util.List;

 import opennlp.addons.geoentitylinker.scoring.PlacetypeScorer;

+import opennlp.addons.geoentitylinker.scoring.ProvinceProximityScorer;

 import opennlp.tools.entitylinker.BaseLink;

 import opennlp.tools.entitylinker.LinkedSpan;

 import opennlp.tools.util.Span;

@@ -89,7 +90,8 @@
       }

     }

     /**

-     * sort the data with the best score on top based on the sum of the scores below from the score map for each baselink object

+     * sort the data with the best score on top based on the sum of the scores

+     * below from the score map for each baselink object

      */

     for (LinkedSpan<BaseLink> s : spans) {

       ArrayList<BaseLink> linkedData = s.getLinkedEntries();

@@ -124,6 +126,7 @@
 

   private void loadScorers() {

     if (scorers.isEmpty()) {

+      scorers.add(new ProvinceProximityScorer());

       scorers.add(new GeoHashBinningScorer());

       scorers.add(new CountryProximityScorer());

       scorers.add(new ModelBasedScorer());

@@ -132,7 +135,6 @@
     }

   }

 

-  

   @Override

   public void init(EntityLinkerProperties properties) throws IOException {

     try {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
index 6a30f18..f6f9fa7 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java

@@ -40,12 +40,12 @@
 

   @Override

   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {

-    for(LinkedSpan<GazetteerEntry> geospan : linkedSpans){

+    for (LinkedSpan<GazetteerEntry> geospan : linkedSpans) {

       ArrayList<GazetteerEntry> linkedEntries = geospan.getLinkedEntries();

       for (GazetteerEntry gazetteerEntry : linkedEntries) {

         String type = gazetteerEntry.getItemType().toLowerCase();

         Double score = getScore(type);

-        if(score==null){

+        if (score == null) {

           score = 0d;

         }

         gazetteerEntry.getScoreMap().put("typescore", score);

@@ -63,12 +63,14 @@
       for (String type : boosts) {

         if (type.equals("PCLI")) {

           boosetedTypes.put(type.toLowerCase(), 1d);

-        } else if (type.startsWith("PCL") && !type.equals("PCLI")) {

+        } else if (type.startsWith("P") && !type.equals("PCLI")) {

           boosetedTypes.put(type.toLowerCase(), .5d);

         } else if (type.startsWith("ADM")) {

           boosetedTypes.put(type.toLowerCase(), .75d);

         }

+

       }

+      boosetedTypes.put("pplc", .9);

     }

   }

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
index f3199a1..afdb4b1 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java

@@ -34,7 +34,7 @@
  * heuristic that toponymn mentions are more likely close to their parent

  * province mentions. For instance, if the toponym Berlin is mentioned near an

  * indicator of Connecticut, it is more likely to be Berlin Connecticut than

- * Berlin Germany (if Germany did not exist in, or is mentioned further down in,

+ * Berlin Germany (if Germany did not exist in, or is mentioned further away in

  * the article).

  *

  *

@@ -51,7 +51,7 @@
     } else {

       for (LinkedSpan<BaseLink> span : linkedSpans) {

         for (BaseLink link : span.getLinkedEntries()) {

-          link.getScoreMap().put("provincecontext", Double.NaN);

+          link.getScoreMap().put("provincecontext", 0d);

         }

       }

     }

@@ -163,7 +163,7 @@
       if (scoreMap.containsKey(spanCountryCode)) {

 

         score = scoreMap.get(spanCountryCode);

-        ///does the name extracted match a country name?

+        ///does the name extracted match a province name?

         if (nameCodesMap.containsKey(entry.getItemName().toLowerCase())) {

           //if so, is it the correct country code for that name?

           if (nameCodesMap.get(entry.getItemName().toLowerCase()).contains(entry.getProvinceCode())) {

@@ -183,8 +183,8 @@
   }

 

   /**

-   * takes a map of distances from the toponym to each country mention and

-   * generates a map of scores for each country code. The map is then correlated

+   * takes a map of distances from the toponym to each province mention and

+   * generates a map of scores for each province code. The map is then correlated

    * to the code of the BaseLink parentid for retrieval. Then the score is added

    * to the overall list.

    *

@@ -232,7 +232,7 @@
    * together to smooth out the average, so one distant outlier does not kill

    * the score for an obviously good hit. More elegant solution is possible

    * using Math.pow, and making the score decay with distance by using an

-   * increasing negative exponent (I think)

+   * increasing negative exponent

    *

    * @param normDis the normalized and sorted set of distances as a list

    * @return
commit	8594dae3913553df8dae329bda59d0ab8bceb901	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Mon Aug 18 14:49:42 2014 +0000
committer	Mark Giaconia <markg@apache.org>	Mon Aug 18 14:49:42 2014 +0000
tree	138b26f1b3feb3bb7bd75d0dc542a69e24655477
parent	ff90b7c68a88dcd0c9c9ee4d84283f90090c5276 [diff]