OPENNLP-615
Added a scoring impl that utilizes a doccat model to help with toponym resolution. The ModelBasedScorer also contains two static methods for training the model based on the CountryContext information used by the GeoEntityLinker.
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
index 1702f85..541d042 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
@@ -82,7 +82,7 @@
// countrydata = getCountryData(properties);
}
for (CountryContextEntry entry : countrydata) {
- Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.getCc1().toLowerCase();
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
index 4b24b11..3198650 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
@@ -22,6 +22,7 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -35,7 +36,7 @@
String dominantCode = "";
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
@@ -191,8 +192,12 @@
all.addAll(distanceMap.get(key));
}
//get min max for normalization, this could be more efficient
+
Integer min = all.first();
Integer max = all.last();
+ if (min.intValue() == max.intValue()) {
+ min = 0;
+ }
for (String key : distanceMap.keySet()) {
TreeSet<Double> normalizedDistances = new TreeSet<Double>();
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
index c21f5e2..8216e93 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
@@ -18,6 +18,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.ngram.NGramGenerator;
@@ -30,7 +31,7 @@
public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
for (BaseLink link : linkedSpan.getLinkedEntries()) {
Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
index e89e8a5..13cce97 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
@@ -42,7 +42,7 @@
*/
public class GazateerSearcher {
- private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
+ //private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
private double scoreCutoff = .75;
private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
@@ -72,7 +72,7 @@
geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
}
- String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " & CC1:" + code.toUpperCase();// + "~1.0";
+ String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " AND CC1:" + code.toLowerCase() + "^100";
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
index fe58e0d..b9401a3 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
@@ -24,6 +24,7 @@
import opennlp.tools.util.Span;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.EntityLinker;
+
/**
* Links location entities to gazatteers. Currently supports gazateers in a
* MySql database (NGA and USGS)
@@ -71,7 +72,7 @@
// geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code, linkerProperties));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
}
}
@@ -100,9 +101,9 @@
scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
-
+ scorers.add(new ModelBasedScorer());
for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, countryContext);
+ scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
}
return spans;
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
index 7a87ee6..26b69c1 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
@@ -22,6 +22,7 @@
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.BaseLink;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -34,7 +35,7 @@
public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
score( linkedSpans);
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
index a70a628..3f7d5fa 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
@@ -16,6 +16,7 @@
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
import opennlp.tools.entitylinker.domain.LinkedSpan;
import opennlp.tools.util.Span;
@@ -33,5 +34,5 @@
* @param additionalContext any additional data required to perform the scoring operation
* @return void
*/
- void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, T additionalContext);
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
}
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
new file mode 100644
index 0000000..be07240
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
@@ -0,0 +1,255 @@
+/*
+ * Scores toponym candidates with a doccat model trained on country-context
+ * data; see OPENNLP-615.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponym candidates with a doccat model trained on country-context
+ * data; also provides static helpers for training that model.
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+
+ public static ModelBasedScorer scorer;
+
+ static {
+ scorer = new ModelBasedScorer();
+ }
+ DocumentCategorizerME documentCategorizerME;
+ DoccatModel doccatModel;
+ public static final int RADIUS = 100;
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ try {
+ if (doccatModel == null) {
+ String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+ if (path.equals("")) {
+ return;
+ }
+ doccatModel = new DoccatModel(new File(path));
+
+ documentCategorizerME = new DocumentCategorizerME(doccatModel);
+ }
+ Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+ for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+ Map<String, Double> scores = this.getScore(entry.getValue());
+ for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+ double score = 0d;
+ if (scores.containsKey(link.getItemParentID())) {
+ score = scores.get(link.getItemParentID());
+ }
+ link.getScoreMap().put("countrymodel", score);
+ }
+ }
+
+ } catch (FileNotFoundException ex) {
+ System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
+ } catch (IOException ex) {
+ System.err.println(ex);
+ } catch (Exception ex) {
+ Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ /**
+ * generates features using a BagOfWordsfeatureGenerator that are within the
+ * radius of a mention within the doctext
+ *
+ * @param linkedSpans the spans whose surrounding text is collected
+ * @param sentenceSpans the sentence boundaries of the document
+ * @param docText the full document text
+ * @param radius number of characters around each mention to include
+ * @return a map of the index of the linked span to the string of surrounding
+ * text: {@code Map<spanIndex, surroundingText>}
+ */
+ public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+ Map<Integer, String> featureBags = new HashMap<>();
+ Map<Integer, Integer> nameMentionMap = new HashMap<>();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (int i = 0; i < linkedSpans.size(); i++) {
+ LinkedSpan span = linkedSpans.get(i);
+ if (span.getLinkedEntries().isEmpty()) {
+ //don't care about spans that did not get linked to anything at all; nothing to work with
+ continue;
+ }
+ /**
+ * get the sentence the name span was found in, the beginning of the
+ * sentence will suffice as a centroid for feature generation around the
+ * named entity
+ */
+ Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+ nameMentionMap.put(i, mentionIdx);
+ }
+ /**
+ * now associate each span to a string that will be used for categorization
+ * against the model.
+ */
+ for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+ featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+ }
+
+
+ return featureBags;
+ }
+
+ private String getTextChunk(int mentionIdx, String docText, int radius) {
+ int docSize = docText.length();
+ int left = 0, right = 0;
+ left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+ right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+ String chunk = "";
+ if (right <= left) {
+ chunk = "";
+ } else {
+ /**
+ * don't want to chop any words in half, so take from the first space to
+ * the last space in the chunk string
+ */
+ chunk = docText.substring(left, right);
+ if (left != 0) {
+ left = chunk.indexOf(" ");
+ }
+ right = chunk.lastIndexOf(" ");
+ /**
+ * now get the substring again with only whole words
+ */
+ if (left < right) {
+ chunk = chunk.substring(left, right);
+ }
+ }
+
+ return chunk;
+ }
+
+ private Map<String, Double> getScore(String text) throws Exception {
+ Map<String, Double> scoreMap = new HashMap<>();
+ if (documentCategorizerME == null) {
+ documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));
+ }
+ double[] categorize = documentCategorizerME.categorize(text);
+ int catSize = documentCategorizerME.getNumberOfCategories();
+ for (int i = 0; i < catSize; i++) {
+ String category = documentCategorizerME.getCategory(i);
+ scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+ }
+ return scoreMap;
+ }
+
+ /**
+ *
+ * @param documents A list of document texts, for best results try to
+ * ensure each country you care about will be
+ * represented by the collection
+ * @param annotationOutFile the location where the annotated doccat text file
+ * will be stored
+ * @param modelOutFile the location where the doccat model will be stored
+ * @param properties the properties where the country context object
+ * will find its country data from this property:
+ * opennlp.geoentitylinker.countrycontext.filepath
+ * @throws IOException
+ */
+ public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+ CountryContext context = new CountryContext();
+ FileWriter writer = new FileWriter(annotationOutFile, true);
+ for (String docText : documents) {
+
+ Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+ Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+ for (String key : modelCountryContext.keySet()) {
+ for (String wordbag : modelCountryContext.get(key)) {
+ writer.write(key + " " + wordbag + "\n");
+ }
+ }
+ }
+
+ writer.close();
+
+ DoccatModel model = null;
+
+ InputStream dataIn = new FileInputStream(annotationOutFile);
+ try {
+
+ ObjectStream<String> lineStream =
+ new PlainTextByLineStream(dataIn, "UTF-8");
+ ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+
+ model = DocumentCategorizerME.train("en", sampleStream);
+ OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
+ model.serialize(modelOut);
+ } catch (IOException e) {
+ // Failed to read or parse training data, training failed
+ e.printStackTrace();
+ }
+
+ }
+
+ /**
+ * generates proximal wordbags within the radius of a country mention within
+ * the doctext based on the country context object
+ *
+ *
+ * @param docText
+ * @param additionalContext
+ * @param radius
+ * @return
+ */
+ public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ Map<String, ArrayList< String>> featureBags = new HashMap<>();
+ Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (String code : countryMentions.keySet()) {
+ /**
+ * for each mention, collect features from around each mention, then
+ * consolidate the features into another map
+ */
+ for (int mentionIdx : countryMentions.get(code)) {
+ String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+ // Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
+ if (featureBags.containsKey(code)) {
+ featureBags.get(code).add(chunk);
+ } else {
+ ArrayList<String> newlist = new ArrayList<>();
+ newlist.add(chunk);
+ featureBags.put(code, newlist);
+ }
+ }
+ }
+ return featureBags;
+ }
+}