OPENNLP-579 Added a SetupUtils class so users can get the Lucene indexes and Country Doccat models built very easily. Also many other small efficiencies.

commit: 17332b2db33a8918cb56fb2def6bec8db11302c5 [log] [tgz]
author: Mark Giaconia <markg@apache.org> Thu Nov 14 00:42:03 2013 +0000
committer: Mark Giaconia <markg@apache.org> Thu Nov 14 00:42:03 2013 +0000
tree: 6fdf421f73e5c42c3e4a1a6bccec896a50e31bcc
parent: 8ed861d8319528010678625381afbd118b85522a [diff]
diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
index 8216e93..af1aa1c 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java

@@ -41,7 +41,7 @@
       }

     }

 

-  

+

   }

 

   /**


diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
index cbe8a0d..5ea08ad 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java

@@ -1,6 +1,17 @@
 /*

- * To change this template, choose Tools | Templates

- * and open the template in the editor.

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

  */

 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

 

@@ -23,7 +34,7 @@
 

 /**

  *

- * @author Owner

+ * Creates two Lucene indexes, GeoNames and USGS, for use in GeoEntityLinker

  */

 public class GazateerIndexer {

 


diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
index b9401a3..1404ce9 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java

@@ -33,13 +33,11 @@
  */

 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {

 

-  // CountryProximityScorer scorer = new CountryProximityScorer();

-//  private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();

-//  private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();

   private CountryContext countryContext;

   private Map<String, Set<Integer>> countryMentions;

   private EntityLinkerProperties linkerProperties;

   private GazateerSearcher gazateerSearcher = new GazateerSearcher();

+  private List<LinkedEntityScorer> scorers = new ArrayList<>();

   /**

    * Flag for deciding whether to search gaz only for toponyms within countries

    * that are mentioned in the document

@@ -97,11 +95,12 @@
       }

     }

 

-    List<LinkedEntityScorer<CountryContext>> scorers = new ArrayList<>();

-    scorers.add(new FuzzyStringMatchScorer());

-    scorers.add(new GeoHashBinningScorer());

-    scorers.add(new CountryProximityScorer());

-    scorers.add(new ModelBasedScorer());

+    if (scorers.isEmpty()) {

+      scorers.add(new FuzzyStringMatchScorer());

+      scorers.add(new GeoHashBinningScorer());

+      scorers.add(new CountryProximityScorer());

+      scorers.add(new ModelBasedScorer());

+    }

     for (LinkedEntityScorer scorer : scorers) {

       scorer.score(spans, doctext, sentences, linkerProperties, countryContext);

     }


diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
new file mode 100644
index 0000000..05fe374
--- /dev/null
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java

@@ -0,0 +1,146 @@
+/*

+ * Copyright 2013 The Apache Software Foundation.

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

+

+import java.io.BufferedOutputStream;

+import java.io.File;

+import java.io.FileInputStream;

+import java.io.FileOutputStream;

+import java.io.FileWriter;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.OutputStream;

+import java.util.ArrayList;

+import java.util.Collection;

+import java.util.HashMap;

+import java.util.Map;

+import java.util.Set;

+import opennlp.tools.doccat.DoccatModel;

+import opennlp.tools.doccat.DocumentCategorizerME;

+import opennlp.tools.doccat.DocumentSample;

+import opennlp.tools.doccat.DocumentSampleStream;

+import opennlp.tools.entitylinker.EntityLinkerProperties;

+import opennlp.tools.util.ObjectStream;

+import opennlp.tools.util.PlainTextByLineStream;

+import static org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.ModelBasedScorer.RADIUS;

+

+

+/**

+ *

+ * Tools for setting up GeoEntityLinker gazetteers and the doccat scoring model

+ */

+public class GeoEntityLinkerSetupUtils {

+  public static ModelBasedScorer scorer;

+

+  static {

+    scorer = new ModelBasedScorer();

+  }

+    public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){

+      GazateerIndexer indexer = new GazateerIndexer();

+      try {

+        indexer.index(outputIndexDir, gazateerInputData, type);

+      } catch (Exception ex) {

+       ex.printStackTrace();

+      }

+    }

+    /**

+   *

+   * @param documents         A list of document texts, for best results try to

+   *                          ensure each country you care about will be

+   *                          represented in the collection

+   * @param annotationOutFile the location where the annotated doccat text file

+   *                          will be stored

+   * @param modelOutFile      the location where the doccat model will be stored

+   * @param properties        the properties where the country context object

+   *                          will find its country data from this property:

+   *                          opennlp.geoentitylinker.countrycontext.filepath

+   * @throws IOException

+   */

+  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {

+    CountryContext context = new CountryContext();

+    FileWriter writer = new FileWriter(annotationOutFile, true);

+    System.out.println("processing " + documents.size() + " documents");

+    for (String docText : documents) {

+      System.out.append(".");

+      Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);

+      Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);

+      for (String key : modelCountryContext.keySet()) {

+        for (String wordbag : modelCountryContext.get(key)) {

+          writer.write(key + " " + wordbag + "\n");

+        }

+      }

+    }

+    System.out.println("Document processing complete. Writing traininf data to file");

+    writer.close();

+    System.out.println("Building Doccat model...");

+    DoccatModel model = null;

+

+    InputStream dataIn = new FileInputStream(annotationOutFile);

+    try {

+

+      ObjectStream<String> lineStream =

+              new PlainTextByLineStream(dataIn, "UTF-8");

+      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);

+

+      model = DocumentCategorizerME.train("en", sampleStream);

+      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));

+      model.serialize(modelOut);

+       System.out.println("Model complete!");

+    } catch (IOException e) {

+      // Failed to read or parse training data, training failed

+      e.printStackTrace();

+    }

+

+  }

+

+  /**

+   * generates proximal wordbags within the radius of a country mention within

+   * the doctext based on the country context object

+   *

+   *

+   * @param docText

+   * @param additionalContext

+   * @param radius

+   * @return

+   */

+  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {

+    Map<String, ArrayList< String>> featureBags = new HashMap<>();

+    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();

+    /**

+     * iterate over the map that contains a mapping of every country code to

+     * all of its mentions in the document

+     */

+    for (String code : countryMentions.keySet()) {

+      /**

+       * for each mention, collect features from around each mention, then

+       * consolidate the features into another map

+       */

+      for (int mentionIdx : countryMentions.get(code)) {

+        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);

+        //   Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));

+        if (featureBags.containsKey(code)) {

+          featureBags.get(code).add(chunk);

+        } else {

+          ArrayList<String> newlist = new ArrayList<>();

+          newlist.add(chunk);

+          featureBags.put(code, newlist);

+        }

+      }

+    }

+    return featureBags;

+  }

+

+}


diff --git a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
index 5e3abdd..e25ba07 100644
--- a/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
+++ b/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java

@@ -15,46 +15,26 @@
  */

 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

 

-import java.io.BufferedOutputStream;

 import java.io.File;

-import java.io.FileInputStream;

 import java.io.FileNotFoundException;

-import java.io.FileOutputStream;

-import java.io.FileWriter;

 import java.io.IOException;

-import java.io.InputStream;

-import java.io.OutputStream;

-import java.util.ArrayList;

-import java.util.Collection;

 import java.util.HashMap;

 import java.util.List;

 import java.util.Map;

-import java.util.Set;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.tools.doccat.BagOfWordsFeatureGenerator;

 import opennlp.tools.doccat.DoccatModel;

 import opennlp.tools.doccat.DocumentCategorizerME;

-import opennlp.tools.doccat.DocumentSample;

-import opennlp.tools.doccat.DocumentSampleStream;

 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import opennlp.tools.entitylinker.domain.BaseLink;

 import opennlp.tools.entitylinker.domain.LinkedSpan;

-import opennlp.tools.util.ObjectStream;

-import opennlp.tools.util.PlainTextByLineStream;

 import opennlp.tools.util.Span;

 

 /**

  *

- *Utilizes a doccat model to score toponyms based on surrounding context

+ * Utilizes a doccat model to score toponyms based on surrounding context

  */

 public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {

 

-  public static ModelBasedScorer scorer;

 

-  static {

-    scorer = new ModelBasedScorer();

-  }

   DocumentCategorizerME documentCategorizerME;

   DoccatModel doccatModel;

   public static final int RADIUS = 100;

@@ -68,7 +48,6 @@
           return;

         }

         doccatModel = new DoccatModel(new File(path));

-

         documentCategorizerME = new DocumentCategorizerME(doccatModel);

       }

       Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);

@@ -88,7 +67,7 @@
     } catch (IOException ex) {

       System.err.println(ex);

     } catch (Exception ex) {

-      Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);

+      System.err.println(ex);

     }

   }

 

@@ -136,7 +115,7 @@
     return featureBags;

   }

 

-  private String getTextChunk(int mentionIdx, String docText, int radius) {

+  public String getTextChunk(int mentionIdx, String docText, int radius) {

     int docSize = docText.length();

     int left = 0, right = 0;

     left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;

@@ -167,9 +146,6 @@
 

   private Map<String, Double> getScore(String text) throws Exception {

     Map<String, Double> scoreMap = new HashMap<>();

-    if (documentCategorizerME == null) {

-      documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));

-    }

     double[] categorize = documentCategorizerME.categorize(text);

     int catSize = documentCategorizerME.getNumberOfCategories();

     for (int i = 0; i < catSize; i++) {

@@ -179,88 +155,5 @@
     return scoreMap;

   }

 

-  /**

-   *

-   * @param documents         A list of document texts, for best results try to

-   *                          ensure each country you care about will be

-   *                          represented by the collection

-   * @param annotationOutFile the location where the annotated doccat text file

-   *                          will be stored

-   * @param modelOutFile      the location where the doccat model will be stored

-   * @param properties        the properties where the country context object

-   *                          will find it's country data from this property:

-   *                          opennlp.geoentitylinker.countrycontext.filepath

-   * @throws IOException

-   */

-  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {

-    CountryContext context = new CountryContext();

-    FileWriter writer = new FileWriter(annotationOutFile, true);

-    for (String docText : documents) {

-

-      Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);

-      Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);

-      for (String key : modelCountryContext.keySet()) {

-        for (String wordbag : modelCountryContext.get(key)) {

-          writer.write(key + " " + wordbag + "\n");

-        }

-      }

-    }

-

-    writer.close();

-

-    DoccatModel model = null;

-

-    InputStream dataIn = new FileInputStream(annotationOutFile);

-    try {

-

-      ObjectStream<String> lineStream =

-              new PlainTextByLineStream(dataIn, "UTF-8");

-      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);

-

-      model = DocumentCategorizerME.train("en", sampleStream);

-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));

-      model.serialize(modelOut);

-    } catch (IOException e) {

-      // Failed to read or parse training data, training failed

-      e.printStackTrace();

-    }

-

-  }

-

-  /**

-   * generates proximal wordbags within the radius of a country mention within

-   * the doctext based on the country context object

-   *

-   *

-   * @param docText

-   * @param additionalContext

-   * @param radius

-   * @return

-   */

-  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {

-    Map<String, ArrayList< String>> featureBags = new HashMap<>();

-    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();

-    /**

-     * iterator over the map that contains a mapping of every country code to

-     * all of its mentions in the document

-     */

-    for (String code : countryMentions.keySet()) {

-      /**

-       * for each mention, collect features from around each mention, then

-       * consolidate the features into another map

-       */

-      for (int mentionIdx : countryMentions.get(code)) {

-        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);

-        //   Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));

-        if (featureBags.containsKey(code)) {

-          featureBags.get(code).add(chunk);

-        } else {

-          ArrayList<String> newlist = new ArrayList<>();

-          newlist.add(chunk);

-          featureBags.put(code, newlist);

-        }

-      }

-    }

-    return featureBags;

-  }

+  

 }
commit	17332b2db33a8918cb56fb2def6bec8db11302c5	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Thu Nov 14 00:42:03 2013 +0000
committer	Mark Giaconia <markg@apache.org>	Thu Nov 14 00:42:03 2013 +0000
tree	6fdf421f73e5c42c3e4a1a6bccec896a50e31bcc
parent	8ed861d8319528010678625381afbd118b85522a [diff]