OPENNLP-706 Addressed issues from Joern's code review. Also made use of the hierarchy field configurable, and added index-time boosting for administrative boundary types and populated place types so that these hits are weighted more heavily in the index.

commit: 36759584cec4a4ed6126c038dbb48101d9fb3de3 [log] [tgz]
author: Mark Giaconia <markg@apache.org> Wed Aug 13 12:28:23 2014 +0000
committer: Mark Giaconia <markg@apache.org> Wed Aug 13 12:28:23 2014 +0000
tree: ba34ee71199afacb2d024083b66678a7b169e68e
parent: 33446b94d0612eadb13492b3ea7e5f9a3d245595 [diff]
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
index c09afbd..5f1d149 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java

@@ -140,9 +140,7 @@
    */

   private AdminBoundaryContext process(String text) {

     try {

-      if (text.contains("Convoy of terror")) {

-        System.out.println("");

-      }

+    

       reset();

       Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);

       if (!countryhitMap.isEmpty()) {

@@ -282,7 +280,7 @@
         if (name == null) {

           continue;

         }

-        name = "[^\\p{L}\\p{Nd}]" + name.replace(", the", "") + "[^\\p{L}\\p{Nd}]";

+        name = "(^|[^\\p{L}\\p{Nd}])" + name.replace(", the", "") + "([^\\p{L}\\p{Nd}]|$)";

         Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

         Matcher rs = regex.matcher(docText);

         String code = entry.toLowerCase();


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
index 9a8be47..22211d7 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java

@@ -38,7 +38,6 @@
 import opennlp.tools.entitylinker.EntityLinkerProperties;

 import org.apache.log4j.Logger;

 import org.apache.lucene.analysis.util.CharArraySet;

-import org.apache.lucene.search.Sort;

 

 /**

  *

@@ -49,10 +48,13 @@
  */

 public class GazetteerSearcher {

 

+  //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";

+

   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";

   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);

   private double scoreCutoff = .70;

-  private boolean doubleQuoteAllSearchTerms = false;

+  private boolean doubleQuoteAllSearchTerms = true;

+  private boolean useHierarchyField = false;

   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));

   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);

   private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);

@@ -85,13 +87,16 @@
     this.properties = properties;

     init();

   }

-/**

- * Searches the single lucene index that includes the location hierarchy.

- * @param searchString the location name to search for

- * @param rowsReturned how many index entries to return (top N...)

- * @param whereClause the conditional statement that defines the index type and the country oode.

- * @return 

- */

+

+  /**

+   * Searches the single lucene index that includes the location hierarchy.

+   *

+   * @param searchString the location name to search for

+   * @param rowsReturned how many index entries to return (top N...)

+   * @param whereClause the conditional statement that defines the index type

+   * and the country code.

+   * @return

+   */

   public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

     searchString = cleanInput(searchString);

@@ -104,12 +109,11 @@
        * case the code variables will be empty strings

        */

       String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;

-      if (searchString.trim().contains(" ")) {

+      if (searchString.trim().contains(" ") && useHierarchyField) {

         placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"

                 + " AND " + whereClause;

       }

-

-      //  luceneQueryString = "hierarchy:(tampa florida) AND gazsource:usgs";

+       

       /**

        * check the cache and go no further if the records already exist

        */

@@ -123,14 +127,13 @@
        */

       QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);

       Query q = parser.parse(placeNameQueryString);

-      

-      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned, Sort.RELEVANCE);

-  

+

+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);

+

       for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {

         GazetteerEntry entry = new GazetteerEntry();

         int docId = bestDocs.scoreDocs[i].doc;

         double sc = bestDocs.scoreDocs[i].score;

-

         entry.getScoreMap().put("lucene", sc);

         entry.setIndexID(docId + "");

 

@@ -165,23 +168,23 @@
          */

         int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();

 

-        Double normLev = Math.abs(1-(sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();

+        Double normLev = Math.abs(1 - (sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();

         /**

          * only want hits above the levenstein thresh. This should be a low

          * thresh due to the use of the hierarchy field in the index

          */

-        if (normLev.compareTo(scoreCutoff) >= 0) {

-//          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {

-          entry.getScoreMap().put("normlucene", normLev);

-          //make sure we don't produce a duplicate

-          if (!linkedData.contains(entry)) {

-            linkedData.add(entry);

-            /**

-             * add the records to the cache for this query

-             */

-            GazetteerSearchCache.put(placeNameQueryString, linkedData);

+        if (normLev > scoreCutoff) {

+          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {

+            entry.getScoreMap().put("normlucene", normLev);

+            //make sure we don't produce a duplicate

+            if (!linkedData.contains(entry)) {

+              linkedData.add(entry);

+              /**

+               * add the records to the cache for this query

+               */

+              GazetteerSearchCache.put(placeNameQueryString, linkedData);

+            }

           }

-//          }

         }

       }

 

@@ -311,7 +314,7 @@
    *

    * @return

    */

-    @Deprecated

+  @Deprecated

   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {

     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();

     searchString = cleanInput(searchString);

@@ -406,6 +409,7 @@
    */

   private String cleanInput(String input) {

     String output = input.replaceAll(REGEX_CLEAN, " ").trim();

+    output = output.replace("  ", " ");

     if (doubleQuoteAllSearchTerms) {

       return "\"" + output + "\"";

     } else {

@@ -415,56 +419,35 @@
   }

 

   private void init() throws Exception {

-//    if (usgsIndex == null) {

-//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");

-//      if (indexloc.equals("")) {

-//        // System.out.println("USGS Gaz location not found");

-//        LOGGER.error(new Exception("USGS Gaz location not found"));

-//      }

-//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

-//

-//      scoreCutoff = Double.valueOf(cutoff);

-//      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));

-//      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);

-//      usgsIndex = new MMapDirectory(new File(indexloc));

-//      usgsReader = DirectoryReader.open(usgsIndex);

-//      usgsSearcher = new IndexSearcher(usgsReader);

-//      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-//    }

-//    if (geonamesIndex == null) {

-//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");

-//      if (indexloc.equals("")) {

-//        LOGGER.error(new Exception("Geonames Gaz location not found"));

-//

-//      }

-//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

-//      scoreCutoff = Double.valueOf(cutoff);

-//      geonamesIndex = new MMapDirectory(new File(indexloc));

-//      geonamesReader = DirectoryReader.open(geonamesIndex);

-//      geonamesSearcher = new IndexSearcher(geonamesReader);

-//      //TODO: a language code switch statement should be employed here at some point

-//      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

-//

-//    }

+

     if (opennlpIndex == null) {

       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");

       if (indexloc.equals("")) {

         LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));

 

       }

-      //  String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

-      //  scoreCutoff = Double.valueOf(cutoff);

+

       opennlpIndex = new MMapDirectory(new File(indexloc));

       opennlpReader = DirectoryReader.open(opennlpIndex);

       opennlpSearcher = new IndexSearcher(opennlpReader);

       //TODO: a language code switch statement should be employed here at some point

       opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));

+      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));

+      String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));

+      if (cutoff != null && !cutoff.isEmpty()) {

+        scoreCutoff = Double.valueOf(cutoff);

+      }

+      if (usehierarchy != null && !usehierarchy.isEmpty()) {

+        useHierarchyField = Boolean.valueOf(usehierarchy);

+      }

+      //  opennlp.geoentitylinker.gaz.doublequote=false

+      //opennlp.geoentitylinker.gaz.hierarchyfield=false

 

     }

   }

 

   private String formatForHierarchy(String searchTerm) {

-    String[] parts = searchTerm.split(" ");

+    String[] parts = cleanInput(searchTerm).split(" ");

     String out = "";

     if (parts.length != 0) {

       for (String string : parts) {


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
index 367a082..f608780 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

@@ -15,6 +15,7 @@
  */

 package opennlp.addons.geoentitylinker;

 

+import java.io.IOException;

 import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;

 import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;

 import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;

@@ -65,10 +66,12 @@
           for (String whereclause : context.getWhereClauses()) {

             geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));

           }

-        }else{//this means there were no where clauses generated so the where clause will default to look at the entire index

-          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));

+        } else {//this means there were no where clauses generated so the where clause will default to look at the entire index

+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:usgs geonames regions "));

         }

-        //start generating queries

+        if (geoNamesEntries.isEmpty()) {

+          continue;

+        }

         LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);

         newspan.setSearchTerm(matches[i]);

         newspan.setLinkedEntries(geoNamesEntries);

@@ -93,19 +96,19 @@
       scorers.add(new CountryProximityScorer());

       scorers.add(new ModelBasedScorer());

       scorers.add(new FuzzyStringMatchScorer());

-     // scorers.add(new ProvinceProximityScorer());

+      // scorers.add(new ProvinceProximityScorer());

     }

   }

 

   @Override

-  public void init(EntityLinkerProperties properties) {

+  public void init(EntityLinkerProperties properties) throws IOException {

     try {

       this.linkerProperties = properties;

       countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);

       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);

       loadScorers();

     } catch (Exception ex) {

-      throw new RuntimeException(ex);

+      throw new IOException(ex);

     }

   }

 


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
deleted file mode 100644
index 63cb88c..0000000
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
+++ /dev/null

@@ -1,163 +0,0 @@
-/*

- * Copyright 2013 The Apache Software Foundation.

- *

- * Licensed under the Apache License, Version 2.0 (the "License");

- * you may not use this file except in compliance with the License.

- * You may obtain a copy of the License at

- *

- *      http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-package opennlp.addons.geoentitylinker.indexing;

-

-import java.io.BufferedOutputStream;

-import java.io.File;

-import java.io.FileInputStream;

-import java.io.FileOutputStream;

-import java.io.FileWriter;

-import java.io.IOException;

-import java.io.InputStream;

-import java.io.OutputStream;

-import java.util.ArrayList;

-import java.util.Collection;

-import java.util.HashMap;

-import java.util.Map;

-import java.util.Set;

-import opennlp.addons.geoentitylinker.AdminBoundaryContextGenerator;

-import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;

-

-import opennlp.tools.doccat.DoccatModel;

-import opennlp.tools.doccat.DocumentCategorizerME;

-import opennlp.tools.doccat.DocumentSample;

-import opennlp.tools.doccat.DocumentSampleStream;

-import opennlp.tools.entitylinker.EntityLinkerProperties;

-import opennlp.tools.util.ObjectStream;

-import opennlp.tools.util.PlainTextByLineStream;

-

-/**

- *

- * Tools for setting up GeoEntityLinker gazateers and doccat scoring model

- */

-@Deprecated

-public class GeoEntityLinkerSetupUtils {

-  private static final int RADIUS = 200;

-  public static ModelBasedScorer scorer;

-

-  static {

-    scorer = new ModelBasedScorer();

-  }

-

-  /**

-   * Generates the lucene indexes of the USGS and GEONAMES gazateers.

-   *

-   * @param outputIndexDir    the destination directory of the index. Must be a

-   *                          directory

-   * @param gazateerInputData the input data file. Must be in geonames gaz

-   *                          format, or USGS format

-   * @param type              the type, USGS, or GEONAMES

-   */

-  public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazetteerIndexer.GazType type) {

-    GazetteerIndexer indexer = new GazetteerIndexer();

-    try {

-      indexer.index(outputIndexDir, gazateerInputData, type);

-    } catch (Exception ex) {

-      ex.printStackTrace();

-    }

-  }

-

-  /**

-   * Generates a doccat model from proximal features generated from surrounding

-   * context of country mentions. This model is used as a basis for a score

-   * called coutrymodel, which takes the context from around a toponym, and uses

-   * this model to return a score for the country code of the toponym hit in the

-   * gazateer.

-   *

-   * @param documents         A list of document texts, for best results try to

-   *                          ensure each country you care about will be well

-   *                          represented in the collection

-   * @param annotationOutFile the location where the annotated doccat text file

-   *                          will be stored

-   * @param modelOutFile      the location where the doccat model will be stored

-   * @param properties        the properties where the country context object

-   *                          will find it's country data from this property:

-   *                          opennlp.geoentitylinker.countrycontext.filepath

-   * @throws IOException

-   */

-  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {

-    AdminBoundaryContextGenerator context = new AdminBoundaryContextGenerator(properties);

-    FileWriter writer = new FileWriter(annotationOutFile, true);

-    System.out.println("processing " + documents.size() + " documents");

-    for (String docText : documents) {

-      System.out.append(".");

-      Map<String, Set<Integer>> regexfind = context.regexfind(docText);

-      Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);

-      for (String key : modelCountryContext.keySet()) {

-        for (String wordbag : modelCountryContext.get(key)) {

-          writer.write(key + " " + wordbag + "\n");

-        }

-      }

-    }

-    System.out.println("Document processing complete. Writing training data to " + annotationOutFile.getAbsolutePath());

-    writer.close();

-    System.out.println("Building Doccat model...");

-    DoccatModel model = null;

-

-    InputStream dataIn = new FileInputStream(annotationOutFile);

-    try {

-    

-      ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");

-      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);

-

-      model = DocumentCategorizerME.train("en", sampleStream);

-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));

-      model.serialize(modelOut);

-      System.out.println("Model complete!");

-    } catch (IOException e) {

-      // Failed to read or parse training data, training failed

-      e.printStackTrace();

-    }

-

-  }

-

-  /**

-   * generates proximal wordbags within the radius of a country mention within

-   * the doctext based on the country context object

-   *

-   *

-   * @param docText

-   * @param additionalContext

-   * @param radius

-   * @return

-   */

-  private static Map<String, ArrayList<String>> modelCountryContext(String docText, AdminBoundaryContextGenerator additionalContext, int radius) {

-    Map<String, ArrayList< String>> featureBags = new HashMap<>();

-    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();

-    /**

-     * iterator over the map that contains a mapping of every country code to

-     * all of its mentions in the document

-     */

-    for (String code : countryMentions.keySet()) {

-      /**

-       * for each mention, collect features from around each mention, then

-       * consolidate the features into another map

-       */

-      for (int mentionIdx : countryMentions.get(code)) {

-        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);

-        //   Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));

-        if (featureBags.containsKey(code)) {

-          featureBags.get(code).add(chunk);

-        } else {

-          ArrayList<String> newlist = new ArrayList<>();

-          newlist.add(chunk);

-          featureBags.put(code, newlist);

-        }

-      }

-    }

-    return featureBags;

-  }

-}


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
index bd73bb9..c529676 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java

@@ -20,7 +20,6 @@
 import java.io.FileReader;

 import java.io.FileWriter;

 import java.io.IOException;

-import java.util.ArrayList;

 import java.util.Arrays;

 import java.util.HashMap;

 import java.util.HashSet;

@@ -164,25 +163,30 @@
         // System.out.println(line);

 

       }

-      writer.close();

+

     } catch (IOException ex) {

       ex.printStackTrace();

     }

     System.out.println("successfully wrote Geonames entries to country oontext file");

   }

 

- /**

-  * 

-  * @param gazateerInputData the Geonames allCounties.txt file

-  * @param type the types of gaz entry, usgs, geonames, or regions

-  * @param adms the province info

-  * @param countrycodes the country code info

-  * @param w the lucene index writer

-  * @throws Exception 

-  */

+  /**

+   *

+   * @param gazateerInputData the Geonames allCountries.txt file

+   * @param type the types of gaz entry, usgs, geonames, or regions

+   * @param adms the province info

+   * @param countrycodes the country code info

+   * @param w the lucene index writer

+   * @throws Exception

+   */

   public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {

 

     BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));

+    String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");

+    Map<String, Float> boostMap = new HashMap<>();

+    for (String boost : boosts) {

+      boostMap.put(boost.toLowerCase(), 10f);

+    }

     String[] fieldStrings = new String[]{

       "geonameid",

       "name",

@@ -225,7 +229,7 @@
       String placeName = values[2];

       String lat = values[4];

       String lon = values[5];

-      String dsg = values[7];

+      String dsg = values[7].toLowerCase();

       String id = values[0];

       String concatIndexEntry = "";

       if (adm != null) {

@@ -255,13 +259,20 @@
       doc.add(new TextField("placename", placeName, Field.Store.YES));

       doc.add(new TextField("latitude", lat, Field.Store.YES));

       doc.add(new TextField("longitude", lon, Field.Store.YES));

-      doc.add(new TextField("loctype", dsg, Field.Store.YES));

+      if (boostMap.containsKey(dsg)) {

+        TextField f = new TextField("loctype", dsg, Field.Store.YES);

+        f.setBoost(boostMap.get(dsg));

+        doc.add(f);

+      } else {

+        doc.add(new TextField("loctype", dsg, Field.Store.YES));

+      }

       doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));

       doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));

       doc.add(new TextField("countycode", "", Field.Store.YES));

 

       doc.add(new TextField("locid", id, Field.Store.YES));

       doc.add(new TextField("gazsource", "geonames", Field.Store.YES));

+

       w.addDocument(doc);

 

       counter++;

@@ -272,7 +283,7 @@
 

     }

 

-    System.out.println("Completed indexing gaz! index name is: " + type.toString());

+    System.out.println("Completed indexing geonames gaz! index name is: " + type.toString());

   }

 

 }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
index 3b667cf..6e5b974 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java

@@ -21,10 +21,6 @@
 import java.io.FileWriter;

 import java.util.ArrayList;

 import java.util.List;

-import java.util.Map;

-import java.util.logging.Level;

-import java.util.logging.Logger;

-import opennlp.addons.geoentitylinker.AdminBoundary;

 import org.apache.lucene.document.Document;

 import org.apache.lucene.document.Field;

 import org.apache.lucene.document.TextField;


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index cdb5ed2..8240bfd 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java

@@ -179,7 +179,6 @@
       ///  System.out.println(line);

 

       }

-      writer.close();

     } catch (IOException ex) {

       Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);

     }


diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index 9101829..ce1bf45 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java

@@ -61,7 +61,7 @@
    * @return

    */

   public double getDiceCoefficient(String s1, String s2, int nGrams) {

-    if (s1.equals("") || s1.equals("")) {

+    if (s1.isEmpty() || s2.isEmpty()) {

       return 0d;

     }

     List<String> s1Grams = new ArrayList<>();
commit	36759584cec4a4ed6126c038dbb48101d9fb3de3	[log] [tgz]
author	Mark Giaconia <markg@apache.org>	Wed Aug 13 12:28:23 2014 +0000
committer	Mark Giaconia <markg@apache.org>	Wed Aug 13 12:28:23 2014 +0000
tree	ba34ee71199afacb2d024083b66678a7b169e68e
parent	33446b94d0612eadb13492b3ea7e5f9a3d245595 [diff]