Merge pull request #476 from sebastian-nagel/NUTCH-2482-index-geoip-npe

NUTCH-2482 index-geoip not to add null values to document fields
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index aa56a68..1a5d1be 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.10.0" >
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
       <!-- Exlude due to classpath issues -->
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index 821ecc0..6148f59 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -25,10 +25,10 @@
       <library name="index-geoip.jar">
          <export name="*"/>
       </library>
-      <library name="geoip2-2.10.0.jar"/>
-      <library name="jackson-annotations-2.9.0.jar"/>
-      <library name="jackson-core-2.9.2.jar"/>
-      <library name="jackson-databind-2.9.2.jar"/>
+      <library name="geoip2-2.12.0.jar"/>
+      <library name="jackson-annotations-2.9.5.jar"/>
+      <library name="jackson-core-2.9.5.jar"/>
+      <library name="jackson-databind-2.9.5.jar"/>
       <library name="maxmind-db-1.2.2.jar"/>
    </runtime>
 
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 8e48529..d42ccdd 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -54,66 +54,76 @@
  */
 public class GeoIPDocumentCreator {
 
-  /**
-   * Default constructor.
-   */
-  public GeoIPDocumentCreator() {
+  /** Add field to document but only if value isn't null */
+  public static void addIfNotNull(NutchDocument doc, String name,
+      String value) {
+    if (value != null) {
+      doc.add(name, value);
+    }
+  }
+
+  /** Add field to document but only if value isn't null */
+  public static void addIfNotNull(NutchDocument doc, String name,
+      Integer value) {
+    if (value != null) {
+      doc.add(name, value);
+    }
   }
 
   public static NutchDocument createDocFromInsightsService(String serverIp,
       NutchDocument doc, WebServiceClient client) throws UnknownHostException,
       IOException, GeoIp2Exception {
-    doc.add("ip", serverIp);
+    addIfNotNull(doc, "ip", serverIp);
     InsightsResponse response = client
         .insights(InetAddress.getByName(serverIp));
     // CityResponse response = client.city(InetAddress.getByName(serverIp));
 
     City city = response.getCity();
-    doc.add("cityName", city.getName()); // 'Minneapolis'
-    doc.add("cityConfidence", city.getConfidence()); // 50
-    doc.add("cityGeoNameId", city.getGeoNameId());
+    addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
+    addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50
+    addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId());
 
     Continent continent = response.getContinent();
-    doc.add("continentCode", continent.getCode());
-    doc.add("continentGeoNameId", continent.getGeoNameId());
-    doc.add("continentName", continent.getName());
+    addIfNotNull(doc, "continentCode", continent.getCode());
+    addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId());
+    addIfNotNull(doc, "continentName", continent.getName());
 
     Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
-    doc.add("countryName", country.getName()); // 'United States'
-    doc.add("countryConfidence", country.getConfidence()); // 99
-    doc.add("countryGeoName", country.getGeoNameId());
+    addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
+    addIfNotNull(doc, "countryName", country.getName()); // 'United States'
+    addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
+    addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
 
     Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+    addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
                                                                                // -93.2323
-    doc.add("accRadius", location.getAccuracyRadius()); // 3
-    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
-    doc.add("metroCode", location.getMetroCode());
+    addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3
+    addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago'
+    addIfNotNull(doc, "metroCode", location.getMetroCode());
 
     Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode()); // '55455'
-    doc.add("postalConfidence", postal.getConfidence()); // 40
+    addIfNotNull(doc, "postalCode", postal.getCode()); // '55455'
+    addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 40
 
     RepresentedCountry rCountry = response.getRepresentedCountry();
-    doc.add("countryType", rCountry.getType());
+    addIfNotNull(doc, "countryType", rCountry.getType());
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
-    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
-    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+    addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
+    addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
+    addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
 
     Traits traits = response.getTraits();
-    doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
-    doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
-    doc.add("domain", traits.getDomain());
-    doc.add("isp", traits.getIsp());
-    doc.add("org", traits.getOrganization());
-    doc.add("userType", traits.getUserType());
+    addIfNotNull(doc, "autonSystemNum", traits.getAutonomousSystemNumber());
+    addIfNotNull(doc, "autonSystemOrg", traits.getAutonomousSystemOrganization());
+    addIfNotNull(doc, "domain", traits.getDomain());
+    addIfNotNull(doc, "isp", traits.getIsp());
+    addIfNotNull(doc, "org", traits.getOrganization());
+    addIfNotNull(doc, "userType", traits.getUserType());
     //for better results, users should upgrade to
     //https://www.maxmind.com/en/solutions/geoip2-enterprise-product-suite/anonymous-ip-database
-    doc.add("isAnonProxy", traits.isAnonymousProxy());
+    addIfNotNull(doc, "isAnonProxy", String.valueOf(traits.isAnonymousProxy()));
     return doc;
   }
 
@@ -137,11 +147,11 @@
       NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
       IOException, GeoIp2Exception {
     IspResponse response = reader.isp(InetAddress.getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("autonSystemNum", response.getAutonomousSystemNumber());
-    doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
-    doc.add("isp", response.getIsp());
-    doc.add("org", response.getOrganization());
+    addIfNotNull(doc, "ip", serverIp);
+    addIfNotNull(doc, "autonSystemNum", response.getAutonomousSystemNumber());
+    addIfNotNull(doc, "autonSystemOrg", response.getAutonomousSystemOrganization());
+    addIfNotNull(doc, "isp", response.getIsp());
+    addIfNotNull(doc, "org", response.getOrganization());
     return doc;
   }
 
@@ -149,8 +159,8 @@
       NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
       IOException, GeoIp2Exception {
     DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("domain", response.getDomain());
+    addIfNotNull(doc, "ip", serverIp);
+    addIfNotNull(doc, "domain", response.getDomain());
     return doc;
   }
 
@@ -159,52 +169,53 @@
       IOException, GeoIp2Exception {
     ConnectionTypeResponse response = reader.connectionType(InetAddress
         .getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("connType", response.getConnectionType().toString());
+    addIfNotNull(doc, "ip", serverIp);
+    addIfNotNull(doc, "connType", response.getConnectionType().toString());
     return doc;
   }
 
   public static NutchDocument createDocFromCityDb(String serverIp,
       NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
       IOException, GeoIp2Exception {
-    doc.add("ip", serverIp);
+    addIfNotNull(doc, "ip", serverIp);
     CityResponse response = reader.city(InetAddress.getByName(serverIp));
 
     City city = response.getCity();
-    doc.add("cityName", city.getName()); // 'Minneapolis'
-    doc.add("cityConfidence", city.getConfidence()); // 50
-    doc.add("cityGeoNameId", city.getGeoNameId());
+    addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
+    addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50
+    addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId());
+
 
     Continent continent = response.getContinent();
-    doc.add("continentCode", continent.getCode());
-    doc.add("continentGeoNameId", continent.getGeoNameId());
-    doc.add("continentName", continent.getName());
+    addIfNotNull(doc, "continentCode", continent.getCode());
+    addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId());
+    addIfNotNull(doc, "continentName", continent.getName());
 
     Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
-    doc.add("countryName", country.getName()); // 'United States'
-    doc.add("countryConfidence", country.getConfidence()); // 99
-    doc.add("countryGeoName", country.getGeoNameId());
+    addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
+    addIfNotNull(doc, "countryName", country.getName()); // 'United States'
+    addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
+    addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
 
     Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+    addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
                                                                                // -93.2323
-    doc.add("accRadius", location.getAccuracyRadius()); // 3
-    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
-    doc.add("metroCode", location.getMetroCode());
+    addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3
+    addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago'
+    addIfNotNull(doc, "metroCode", location.getMetroCode());
 
     Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode()); // '55455'
-    doc.add("postalConfidence", postal.getConfidence()); // 40
+    addIfNotNull(doc, "postalCode", postal.getCode()); // '55455'
+    addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 40
 
     RepresentedCountry rCountry = response.getRepresentedCountry();
-    doc.add("countryType", rCountry.getType());
+    addIfNotNull(doc, "countryType", rCountry.getType());
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
-    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
-    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+    addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
+    addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
+    addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
     return doc;
   }
 
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 8bdc9fa..0d19aec 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.indexer.geoip;
 
 import java.lang.invoke.MethodHandles;
+import java.net.URL;
 import java.io.File;
 import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
@@ -118,8 +119,6 @@
 
   private String usage = null;
 
-  private File geoDb = null;
-
   WebServiceClient client = null;
 
   DatabaseReader reader = null;
@@ -146,49 +145,45 @@
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    String use = conf.get("index.geoip.usage", "insightsService");
-    LOG.debug("GeoIP usage medium set to: {}", use);
-    if (use.equalsIgnoreCase("cityDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    usage = conf.get("index.geoip.usage", "insightsService");
+    LOG.debug("GeoIP usage medium set to: {}", usage);
+    if (usage.equalsIgnoreCase("insightsService")) {
+      client = new WebServiceClient.Builder(
+          conf.getInt("index.geoip.userid", 12345),
+          conf.get("index.geoip.licensekey")).build();
+    } else {
+      String db = null;
+      if (usage.equalsIgnoreCase("cityDatabase")) {
+        db = "GeoIP2-City.mmdb";
+      } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+        db = "GeoIP2-Connection-Type.mmdb";
+      } else if (usage.equalsIgnoreCase("domainDatabase")) {
+        db = "GeoIP2-Domain.mmdb";
+      } else if (usage.equalsIgnoreCase("ispDatabase")) {
+        db = "GeoIP2-ISP.mmdb";
       }
-    } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
-            .getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      URL dbFileUrl = conf.getResource(db);
+      if (dbFileUrl == null) {
+        LOG.error("GeoDb file {} not found on classpath", db);
+      } else {
+        try {
+          buildDb(new File(dbFileUrl.getFile()));
+        } catch (Exception e) {
+          LOG.error("Failed to read geoDb file {}: ", db, e);
+        }
       }
-    } else if (use.equalsIgnoreCase("domainDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("ispDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("insightsService")) {
-      client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
-          12345), conf.get("index.geoip.licensekey")).build();
     }
-    usage = use;
+    if (!conf.getBoolean("store.ip.address", false)) {
+      LOG.warn("Plugin index-geoip is active but IP address is not stored"
+          + "(store.ip.address == false)");
+    }
   }
 
-  private void buildDb() {
+  private void buildDb(File geoDb) {
     try {
       reader = new DatabaseReader.Builder(geoDb).build();
     } catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Failed to build geoDb:", e);
     }
   }
 
@@ -207,30 +202,25 @@
   private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
       String url) {
 
-    if (conf.getBoolean("store.ip.address", false) == true) {
+    String serverIp = data.getContentMeta().get("_ip_");
+    if (serverIp != null && reader != null) {
       try {
-        String serverIp = data.getContentMeta().get("_ip_");
-        if (serverIp != null) {
-          if (usage.equalsIgnoreCase("cityDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("domainDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("ispDatabase")) {
-            doc = GeoIPDocumentCreator
-                .createDocFromIspDb(serverIp, doc, reader);
-          } else if (usage.equalsIgnoreCase("insightsService")) {
-            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
-                doc, client);
-          }
+        if (usage.equalsIgnoreCase("cityDatabase")) {
+          doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader);
+        } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+          doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+              reader);
+        } else if (usage.equalsIgnoreCase("domainDatabase")) {
+          doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+              reader);
+        } else if (usage.equalsIgnoreCase("ispDatabase")) {
+          doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader);
+        } else if (usage.equalsIgnoreCase("insightsService")) {
+          doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc,
+              client);
         }
       } catch (Exception e) {
-        LOG.error(e.getMessage());
-        e.printStackTrace();
+        LOG.error("Failed to determine geoip:", e);
       }
     }
     return doc;