NUTCH-2956 index-geoip: dependency upgrades and improvements
- upgrade to geoip2 3.0.1
- exclude transitive dependencies (Jackson) provided as Nutch core deps
- also read GeoLite2-*.mmdb files
- review index field names in plugin and Nutch Solr schema:
  - fix typos in field names
  - remove unused fields from schema
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 7faa6fd..bb9aae1 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2112,7 +2112,8 @@
   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and
-  available at runtime.
+  available at runtime. Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb)
+  can be used.
   </description>
 </property>
 
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index 4fa6f71..2eda5a6 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -36,12 +36,11 @@
   </publications>
 
   <dependencies>
-    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
-      <!-- Exlude due to classpath issues -->
-      <exclude org="org.apache.httpcomponents" name="httpclient" />
-      <exclude org="org.apache.httpcomponents" name="httpcore" />
-      <exclude org="commons-codec" name="commons-codec" />
-      <exclude org="commons-logging" name="commons-logging" />
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="3.0.1">
+      <!-- Exclude libs provided in Nutch core -->
+      <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-core" />
     </dependency>
   </dependencies>
   
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index 6148f59..c4efadf 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -25,11 +25,8 @@
       <library name="index-geoip.jar">
          <export name="*"/>
       </library>
-      <library name="geoip2-2.12.0.jar"/>
-      <library name="jackson-annotations-2.9.5.jar"/>
-      <library name="jackson-core-2.9.5.jar"/>
-      <library name="jackson-databind-2.9.5.jar"/>
-      <library name="maxmind-db-1.2.2.jar"/>
+      <library name="geoip2-3.0.1.jar"/>
+      <library name="maxmind-db-2.0.0.jar"/>
    </runtime>
 
    <requires>
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 1c697a2..64b3862 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -17,13 +17,17 @@
 package org.apache.nutch.indexer.geoip;
 
 import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 
 import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.maxmind.geoip2.DatabaseReader;
 import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.AddressNotFoundException;
 import com.maxmind.geoip2.exception.GeoIp2Exception;
 import com.maxmind.geoip2.model.InsightsResponse;
 import com.maxmind.geoip2.model.CityResponse;
@@ -54,6 +58,9 @@
  */
 public class GeoIPDocumentCreator {
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   /**
    * Add field to document but only if value isn't null
    * @param doc the {@link NutchDocument} to augment
@@ -61,21 +68,7 @@
    * @param value the String value to associate with the target field
    */
   public static void addIfNotNull(NutchDocument doc, String name,
-      String value) {
-    if (value != null) {
-      doc.add(name, value);
-    }
-  }
-
-  /**
-   * Add field to document but only if value isn't null
-   * @param doc the {@link NutchDocument} to augment
-   * @param name the name of the target field
-   * @param value the {@link java.lang.Integer} value to 
-   * associate with the target field
-   */
-  public static void addIfNotNull(NutchDocument doc, String name,
-      Integer value) {
+      Object value) {
     if (value != null) {
       doc.add(name, value);
     }
@@ -87,7 +80,6 @@
     addIfNotNull(doc, "ip", serverIp);
     InsightsResponse response = client
         .insights(InetAddress.getByName(serverIp));
-    // CityResponse response = client.city(InetAddress.getByName(serverIp));
 
     City city = response.getCity();
     addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -103,7 +95,7 @@
     addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
     addIfNotNull(doc, "countryName", country.getName()); // 'United States'
     addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
-    addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+    addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
 
     Location location = response.getLocation();
     addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -121,7 +113,7 @@
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
     addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
-    addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
     addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
     addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
 
@@ -169,7 +161,13 @@
   public static NutchDocument createDocFromDomainDb(String serverIp,
       NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
       IOException, GeoIp2Exception {
-    DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+    DomainResponse response;
+    try {
+      response = reader.domain(InetAddress.getByName(serverIp));
+    } catch (AddressNotFoundException e) {
+      LOG.debug("IP address not found: {}", serverIp);
+      return doc;
+    }
     addIfNotNull(doc, "ip", serverIp);
     addIfNotNull(doc, "domain", response.getDomain());
     return doc;
@@ -189,7 +187,14 @@
       NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
       IOException, GeoIp2Exception {
     addIfNotNull(doc, "ip", serverIp);
-    CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+    CityResponse response;
+    try {
+      response = reader.city(InetAddress.getByName(serverIp));
+    } catch (AddressNotFoundException e) {
+      LOG.debug("IP address not found: {}", serverIp);
+      return doc;
+    }
 
     City city = response.getCity();
     addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -206,7 +211,7 @@
     addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
     addIfNotNull(doc, "countryName", country.getName()); // 'United States'
     addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
-    addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+    addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
 
     Location location = response.getLocation();
     addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -224,7 +229,7 @@
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
     addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
-    addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
     addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
     addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
     return doc;
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 4e21273..ea30b8c 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -87,7 +87,8 @@
  *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
  *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
  *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
- *   and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ *   and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`.
+ *   Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb) can be used.
  *   </description>
  * </property>
  * 
@@ -152,24 +153,29 @@
           conf.getInt("index.geoip.userid", 12345),
           conf.get("index.geoip.licensekey")).build();
     } else {
-      String db = null;
+      String dbSuffix = null;
       if (usage.equalsIgnoreCase("cityDatabase")) {
-        db = "GeoIP2-City.mmdb";
+        dbSuffix = "-City.mmdb";
       } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
-        db = "GeoIP2-Connection-Type.mmdb";
+        dbSuffix = "-Connection-Type.mmdb";
       } else if (usage.equalsIgnoreCase("domainDatabase")) {
-        db = "GeoIP2-Domain.mmdb";
+        dbSuffix = "-Domain.mmdb";
       } else if (usage.equalsIgnoreCase("ispDatabase")) {
-        db = "GeoIP2-ISP.mmdb";
+        dbSuffix = "-ISP.mmdb";
       }
-      URL dbFileUrl = conf.getResource(db);
-      if (dbFileUrl == null) {
-        LOG.error("GeoDb file {} not found on classpath", db);
-      } else {
-        try {
-          buildDb(new File(dbFileUrl.getFile()));
-        } catch (Exception e) {
-          LOG.error("Failed to read geoDb file {}: ", db, e);
+      String[] dbPrefixes = {"GeoIP2", "GeoLite2"};
+      for (String dbPrefix : dbPrefixes) {
+        String db = dbPrefix + dbSuffix;
+        URL dbFileUrl = conf.getResource(db);
+        if (dbFileUrl == null) {
+          LOG.error("GeoDb file {} not found on classpath", db);
+        } else {
+          try {
+            LOG.info("Reading GeoDb file {}", db);
+            buildDb(new File(dbFileUrl.getFile()));
+          } catch (Exception e) {
+            LOG.error("Failed to read geoDb file {}: ", db, e);
+          }
         }
       }
     }
diff --git a/src/plugin/indexer-solr/schema.xml b/src/plugin/indexer-solr/schema.xml
index 6865eb0..ba71fe1 100644
--- a/src/plugin/indexer-solr/schema.xml
+++ b/src/plugin/indexer-solr/schema.xml
@@ -356,7 +356,7 @@
     <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
     <field name="continentCode" type="string" stored="true" indexed="true" />
     <field name="continentGeoNameId" type="int" stored="true" indexed="true" />
-    <field name="contentName" type="string" stored="true" indexed="true" />
+    <field name="continentName" type="string" stored="true" indexed="true" />
     <field name="countryIsoCode" type="string" stored="true" indexed="true"/>
     <field name="countryName" type="string" stored="true" indexed="true" />
     <field name="countryConfidence" type="int" stored="true" indexed="true"/>
@@ -379,7 +379,6 @@
     <field name="org" type="string" stored="true" indexed="true" />
     <field name="userType" type="string" stored="true" indexed="true" />
     <field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
-    <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" />
     <field name="connType" type="string" stored="true" indexed="true" />
     <field name="location" type="location" stored="true" indexed="true" />