Merge pull request #476 from sebastian-nagel/NUTCH-2482-index-geoip-npe
NUTCH-2482 index-geoip not to add null values to document fields
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index aa56a68..1a5d1be 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.10.0" >
+ <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
<!-- Exlude due to classpath issues -->
<exclude org="org.apache.httpcomponents" name="httpclient" />
<exclude org="org.apache.httpcomponents" name="httpcore" />
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index 821ecc0..6148f59 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -25,10 +25,10 @@
<library name="index-geoip.jar">
<export name="*"/>
</library>
- <library name="geoip2-2.10.0.jar"/>
- <library name="jackson-annotations-2.9.0.jar"/>
- <library name="jackson-core-2.9.2.jar"/>
- <library name="jackson-databind-2.9.2.jar"/>
+ <library name="geoip2-2.12.0.jar"/>
+ <library name="jackson-annotations-2.9.5.jar"/>
+ <library name="jackson-core-2.9.5.jar"/>
+ <library name="jackson-databind-2.9.5.jar"/>
<library name="maxmind-db-1.2.2.jar"/>
</runtime>
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 8e48529..d42ccdd 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -54,66 +54,76 @@
*/
public class GeoIPDocumentCreator {
- /**
- * Default constructor.
- */
- public GeoIPDocumentCreator() {
+ /** Add field to document but only if value isn't null */
+ public static void addIfNotNull(NutchDocument doc, String name,
+ String value) {
+ if (value != null) {
+ doc.add(name, value);
+ }
+ }
+
+ /** Add field to document but only if value isn't null */
+ public static void addIfNotNull(NutchDocument doc, String name,
+ Integer value) {
+ if (value != null) {
+ doc.add(name, value);
+ }
}
public static NutchDocument createDocFromInsightsService(String serverIp,
NutchDocument doc, WebServiceClient client) throws UnknownHostException,
IOException, GeoIp2Exception {
- doc.add("ip", serverIp);
+ addIfNotNull(doc, "ip", serverIp);
InsightsResponse response = client
.insights(InetAddress.getByName(serverIp));
// CityResponse response = client.city(InetAddress.getByName(serverIp));
City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
- doc.add("cityConfidence", city.getConfidence()); // 50
- doc.add("cityGeoNameId", city.getGeoNameId());
+ addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
+ addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50
+ addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId());
Continent continent = response.getContinent();
- doc.add("continentCode", continent.getCode());
- doc.add("continentGeoNameId", continent.getGeoNameId());
- doc.add("continentName", continent.getName());
+ addIfNotNull(doc, "continentCode", continent.getCode());
+ addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId());
+ addIfNotNull(doc, "continentName", continent.getName());
Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
- doc.add("countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
+ addIfNotNull(doc, "countryName", country.getName()); // 'United States'
+ addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
+ addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+ addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
// -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
- doc.add("metroCode", location.getMetroCode());
+ addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3
+ addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago'
+ addIfNotNull(doc, "metroCode", location.getMetroCode());
Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
- doc.add("postalConfidence", postal.getConfidence()); // 40
+ addIfNotNull(doc, "postalCode", postal.getCode()); // '55455'
+ addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 40
RepresentedCountry rCountry = response.getRepresentedCountry();
- doc.add("countryType", rCountry.getType());
+ addIfNotNull(doc, "countryType", rCountry.getType());
Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
- doc.add("subDivConfidence", subdivision.getConfidence()); // 90
- doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+ addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
+ addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
+ addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
Traits traits = response.getTraits();
- doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
- doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
- doc.add("domain", traits.getDomain());
- doc.add("isp", traits.getIsp());
- doc.add("org", traits.getOrganization());
- doc.add("userType", traits.getUserType());
+ addIfNotNull(doc, "autonSystemNum", traits.getAutonomousSystemNumber());
+ addIfNotNull(doc, "autonSystemOrg", traits.getAutonomousSystemOrganization());
+ addIfNotNull(doc, "domain", traits.getDomain());
+ addIfNotNull(doc, "isp", traits.getIsp());
+ addIfNotNull(doc, "org", traits.getOrganization());
+ addIfNotNull(doc, "userType", traits.getUserType());
//for better results, users should upgrade to
//https://www.maxmind.com/en/solutions/geoip2-enterprise-product-suite/anonymous-ip-database
- doc.add("isAnonProxy", traits.isAnonymousProxy());
+ addIfNotNull(doc, "isAnonProxy", String.valueOf(traits.isAnonymousProxy()));
return doc;
}
@@ -137,11 +147,11 @@
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
IspResponse response = reader.isp(InetAddress.getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("autonSystemNum", response.getAutonomousSystemNumber());
- doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
- doc.add("isp", response.getIsp());
- doc.add("org", response.getOrganization());
+ addIfNotNull(doc, "ip", serverIp);
+ addIfNotNull(doc, "autonSystemNum", response.getAutonomousSystemNumber());
+ addIfNotNull(doc, "autonSystemOrg", response.getAutonomousSystemOrganization());
+ addIfNotNull(doc, "isp", response.getIsp());
+ addIfNotNull(doc, "org", response.getOrganization());
return doc;
}
@@ -149,8 +159,8 @@
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("domain", response.getDomain());
+ addIfNotNull(doc, "ip", serverIp);
+ addIfNotNull(doc, "domain", response.getDomain());
return doc;
}
@@ -159,52 +169,53 @@
IOException, GeoIp2Exception {
ConnectionTypeResponse response = reader.connectionType(InetAddress
.getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("connType", response.getConnectionType().toString());
+ addIfNotNull(doc, "ip", serverIp);
+ addIfNotNull(doc, "connType", response.getConnectionType().toString());
return doc;
}
public static NutchDocument createDocFromCityDb(String serverIp,
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
- doc.add("ip", serverIp);
+ addIfNotNull(doc, "ip", serverIp);
CityResponse response = reader.city(InetAddress.getByName(serverIp));
City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
- doc.add("cityConfidence", city.getConfidence()); // 50
- doc.add("cityGeoNameId", city.getGeoNameId());
+ addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
+ addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50
+ addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId());
+
Continent continent = response.getContinent();
- doc.add("continentCode", continent.getCode());
- doc.add("continentGeoNameId", continent.getGeoNameId());
- doc.add("continentName", continent.getName());
+ addIfNotNull(doc, "continentCode", continent.getCode());
+ addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId());
+ addIfNotNull(doc, "continentName", continent.getName());
Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
- doc.add("countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
+ addIfNotNull(doc, "countryName", country.getName()); // 'United States'
+ addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
+ addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+ addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
// -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
- doc.add("metroCode", location.getMetroCode());
+ addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3
+ addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago'
+ addIfNotNull(doc, "metroCode", location.getMetroCode());
Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
- doc.add("postalConfidence", postal.getConfidence()); // 40
+ addIfNotNull(doc, "postalCode", postal.getCode()); // '55455'
+ addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 40
RepresentedCountry rCountry = response.getRepresentedCountry();
- doc.add("countryType", rCountry.getType());
+ addIfNotNull(doc, "countryType", rCountry.getType());
Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
- doc.add("subDivConfidence", subdivision.getConfidence()); // 90
- doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+ addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
+ addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
+ addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
return doc;
}
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 8bdc9fa..0d19aec 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -17,6 +17,7 @@
package org.apache.nutch.indexer.geoip;
import java.lang.invoke.MethodHandles;
+import java.net.URL;
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
@@ -118,8 +119,6 @@
private String usage = null;
- private File geoDb = null;
-
WebServiceClient client = null;
DatabaseReader reader = null;
@@ -146,49 +145,45 @@
@Override
public void setConf(Configuration conf) {
this.conf = conf;
- String use = conf.get("index.geoip.usage", "insightsService");
- LOG.debug("GeoIP usage medium set to: {}", use);
- if (use.equalsIgnoreCase("cityDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ usage = conf.get("index.geoip.usage", "insightsService");
+ LOG.debug("GeoIP usage medium set to: {}", usage);
+ if (usage.equalsIgnoreCase("insightsService")) {
+ client = new WebServiceClient.Builder(
+ conf.getInt("index.geoip.userid", 12345),
+ conf.get("index.geoip.licensekey")).build();
+ } else {
+ String db = null;
+ if (usage.equalsIgnoreCase("cityDatabase")) {
+ db = "GeoIP2-City.mmdb";
+ } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+ db = "GeoIP2-Connection-Type.mmdb";
+ } else if (usage.equalsIgnoreCase("domainDatabase")) {
+ db = "GeoIP2-Domain.mmdb";
+ } else if (usage.equalsIgnoreCase("ispDatabase")) {
+ db = "GeoIP2-ISP.mmdb";
}
- } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
- .getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ URL dbFileUrl = conf.getResource(db);
+ if (dbFileUrl == null) {
+ LOG.error("GeoDb file {} not found on classpath", db);
+ } else {
+ try {
+ buildDb(new File(dbFileUrl.getFile()));
+ } catch (Exception e) {
+ LOG.error("Failed to read geoDb file {}: ", db, e);
+ }
}
- } else if (use.equalsIgnoreCase("domainDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("ispDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("insightsService")) {
- client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
- 12345), conf.get("index.geoip.licensekey")).build();
}
- usage = use;
+ if (!conf.getBoolean("store.ip.address", false)) {
+ LOG.warn("Plugin index-geoip is active but IP address is not stored"
+ + "(store.ip.address == false)");
+ }
}
- private void buildDb() {
+ private void buildDb(File geoDb) {
try {
reader = new DatabaseReader.Builder(geoDb).build();
} catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Failed to build geoDb:", e);
}
}
@@ -207,30 +202,25 @@
private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
String url) {
- if (conf.getBoolean("store.ip.address", false) == true) {
+ String serverIp = data.getContentMeta().get("_ip_");
+ if (serverIp != null && reader != null) {
try {
- String serverIp = data.getContentMeta().get("_ip_");
- if (serverIp != null) {
- if (usage.equalsIgnoreCase("cityDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("domainDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("ispDatabase")) {
- doc = GeoIPDocumentCreator
- .createDocFromIspDb(serverIp, doc, reader);
- } else if (usage.equalsIgnoreCase("insightsService")) {
- doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
- doc, client);
- }
+ if (usage.equalsIgnoreCase("cityDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader);
+ } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+ reader);
+ } else if (usage.equalsIgnoreCase("domainDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+ reader);
+ } else if (usage.equalsIgnoreCase("ispDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader);
+ } else if (usage.equalsIgnoreCase("insightsService")) {
+ doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc,
+ client);
}
} catch (Exception e) {
- LOG.error(e.getMessage());
- e.printStackTrace();
+ LOG.error("Failed to determine geoip:", e);
}
}
return doc;