Merge pull request #697 from sebastian-nagel/NUTCH-2896-okhttp-connection-pool
NUTCH-2896 Protocol-okhttp: make connection pool configurable
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6a3c828..1ad02a0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2133,7 +2133,8 @@
'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and
- available at runtime.
+ available at runtime. Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb)
+ can be used.
</description>
</property>
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index a03bce4..175443e 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -36,10 +36,10 @@
</publications>
<dependencies>
- <dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.17.0" conf="*->master" />
- <dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.17.0" conf="*->master" />
- <dependency org="org.apache.logging.log4j" name="log4j-slf4j-impl" rev="2.17.0" conf="*->master" />
- <dependency org="org.slf4j" name="slf4j-api" rev="1.7.32" conf="*->master" />
+ <dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.17.2" conf="*->master" />
+ <dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.17.2" conf="*->master" />
+ <dependency org="org.apache.logging.log4j" name="log4j-slf4j-impl" rev="2.17.2" conf="*->master" />
+ <dependency org="org.slf4j" name="slf4j-api" rev="1.7.36" conf="*->master" />
<dependency org="org.apache.commons" name="commons-lang3" rev="3.12.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.4" conf="*->master" />
@@ -50,7 +50,7 @@
<dependency org="com.tdunning" name="t-digest" rev="3.2" />
<!-- Hadoop Dependencies -->
- <dependency org="org.apache.hadoop" name="hadoop-common" rev="3.1.3" conf="*->default">
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="3.3.3" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -58,23 +58,23 @@
<exclude org="org.mortbay.jetty" name="jsp-*" />
<exclude org="ant" name="ant" />
</dependency>
- <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="3.1.3" conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="3.1.3" conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="3.1.3" conf="*->default" />
+ <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="3.3.3" conf="*->default" />
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="3.3.3" conf="*->default" />
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="3.3.3" conf="*->default" />
<!-- End of Hadoop Dependencies -->
<dependency org="org.apache.tika" name="tika-core" rev="2.3.0" />
<dependency org="xml-apis" name="xml-apis" rev="1.4.01" /><!-- force this version as it is required by Tika -->
- <dependency org="xerces" name="xercesImpl" rev="2.12.1" />
+ <dependency org="xerces" name="xercesImpl" rev="2.12.2" />
- <dependency org="com.ibm.icu" name="icu4j" rev="68.2" />
+ <dependency org="com.ibm.icu" name="icu4j" rev="71.1" />
- <dependency org="com.google.guava" name="guava" rev="30.1-jre" />
+ <dependency org="com.google.guava" name="guava" rev="31.1-jre" />
- <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.2" />
+ <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.3" />
- <dependency org="com.google.code.gson" name="gson" rev="2.8.9"/>
+ <dependency org="com.google.code.gson" name="gson" rev="2.9.0"/>
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
<exclude module="hadoop-client" />
</dependency>
@@ -84,10 +84,10 @@
<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.4.1" conf="*->default" />
<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.4.1" conf="*->default" />
<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.4.1" conf="test->default" />
- <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.12.0" conf="*->default" />
- <dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.12.0" conf="*->default" />
- <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.12.0" conf="*->default" />
- <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.12.0" conf="*->default" />
+ <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.13.3" conf="*->default" />
+ <dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.13.3" conf="*->default" />
+ <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.13.3" conf="*->default" />
+ <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.13.3" conf="*->default" />
<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.9" conf="*->default">
@@ -111,16 +111,12 @@
<artifact name="mrunit" ns0:classifier="hadoop2" />
<exclude org="log4j" module="log4j" />
</dependency>
- <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.26" conf="test->default" />
- <!-- web app dependencies -->
- <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26" />
+ <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.26" conf="test->default" />
+ <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26" conf="test->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->default" />
- <!-- RabbitMQ dependencies -->
- <dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />
-
<!--Added Because of Elasticsearch JEST client-->
<!--TODO refactor these to indexer-elastic-rest plugin somehow, currently doesn't resolve correctly-->
<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9" />
@@ -139,4 +135,4 @@
</dependencies>
-</ivy-module>
\ No newline at end of file
+</ivy-module>
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 15fedbf..0fce6b3 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -238,10 +238,11 @@
LongWritable oldGenTime = (LongWritable) crawlDatum.getMetaData()
.get(Nutch.WRITABLE_GENERATE_TIME_KEY);
if (oldGenTime != null) { // awaiting fetch & update
- if (oldGenTime.get() + genDelay > curTime) // still wait for
+ if (oldGenTime.get() + genDelay > curTime) { // still wait for
// update
context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1);
- return;
+ return;
+ }
}
float sort = 1.0f;
try {
diff --git a/src/java/org/apache/nutch/plugin/Extension.java b/src/java/org/apache/nutch/plugin/Extension.java
index 246e8ff..e949ea3 100644
--- a/src/java/org/apache/nutch/plugin/Extension.java
+++ b/src/java/org/apache/nutch/plugin/Extension.java
@@ -143,15 +143,15 @@
* Return an instance of the extension implementation. Before we create a
* extension instance we startup the plugin if it is not already done. The
* plugin instance and the extension instance use the same
- * {@link org.apache.nutch.plugin.PluginClassLoader}.
- * Each Plugin use its own classloader. The
- * {@link org.apache.nutch.plugin.PluginClassLoader} knows only its own
- * <i>plugin runtime libraries</i> defined
- * in the <code>plugin.xml</code> manifest file and exported libraries
- * of the dependent plugins.
+ * {@link org.apache.nutch.plugin.PluginClassLoader}. Each Plugin uses its own
+ * classloader. The {@link org.apache.nutch.plugin.PluginClassLoader} knows
+ * only its own <i>plugin runtime libraries</i> defined in the
+ * <code>plugin.xml</code> manifest file and exported libraries of the
+ * dependent plugins.
*
* @return Object An instance of the extension implementation
- * @throws PluginRuntimeException if there is a fatal runtime error
+ * @throws PluginRuntimeException
+ * if there is a fatal runtime error
*/
public Object getExtensionInstance() throws PluginRuntimeException {
// Must synchronize here to make sure creation and initialization
diff --git a/src/java/org/apache/nutch/plugin/Plugin.java b/src/java/org/apache/nutch/plugin/Plugin.java
index 314a866..306ada3 100644
--- a/src/java/org/apache/nutch/plugin/Plugin.java
+++ b/src/java/org/apache/nutch/plugin/Plugin.java
@@ -27,7 +27,7 @@
* provide a API and invoke one or a set of installed extensions.
*
* Each plugin may extend the base <code>Plugin</code>. <code>Plugin</code>
- * instances are used as the point of life cycle managemet of plugin related
+ * instances are used as the point of life cycle management of plugin related
* functionality.
*
* The <code>Plugin</code> will be started up and shutdown by the nutch plugin
diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java
index 3c55409..d80f971 100644
--- a/src/java/org/apache/nutch/plugin/PluginRepository.java
+++ b/src/java/org/apache/nutch/plugin/PluginRepository.java
@@ -38,11 +38,11 @@
import org.slf4j.LoggerFactory;
/**
- * <p>The plugin repositority is a registry of all plugins.</p>
+ * <p>The plugin repository is a registry of all plugins.</p>
*
- * <p>At system boot up a repositority is built by parsing the mainifest files of
+ * <p>At system boot up a repository is built by parsing the manifest files of
* all plugins. Plugins that require other plugins which do not exist are not
- * registed. For each plugin a plugin descriptor instance will be created. The
+ * registered. For each plugin a plugin descriptor instance will be created. The
* descriptor represents all meta information about a plugin. So a plugin
* instance will be created later when it is required, this allow lazy plugin
* loading.</p>
@@ -64,8 +64,7 @@
private HashMap<String, Plugin> fActivatedPlugins;
- @SuppressWarnings("rawtypes")
- private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<>();
+ private static final Map<String, Map<PluginClassLoader, Class<?>>> CLASS_CACHE = new HashMap<>();
private Configuration conf;
@@ -267,14 +266,14 @@
}
/**
- * <p>Returns a instance of a plugin. Plugin instances are cached. So a plugin
- * exist only as one instance. This allow a central management of plugin own
+ * <p>Returns an instance of a plugin. Plugin instances are cached. So a plugin
+ * exists only as one instance. This allows central management of a plugin's own
* resources.</p>
*
* <p>After creating the plugin instance the startUp() method is invoked. The
* plugin use a own classloader that is used as well by all instance of
* extensions of the same plugin. This class loader use all exported libraries
- * from the dependend plugins and all plugin libraries.</p>
+ * from the dependent plugins and all plugin libraries.</p>
*
* @param pDescriptor a {@link PluginDescriptor} for which to retrieve a
* {@link Plugin} instance
@@ -337,16 +336,15 @@
}
}
- @SuppressWarnings("rawtypes")
- public static Class getCachedClass(PluginDescriptor pDescriptor, String className)
+ public Class<?> getCachedClass(PluginDescriptor pDescriptor, String className)
throws ClassNotFoundException {
- Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
+ Map<PluginClassLoader, Class<?>> descMap = CLASS_CACHE.get(className);
if (descMap == null) {
descMap = new HashMap<>();
CLASS_CACHE.put(className, descMap);
}
PluginClassLoader loader = pDescriptor.getClassLoader();
- Class clazz = descMap.get(loader);
+ Class<?> clazz = descMap.get(loader);
if (clazz == null) {
clazz = loader.loadClass(className);
descMap.put(loader, clazz);
@@ -543,8 +541,8 @@
/**
* Registers this PluginRepository to be invoked whenever URLs have to be
- * parsed. This allows to check the registered protocol plugins for uncommon
- * protocols.
+ * parsed. This allows checking the registered protocol plugins for custom
+ * protocols not covered by standard {@link URLStreamHandler}s of the JVM.
*/
private void registerURLStreamHandlerFactory() {
org.apache.nutch.plugin.URLStreamHandlerFactory.getInstance().registerPluginRepository(this);
diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
index 5aed76a..bd7e377 100644
--- a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
+++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
@@ -20,6 +20,9 @@
import java.net.URL;
import java.net.URLStreamHandler;
import java.util.ArrayList;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -35,80 +38,126 @@
*/
public class URLStreamHandlerFactory
implements java.net.URLStreamHandlerFactory {
-
+
protected static final Logger LOG = LoggerFactory
.getLogger(URLStreamHandlerFactory.class);
-
+
/** The singleton instance. */
private static URLStreamHandlerFactory instance;
-
- /** Here we register all PluginRepositories.
- * In this class we do not know why several instances of PluginRepository
- * are kept, nor do we know how long they will be used. To prevent
- * a memory leak, this class must not keep references to PluginRepository
- * but use WeakReference which allows PluginRepository to still be
- * garbage collected. The prize is we need to clean the list for
- * outdated references which is done in the {@link #removeInvalidRefs()} method.
+
+ /**
+ * Here we register all PluginRepositories. In this class we do not know why
+ * several instances of PluginRepository are kept, nor do we know how long
+ * they will be used. To prevent a memory leak, this class must not keep
+ * references to PluginRepository but use WeakReference which allows
+ * PluginRepository to still be garbage collected. The price is we need to
+ * clean the list for outdated references which is done in the
+ * {@link #removeInvalidRefs()} method.
*/
private ArrayList<WeakReference<PluginRepository>> prs;
-
+
+ /**
+ * Cache of URLStreamHandlers for each protocol supported by
+ * <ul>
+ * <li>one of the registered and active plugins</li>
+ * <li>or by the JVM</li>
+ * </ul>
+ * Using the cache avoids creating {@link URLStreamHandler} instances multiple
+ * times. The cache is also pre-populated with protocols which must be handled
+ * by the JVM, see {@link #SYSTEM_PROTOCOLS}.
+ */
+ private Map<String, Optional<URLStreamHandler>> cache;
+
+ /**
+ * Protocols covered by standard JVM URL handlers. These protocols must not be
+ * handled by Nutch plugins, so that basic actions (e.g. loading of classes
+ * and configuration files) do not break.
+ */
+ public static final String[] SYSTEM_PROTOCOLS = { //
+ "http", "https", "file", "jar" };
+
static {
instance = new URLStreamHandlerFactory();
URL.setURLStreamHandlerFactory(instance);
LOG.debug("Registered URLStreamHandlerFactory with the JVM.");
}
-
+
private URLStreamHandlerFactory() {
this.prs = new ArrayList<>();
+ initCache();
+ }
+
+ /** Reset and initialize cache (protocol -> URLStreamHandler) */
+ private synchronized void initCache() {
+ cache = new ConcurrentHashMap<>();
+ // pre-populate cache with protocols to be handled by the JVM
+ for (String protocol : SYSTEM_PROTOCOLS) {
+ cache.put(protocol, Optional.empty());
+ }
}
/**
* Get the singleton instance of this class.
- * @return a {@link org.apache.nutch.plugin.URLStreamHandlerFactory} instance
+ * @return a {@link org.apache.nutch.plugin.URLStreamHandlerFactory} instance
*/
public static URLStreamHandlerFactory getInstance() {
return instance;
}
-
+
/** Use this method once a new PluginRepository was created to register it.
*
* @param pr The PluginRepository to be registered.
*/
public void registerPluginRepository(PluginRepository pr) {
this.prs.add(new WeakReference<PluginRepository>(pr));
-
+
+ // reset the cache, so that the new PluginRepository is used from now on
+ initCache();
+
removeInvalidRefs();
}
@Override
public URLStreamHandler createURLStreamHandler(String protocol) {
+
+ if (cache.containsKey(protocol)) {
+ // use the cached handler, including "null" for standard
+ // handlers implemented by the JVM
+ return cache.get(protocol).orElse(null);
+ }
+
LOG.debug("Creating URLStreamHandler for protocol: {}", protocol);
-
+
removeInvalidRefs();
-
+
// find the 'correct' PluginRepository. For now we simply take the first.
// then ask it to return the URLStreamHandler
- for(WeakReference<PluginRepository> ref: this.prs) {
+ for (WeakReference<PluginRepository> ref : this.prs) {
PluginRepository pr = ref.get();
- if(pr != null) {
+ if (pr != null) {
// found PluginRepository. Let's get the URLStreamHandler...
- return pr.createURLStreamHandler(protocol);
+ URLStreamHandler handler = pr.createURLStreamHandler(protocol);
+ cache.put(protocol, Optional.of(handler));
+ return handler;
}
}
+
+ cache.put(protocol, Optional.empty());
return null;
}
- /** Maintains the list of PluginRepositories by
- * removing the references whose referents have been
- * garbage collected meanwhile.
+ /**
+ * Maintains the list of PluginRepositories by removing the references whose
+ * referents have been garbage collected meanwhile.
*/
private void removeInvalidRefs() {
ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(this.prs);
- for(WeakReference<PluginRepository> ref: copy) {
- if(ref.get() == null) {
+ for (WeakReference<PluginRepository> ref : copy) {
+ if (ref.get() == null) {
this.prs.remove(ref);
}
}
- LOG.debug("Removed '{}' invalid references. '{}' remaining.", copy.size()-this.prs.size(), this.prs.size());
+ LOG.debug("Removed '{}' invalid references. '{}' remaining.",
+ copy.size() - this.prs.size(), this.prs.size());
}
}
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index 4fa6f71..2eda5a6 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -36,12 +36,11 @@
</publications>
<dependencies>
- <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
- <!-- Exlude due to classpath issues -->
- <exclude org="org.apache.httpcomponents" name="httpclient" />
- <exclude org="org.apache.httpcomponents" name="httpcore" />
- <exclude org="commons-codec" name="commons-codec" />
- <exclude org="commons-logging" name="commons-logging" />
+ <dependency org="com.maxmind.geoip2" name="geoip2" rev="3.0.1">
+ <!-- Exclude libs provided in Nutch core -->
+ <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
+ <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+ <exclude org="com.fasterxml.jackson.core" name="jackson-core" />
</dependency>
</dependencies>
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index 6148f59..c4efadf 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -25,11 +25,8 @@
<library name="index-geoip.jar">
<export name="*"/>
</library>
- <library name="geoip2-2.12.0.jar"/>
- <library name="jackson-annotations-2.9.5.jar"/>
- <library name="jackson-core-2.9.5.jar"/>
- <library name="jackson-databind-2.9.5.jar"/>
- <library name="maxmind-db-1.2.2.jar"/>
+ <library name="geoip2-3.0.1.jar"/>
+ <library name="maxmind-db-2.0.0.jar"/>
</runtime>
<requires>
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 1c697a2..64b3862 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -17,13 +17,17 @@
package org.apache.nutch.indexer.geoip;
import java.io.IOException;
+import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.net.UnknownHostException;
import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.AddressNotFoundException;
import com.maxmind.geoip2.exception.GeoIp2Exception;
import com.maxmind.geoip2.model.InsightsResponse;
import com.maxmind.geoip2.model.CityResponse;
@@ -54,6 +58,9 @@
*/
public class GeoIPDocumentCreator {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
/**
* Add field to document but only if value isn't null
* @param doc the {@link NutchDocument} to augment
@@ -61,21 +68,7 @@
* @param value the String value to associate with the target field
*/
public static void addIfNotNull(NutchDocument doc, String name,
- String value) {
- if (value != null) {
- doc.add(name, value);
- }
- }
-
- /**
- * Add field to document but only if value isn't null
- * @param doc the {@link NutchDocument} to augment
- * @param name the name of the target field
- * @param value the {@link java.lang.Integer} value to
- * associate with the target field
- */
- public static void addIfNotNull(NutchDocument doc, String name,
- Integer value) {
+ Object value) {
if (value != null) {
doc.add(name, value);
}
@@ -87,7 +80,6 @@
addIfNotNull(doc, "ip", serverIp);
InsightsResponse response = client
.insights(InetAddress.getByName(serverIp));
- // CityResponse response = client.city(InetAddress.getByName(serverIp));
City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -103,7 +95,7 @@
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
- addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -121,7 +113,7 @@
Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
- addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
@@ -169,7 +161,13 @@
public static NutchDocument createDocFromDomainDb(String serverIp,
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
- DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+ DomainResponse response;
+ try {
+ response = reader.domain(InetAddress.getByName(serverIp));
+ } catch (AddressNotFoundException e) {
+ LOG.debug("IP address not found: {}", serverIp);
+ return doc;
+ }
addIfNotNull(doc, "ip", serverIp);
addIfNotNull(doc, "domain", response.getDomain());
return doc;
@@ -189,7 +187,14 @@
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
addIfNotNull(doc, "ip", serverIp);
- CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+ CityResponse response;
+ try {
+ response = reader.city(InetAddress.getByName(serverIp));
+ } catch (AddressNotFoundException e) {
+ LOG.debug("IP address not found: {}", serverIp);
+ return doc;
+ }
City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
@@ -206,7 +211,7 @@
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
- addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
+ addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());
Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
@@ -224,7 +229,7 @@
Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
- addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
return doc;
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 4e21273..ea30b8c 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -87,7 +87,8 @@
* 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
* Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
* GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
- * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ * and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`.
+ * Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb) can be used.
* </description>
* </property>
*
@@ -152,24 +153,29 @@
conf.getInt("index.geoip.userid", 12345),
conf.get("index.geoip.licensekey")).build();
} else {
- String db = null;
+ String dbSuffix = null;
if (usage.equalsIgnoreCase("cityDatabase")) {
- db = "GeoIP2-City.mmdb";
+ dbSuffix = "-City.mmdb";
} else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
- db = "GeoIP2-Connection-Type.mmdb";
+ dbSuffix = "-Connection-Type.mmdb";
} else if (usage.equalsIgnoreCase("domainDatabase")) {
- db = "GeoIP2-Domain.mmdb";
+ dbSuffix = "-Domain.mmdb";
} else if (usage.equalsIgnoreCase("ispDatabase")) {
- db = "GeoIP2-ISP.mmdb";
+ dbSuffix = "-ISP.mmdb";
}
- URL dbFileUrl = conf.getResource(db);
- if (dbFileUrl == null) {
- LOG.error("GeoDb file {} not found on classpath", db);
- } else {
- try {
- buildDb(new File(dbFileUrl.getFile()));
- } catch (Exception e) {
- LOG.error("Failed to read geoDb file {}: ", db, e);
+ String[] dbPrefixes = {"GeoIP2", "GeoLite2"};
+ for (String dbPrefix : dbPrefixes) {
+ String db = dbPrefix + dbSuffix;
+ URL dbFileUrl = conf.getResource(db);
+ if (dbFileUrl == null) {
+ LOG.error("GeoDb file {} not found on classpath", db);
+ } else {
+ try {
+ LOG.info("Reading GeoDb file {}", db);
+ buildDb(new File(dbFileUrl.getFile()));
+ } catch (Exception e) {
+ LOG.error("Failed to read geoDb file {}: ", db, e);
+ }
}
}
}
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index 7885a52..053bfd6 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -25,14 +25,20 @@
import java.util.Map;
import java.util.concurrent.TimeUnit;
+import javax.net.ssl.SSLContext;
+
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
+import org.apache.http.conn.ssl.NoopHostnameVerifier;
+import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder;
+import org.apache.http.ssl.SSLContextBuilder;
+import org.apache.http.ssl.SSLContexts;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.IndexWriterParams;
import org.apache.nutch.indexer.NutchDocument;
@@ -181,6 +187,7 @@
hostsList[i++] = new HttpHost(host, port, scheme);
}
RestClientBuilder restClientBuilder = RestClient.builder(hostsList);
+
if (auth) {
restClientBuilder
.setHttpClientConfigCallback(new HttpClientConfigCallback() {
@@ -191,6 +198,28 @@
}
});
}
+
+ // In case of HTTPS, configure the client to accept self-signed
+ // certificates and to skip hostname verification
+ if ("https".equals(scheme)) {
+ try {
+ SSLContextBuilder sslBuilder = SSLContexts.custom();
+ sslBuilder.loadTrustMaterial(null, new TrustSelfSignedStrategy());
+ final SSLContext sslContext = sslBuilder.build();
+
+ restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback() {
+ @Override
+ public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
+ // ignore issues with self-signed certificates
+ httpClientBuilder.setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE);
+ return httpClientBuilder.setSSLContext(sslContext);
+ }
+ });
+ } catch (Exception e) {
+ LOG.error("Error setting up SSLContext because: " + e.getMessage(), e);
+ }
+ }
+
client = new RestHighLevelClient(restClientBuilder);
} else {
throw new IOException(
@@ -344,4 +373,4 @@
public Configuration getConf() {
return config;
}
-}
\ No newline at end of file
+}
diff --git a/src/plugin/indexer-solr/schema.xml b/src/plugin/indexer-solr/schema.xml
index 6865eb0..ba71fe1 100644
--- a/src/plugin/indexer-solr/schema.xml
+++ b/src/plugin/indexer-solr/schema.xml
@@ -356,7 +356,7 @@
<field name="cityGeoNameId" type="int" stored="true" indexed="true" />
<field name="continentCode" type="string" stored="true" indexed="true" />
<field name="continentGeoNameId" type="int" stored="true" indexed="true" />
- <field name="contentName" type="string" stored="true" indexed="true" />
+ <field name="continentName" type="string" stored="true" indexed="true" />
<field name="countryIsoCode" type="string" stored="true" indexed="true"/>
<field name="countryName" type="string" stored="true" indexed="true" />
<field name="countryConfidence" type="int" stored="true" indexed="true"/>
@@ -379,7 +379,6 @@
<field name="org" type="string" stored="true" indexed="true" />
<field name="userType" type="string" stored="true" indexed="true" />
<field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
- <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" />
<field name="connType" type="string" stored="true" indexed="true" />
<field name="location" type="location" stored="true" indexed="true" />
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 9fbcda7..63fa328 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -90,21 +90,6 @@
}
} };
- private static final SSLContext trustAllSslContext;
-
- static {
- try {
- trustAllSslContext = SSLContext.getInstance("SSL");
- trustAllSslContext.init(null, trustAllCerts,
- new java.security.SecureRandom());
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- private static final SSLSocketFactory trustAllSslSocketFactory = trustAllSslContext
- .getSocketFactory();
-
public OkHttp() {
super(LOG);
}
@@ -129,8 +114,18 @@
.readTimeout(this.timeout, TimeUnit.MILLISECONDS);
if (!this.tlsCheckCertificate) {
- builder.sslSocketFactory(trustAllSslSocketFactory,
- (X509TrustManager) trustAllCerts[0]);
+ try {
+ SSLContext trustAllSslContext = SSLContext.getInstance("TLS");
+ trustAllSslContext.init(null, trustAllCerts, null);
+ SSLSocketFactory trustAllSslSocketFactory = trustAllSslContext
+ .getSocketFactory();
+ builder.sslSocketFactory(trustAllSslSocketFactory,
+ (X509TrustManager) trustAllCerts[0]);
+ } catch (Exception e) {
+ LOG.error(
+ "Failed to disable TLS certificate verification (property http.tls.certificates.check)",
+ e);
+ }
builder.hostnameVerifier(new HostnameVerifier() {
@Override
public boolean verify(String hostname, SSLSession session) {
diff --git a/src/plugin/publish-rabbitmq/ivy.xml b/src/plugin/publish-rabbitmq/ivy.xml
index dd450cf..7b5e3dd 100644
--- a/src/plugin/publish-rabbitmq/ivy.xml
+++ b/src/plugin/publish-rabbitmq/ivy.xml
@@ -34,5 +34,5 @@
<!--get the artifact from our module name-->
<artifact conf="master"/>
</publications>
-
+
</ivy-module>