Fix for NUTCH-2649: Optionally skip TLS/SSL certificate validation for protocol-htmlunit, protocol-interactiveselenium and protocol-selenium
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index e76bc04..cf47f9d 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -30,8 +30,10 @@
import java.util.HashSet;
import java.util.Set;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
@@ -40,6 +42,7 @@
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.httpclient.DummyX509TrustManager;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;
@@ -129,10 +132,20 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
@@ -199,8 +212,8 @@
reqStr.append("\r\n");
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -216,9 +229,8 @@
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(
- new BufferedInputStream(socket.getInputStream(),
- Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
@@ -227,7 +239,8 @@
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -243,27 +256,30 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with HtmlUnit only if content type in HTML or XHTML
+ // handle with HtmlUnit only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readContentFromHtmlUnit(url);
} else {
String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
- if (transferEncoding != null && "chunked"
- .equalsIgnoreCase(transferEncoding.trim())) {
+ if (transferEncoding != null
+ && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
readChunkedContent(in, line);
} else {
readPlainContent(in);
}
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
- if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+ if ("gzip".equals(contentEncoding)
+ || "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
} else if ("deflate".equals(contentEncoding)) {
content = http.processDeflateEncoded(content, url);
} else {
if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+ Http.LOG
+ .trace("fetched " + content.length + " bytes from " + url);
}
}
}
@@ -272,6 +288,8 @@
}
}
+ } catch (java.security.GeneralSecurityException e) { // only TLS setup can fail this way; I/O and protocol errors must still propagate
+ throw new ProtocolException("Failed to set up TLS/SSL context for " + url, e);
} finally {
if (socket != null)
socket.close();
@@ -313,7 +331,7 @@
String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
content = page.getBytes("UTF-8");
}
-
+
private void readPlainContent(InputStream in)
throws HttpException, IOException {
@@ -328,8 +346,7 @@
throw new HttpException("bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http
- .getMaxContent()) // limit
+ if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit
// download
// size
contentLength = http.getMaxContent();
@@ -408,17 +425,17 @@
break;
}
- if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
- .getMaxContent())
+ if (http.getMaxContent() >= 0
+ && (contentBytesRead + chunkLen) > http.getMaxContent())
chunkLen = http.getMaxContent() - contentBytesRead;
// read one chunk
int chunkBytesRead = 0;
while (chunkBytesRead < chunkLen) {
- int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
- (chunkLen - chunkBytesRead) :
- Http.BUFFER_SIZE;
+ int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE
+ ? (chunkLen - chunkBytesRead)
+ : Http.BUFFER_SIZE;
int len = in.read(bytes, 0, toRead);
if (len == -1)
@@ -510,9 +527,9 @@
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
- (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
- != -1)) {
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
+ || ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
@@ -570,4 +587,4 @@
return value;
}
-}
+}
\ No newline at end of file
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
index 8ebd898..a2b0b0c 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -30,8 +30,10 @@
import java.util.HashSet;
import java.util.Set;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -42,6 +44,7 @@
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.DummyX509TrustManager;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.openqa.selenium.WebDriver;
@@ -65,10 +68,12 @@
protected enum Scheme {
HTTP, HTTPS,
}
+
/** The nutch configuration */
private Configuration conf = null;
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+ public HttpResponse(Http http, URL url, CrawlDatum datum)
+ throws ProtocolException, IOException {
this.conf = http.getConf();
this.http = http;
@@ -122,33 +127,43 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
+ Arrays.asList(sslsocket.getSupportedProtocols()));
Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
// Intersect with preferred protocols and ciphers
protocols.retainAll(http.getTlsPreferredProtocols());
ciphers.retainAll(http.getTlsPreferredCipherSuites());
sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
+ protocols.toArray(new String[protocols.size()]));
sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
+ ciphers.toArray(new String[ciphers.size()]));
sslsocket.startHandshake();
socket = sslsocket;
}
if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
+ && conf.getBoolean("store.ip.address", false) == true) {
headers.add("_ip_", sockAddr.getAddress().getHostAddress());
}
// make request
@@ -203,17 +218,17 @@
}
if (http.isCookieEnabled()
- && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+ && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
- .toString();
+ .toString();
reqStr.append("Cookie: ");
reqStr.append(cookie);
reqStr.append("\r\n");
}
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -223,25 +238,24 @@
headers.add("_request_", reqStr.toString());
}
-
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
-
// store the http headers verbatim
if (conf.getBoolean("store.http.headers", false) == true) {
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -257,9 +271,10 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with Selenium only if content type in HTML or XHTML
+ // handle with Selenium only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readPlainContent(url);
} else {
try {
@@ -269,11 +284,13 @@
try {
contentLength = Integer.parseInt(contentLengthString.trim());
} catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
+ throw new HttpException(
+ "bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+ if (http.getMaxContent() >= 0
+ && contentLength > http.getMaxContent()) {
contentLength = http.getMaxContent();
}
@@ -299,17 +316,20 @@
}
}
}
- }
+ }
+ } catch (java.security.GeneralSecurityException e) { // only TLS setup can fail this way; I/O and protocol errors must still propagate
+ throw new ProtocolException("Failed to set up TLS/SSL context for " + url, e);
} finally {
if (socket != null)
socket.close();
}
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
public URL getUrl() {
return url;
@@ -331,56 +351,63 @@
return content;
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
private void loadSeleniumHandlers() {
- if (handlers != null) return;
+ if (handlers != null)
+ return;
- String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler");
+ String handlerConfig = this.conf.get("interactiveselenium.handlers",
+ "DefaultHandler");
String[] handlerNames = handlerConfig.split(",");
handlers = new InteractiveSeleniumHandler[handlerNames.length];
for (int i = 0; i < handlerNames.length; i++) {
+ try {
+ String classToLoad = this.getClass().getPackage().getName()
+ + ".handlers." + handlerNames[i];
try {
- String classToLoad = this.getClass().getPackage().getName() + ".handlers." + handlerNames[i];
- try {
- handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).getConstructor().newInstance());
- } catch (IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) {
- e.printStackTrace();
- }
- Http.LOG.info("Successfully loaded " + classToLoad);
- } catch (ClassNotFoundException e) {
- Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]);
- } catch (InstantiationException e) {
- Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]);
- } catch (IllegalAccessException e) {
- Http.LOG.info("Illegal access with Handler: " + handlerNames[i]);
+ handlers[i] = InteractiveSeleniumHandler.class
+ .cast(Class.forName(classToLoad).getConstructor().newInstance());
+ } catch (IllegalArgumentException | InvocationTargetException
+ | NoSuchMethodException | SecurityException e) {
+ e.printStackTrace();
}
+ Http.LOG.info("Successfully loaded " + classToLoad);
+ } catch (ClassNotFoundException e) {
+ Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]);
+ } catch (InstantiationException e) {
+ Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]);
+ } catch (IllegalAccessException e) {
+ Http.LOG.info("Illegal access with Handler: " + handlerNames[i]);
+ }
}
}
private void readPlainContent(URL url) throws IOException {
if (handlers == null)
- loadSeleniumHandlers();
+ loadSeleniumHandlers();
String processedPage = "";
for (InteractiveSeleniumHandler handler : this.handlers) {
- if (! handler.shouldProcessURL(url.toString())) {
- continue;
- }
+ if (!handler.shouldProcessURL(url.toString())) {
+ continue;
+ }
- WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
+ WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
- processedPage += handler.processDriver(driver);
+ processedPage += handler.processDriver(driver);
- HttpWebClient.cleanUpDriver(driver);
+ HttpWebClient.cleanUpDriver(driver);
}
content = processedPage.getBytes("UTF-8");
}
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
@@ -395,13 +422,15 @@
try {
code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ throw new HttpException(
+ "bad status line '" + line + "': " + e.getMessage(), e);
}
return code;
}
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+ private void processHeaderLine(StringBuffer line)
+ throws IOException, HttpException {
int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
@@ -427,24 +456,26 @@
}
// Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private void parseHeaders(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
|| ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -457,8 +488,8 @@
}
}
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
- throws IOException {
+ private static int readLine(PushbackInputStream in, StringBuffer line,
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
@@ -491,4 +522,4 @@
in.unread(value);
return value;
}
-}
+}
\ No newline at end of file
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 4a20b04..3f7bc5b 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -29,9 +29,10 @@
import java.util.HashSet;
import java.util.Set;
-
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -42,6 +43,7 @@
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.DummyX509TrustManager;
import org.apache.nutch.protocol.http.api.HttpBase;
/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
@@ -61,10 +63,12 @@
protected enum Scheme {
HTTP, HTTPS,
}
+
/** The nutch configuration */
private Configuration conf = null;
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+ public HttpResponse(Http http, URL url, CrawlDatum datum)
+ throws ProtocolException, IOException {
this.conf = http.getConf();
this.http = http;
@@ -118,33 +122,43 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
+ Arrays.asList(sslsocket.getSupportedProtocols()));
Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
// Intersect with preferred protocols and ciphers
protocols.retainAll(http.getTlsPreferredProtocols());
ciphers.retainAll(http.getTlsPreferredCipherSuites());
sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
+ protocols.toArray(new String[protocols.size()]));
sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
+ ciphers.toArray(new String[ciphers.size()]));
sslsocket.startHandshake();
socket = sslsocket;
}
if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
+ && conf.getBoolean("store.ip.address", false) == true) {
headers.add("_ip_", sockAddr.getAddress().getHostAddress());
}
// make request
@@ -199,17 +213,17 @@
}
if (http.isCookieEnabled()
- && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+ && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
- .toString();
+ .toString();
reqStr.append("Cookie: ");
reqStr.append(cookie);
reqStr.append("\r\n");
}
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -219,25 +233,24 @@
headers.add("_request_", reqStr.toString());
}
-
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
-
// store the http headers verbatim
if (conf.getBoolean("store.http.headers", false) == true) {
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -253,9 +266,10 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with Selenium only if content type in HTML or XHTML
+ // handle with Selenium only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readPlainContent(url);
} else {
try {
@@ -265,11 +279,13 @@
try {
contentLength = Integer.parseInt(contentLengthString.trim());
} catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
+ throw new HttpException(
+ "bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+ if (http.getMaxContent() >= 0
+ && contentLength > http.getMaxContent()) {
contentLength = http.getMaxContent();
}
@@ -298,17 +314,20 @@
if (httpHeaders != null) {
headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
}
- }
+ }
+ } catch (java.security.GeneralSecurityException e) { // only TLS setup can fail this way; I/O and protocol errors must still propagate
+ throw new ProtocolException("Failed to set up TLS/SSL context for " + url, e);
} finally {
if (socket != null)
socket.close();
}
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
public URL getUrl() {
return url;
@@ -330,9 +349,10 @@
return content;
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
private void readPlainContent(URL url) throws IOException {
String page = HttpWebClient.getHtmlPage(url.toString(), conf);
@@ -340,7 +360,8 @@
content = page.getBytes("UTF-8");
}
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
@@ -355,13 +376,15 @@
try {
code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ throw new HttpException(
+ "bad status line '" + line + "': " + e.getMessage(), e);
}
return code;
}
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+ private void processHeaderLine(StringBuffer line)
+ throws IOException, HttpException {
int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
@@ -387,24 +410,26 @@
}
// Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private void parseHeaders(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
|| ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -417,8 +442,8 @@
}
}
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
- throws IOException {
+ private static int readLine(PushbackInputStream in, StringBuffer line,
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
@@ -451,4 +476,4 @@
in.unread(value);
return value;
}
-}
+}
\ No newline at end of file