Merge pull request #495 from sebastian-nagel/NUTCH-2672-build-docs-use-https
NUTCH-2762 Replace http:// URLs by https:// (build files and documentation)
diff --git a/src/bin/crawl b/src/bin/crawl
index 81d30cc..56bb237 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -34,8 +34,9 @@
# --hostdbupdate Boolean indicator if we call hostdbupdate or not
# --hostdbgenerate Boolean indicator if we use hostdb in generate or not
#
-# --num-slaves <num_slaves> Number of slave nodes [default: 1]
-# Note: This can only be set when running in distribution mode
+# --num-fetchers <num_fetchers> Number of tasks used for fetching (fetcher map tasks) [default: 1]
+# Note: This can only be set when running in distributed mode and
+# should correspond to the number of worker nodes in the cluster.
# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
# --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000]
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
@@ -83,10 +84,11 @@
echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]"
echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
- echo -e " --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
+ echo -e " --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
echo -e " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
- echo -e " --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
- echo -e " \t\t\t\t\tNote: This can only be set when running in distribution mode"
+ echo -e " --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]"
+ echo -e " \t\t\t\t\tNote: This can only be set when running in distributed mode and"
+ echo -e " \t\t\t\t\t should correspond to the number of worker nodes in the cluster."
echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
echo -e " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
echo -e " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
@@ -107,8 +109,8 @@
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
-NUM_SLAVES=1
-NUM_TASKS=2 # 2 x NUM_SLAVES
+NUM_FETCHERS=1
+NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
@@ -138,7 +140,12 @@
shift 2
;;
--num-slaves)
- NUM_SLAVES="${2}"
+ # backward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
+ NUM_FETCHERS="${2}"
+ shift 2
+ ;;
+ --num-fetchers)
+ NUM_FETCHERS="${2}"
shift 2
;;
--num-tasks)
@@ -203,7 +210,10 @@
mode=distributed
fi
if [[ "$mode" = "local" ]]; then
- NUM_SLAVES=1
+ if [[ "$NUM_FETCHERS" -ne 1 ]]; then # warn only if a value other than 1 was configured
+ echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
+ fi
+ NUM_FETCHERS=1 # local mode always uses exactly one fetcher task
fi
# note that some of the options listed here could be set in the
@@ -296,9 +306,9 @@
echo "Generating a new segment"
if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
else
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
fi
echo "$bin/nutch generate ${generate_args[@]}"
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index be56377..3d4f9c5 100644
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -42,7 +42,7 @@
public class MetadataIndexer implements IndexingFilter {
private Configuration conf;
private String[] dbFieldnames;
- private Map<String, String> parseFieldnames;
+ private String[] parseFieldnames;
private String[] contentFieldnames;
private String separator;
private Set<String> mvFields;
@@ -70,10 +70,10 @@
// add the fields from parsemd
if (parseFieldnames != null) {
- for (String metatag : parseFieldnames.keySet()) {
+ for (String metatag : parseFieldnames) {
for (String value : parse.getData().getParseMeta().getValues(metatag)) {
if (value != null)
- add(doc, parseFieldnames.get(metatag), value);
+ add(doc, metatag, value);
}
}
}
@@ -111,14 +111,12 @@
public void setConf(Configuration conf) {
this.conf = conf;
dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
- parseFieldnames = new HashMap<String, String>();
- for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) {
- parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
- }
+ parseFieldnames = conf.getStrings(parse_CONF_PROPERTY);
contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
separator = conf.get(separator_CONF_PROPERTY, null);
- mvFields = new HashSet(Arrays.asList(conf.getStrings(mvfields_CONF_PROPERTY, new String[0])));
+ mvFields = new HashSet<>(
+ Arrays.asList(conf.getStrings(mvfields_CONF_PROPERTY, new String[0])));
// TODO check conflict between field names e.g. could have same label
// from different sources
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java
new file mode 100644
index 0000000..6092e78
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+/**
+ * Trust manager that accepts all certificates unchecked; based on EasyX509TrustManager from commons-httpclient.
+ */
+public class DummyX509TrustManager implements X509TrustManager {
+ private X509TrustManager standardTrustManager = null;
+
+ /**
+ * Builds the default-algorithm trust manager for {@code keystore}; only {@link #getAcceptedIssuers()} uses it.
+ */
+ public DummyX509TrustManager(KeyStore keystore)
+ throws NoSuchAlgorithmException, KeyStoreException {
+ super();
+ String algo = TrustManagerFactory.getDefaultAlgorithm();
+ TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+ factory.init(keystore);
+ TrustManager[] trustmanagers = factory.getTrustManagers();
+ if (trustmanagers.length == 0) {
+ throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+ }
+ this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+ }
+
+ /**
+ * Always returns {@code true}; the given certificates are not inspected.
+ * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], String)
+ */
+ public boolean isClientTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * Always returns {@code true}; the given certificates are not inspected.
+ * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], String)
+ */
+ public boolean isServerTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * Delegates to the standard trust manager. See {@link javax.net.ssl.X509TrustManager#getAcceptedIssuers()}.
+ */
+ public X509Certificate[] getAcceptedIssuers() {
+ return this.standardTrustManager.getAcceptedIssuers();
+ }
+
+ public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing: no exception is thrown, so any client certificate chain is accepted
+
+ }
+
+ public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing: no exception is thrown, so any server certificate chain is accepted
+
+ }
+}
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index e76bc04..ced2e0f 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -26,12 +26,17 @@
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
@@ -129,10 +134,20 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
@@ -199,8 +214,8 @@
reqStr.append("\r\n");
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -216,9 +231,8 @@
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(
- new BufferedInputStream(socket.getInputStream(),
- Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
@@ -227,7 +241,8 @@
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -243,27 +258,30 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with HtmlUnit only if content type in HTML or XHTML
+ // handle with HtmlUnit only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readContentFromHtmlUnit(url);
} else {
String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
- if (transferEncoding != null && "chunked"
- .equalsIgnoreCase(transferEncoding.trim())) {
+ if (transferEncoding != null
+ && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
readChunkedContent(in, line);
} else {
readPlainContent(in);
}
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
- if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+ if ("gzip".equals(contentEncoding)
+ || "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
} else if ("deflate".equals(contentEncoding)) {
content = http.processDeflateEncoded(content, url);
} else {
if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+ Http.LOG
+ .trace("fetched " + content.length + " bytes from " + url);
}
}
}
@@ -272,6 +290,8 @@
}
}
+ }catch(KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
+ throw new ProtocolException(e);
} finally {
if (socket != null)
socket.close();
@@ -313,7 +333,7 @@
String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
content = page.getBytes("UTF-8");
}
-
+
private void readPlainContent(InputStream in)
throws HttpException, IOException {
@@ -328,8 +348,7 @@
throw new HttpException("bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http
- .getMaxContent()) // limit
+ if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit
// download
// size
contentLength = http.getMaxContent();
@@ -408,17 +427,17 @@
break;
}
- if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
- .getMaxContent())
+ if (http.getMaxContent() >= 0
+ && (contentBytesRead + chunkLen) > http.getMaxContent())
chunkLen = http.getMaxContent() - contentBytesRead;
// read one chunk
int chunkBytesRead = 0;
while (chunkBytesRead < chunkLen) {
- int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
- (chunkLen - chunkBytesRead) :
- Http.BUFFER_SIZE;
+ int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE
+ ? (chunkLen - chunkBytesRead)
+ : Http.BUFFER_SIZE;
int len = in.read(bytes, 0, toRead);
if (len == -1)
@@ -510,9 +529,9 @@
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
- (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
- != -1)) {
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
+ || ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
@@ -570,4 +589,4 @@
return value;
}
-}
+}
\ No newline at end of file
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java
new file mode 100644
index 0000000..ec1354f
--- /dev/null
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+/**
+ * Trust manager that accepts all certificates unchecked; based on EasyX509TrustManager from commons-httpclient.
+ */
+public class DummyX509TrustManager implements X509TrustManager {
+ private X509TrustManager standardTrustManager = null;
+
+ /**
+ * Builds the default-algorithm trust manager for {@code keystore}; only {@link #getAcceptedIssuers()} uses it.
+ */
+ public DummyX509TrustManager(KeyStore keystore)
+ throws NoSuchAlgorithmException, KeyStoreException {
+ super();
+ String algo = TrustManagerFactory.getDefaultAlgorithm();
+ TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+ factory.init(keystore);
+ TrustManager[] trustmanagers = factory.getTrustManagers();
+ if (trustmanagers.length == 0) {
+ throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+ }
+ this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+ }
+
+ /**
+ * Always returns {@code true}; the given certificates are not inspected.
+ * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], String)
+ */
+ public boolean isClientTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * Always returns {@code true}; the given certificates are not inspected.
+ * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], String)
+ */
+ public boolean isServerTrusted(X509Certificate[] certificates) {
+ return true;
+ }
+
+ /**
+ * Delegates to the standard trust manager. See {@link javax.net.ssl.X509TrustManager#getAcceptedIssuers()}.
+ */
+ public X509Certificate[] getAcceptedIssuers() {
+ return this.standardTrustManager.getAcceptedIssuers();
+ }
+
+ public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing: no exception is thrown, so any client certificate chain is accepted
+
+ }
+
+ public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+ throws CertificateException {
+ // do nothing: no exception is thrown, so any server certificate chain is accepted
+
+ }
+}
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
index 8ebd898..a5793c6 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -26,12 +26,17 @@
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -65,10 +70,12 @@
protected enum Scheme {
HTTP, HTTPS,
}
+
/** The nutch configuration */
private Configuration conf = null;
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+ public HttpResponse(Http http, URL url, CrawlDatum datum)
+ throws ProtocolException, IOException {
this.conf = http.getConf();
this.http = http;
@@ -122,33 +129,43 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
+ Arrays.asList(sslsocket.getSupportedProtocols()));
Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
// Intersect with preferred protocols and ciphers
protocols.retainAll(http.getTlsPreferredProtocols());
ciphers.retainAll(http.getTlsPreferredCipherSuites());
sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
+ protocols.toArray(new String[protocols.size()]));
sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
+ ciphers.toArray(new String[ciphers.size()]));
sslsocket.startHandshake();
socket = sslsocket;
}
if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
+ && conf.getBoolean("store.ip.address", false) == true) {
headers.add("_ip_", sockAddr.getAddress().getHostAddress());
}
// make request
@@ -203,17 +220,17 @@
}
if (http.isCookieEnabled()
- && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+ && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
- .toString();
+ .toString();
reqStr.append("Cookie: ");
reqStr.append(cookie);
reqStr.append("\r\n");
}
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -223,25 +240,24 @@
headers.add("_request_", reqStr.toString());
}
-
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
-
// store the http headers verbatim
if (conf.getBoolean("store.http.headers", false) == true) {
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -257,9 +273,10 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with Selenium only if content type in HTML or XHTML
+ // handle with Selenium only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readPlainContent(url);
} else {
try {
@@ -269,11 +286,13 @@
try {
contentLength = Integer.parseInt(contentLengthString.trim());
} catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
+ throw new HttpException(
+ "bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+ if (http.getMaxContent() >= 0
+ && contentLength > http.getMaxContent()) {
contentLength = http.getMaxContent();
}
@@ -299,17 +318,20 @@
}
}
}
- }
+ }
+ }catch(KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
+ throw new ProtocolException(e);
} finally {
if (socket != null)
socket.close();
}
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /* ------------------------- *
+ * <implementation:Response> *
+ * ------------------------- *
+ */
public URL getUrl() {
return url;
@@ -331,56 +353,63 @@
return content;
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /* ------------------------- *
+ * <implementation:Response> *
+ * ------------------------- *
+ */
private void loadSeleniumHandlers() {
- if (handlers != null) return;
+ if (handlers != null)
+ return;
- String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler");
+ String handlerConfig = this.conf.get("interactiveselenium.handlers",
+ "DefaultHandler");
String[] handlerNames = handlerConfig.split(",");
handlers = new InteractiveSeleniumHandler[handlerNames.length];
for (int i = 0; i < handlerNames.length; i++) {
+ try {
+ String classToLoad = this.getClass().getPackage().getName()
+ + ".handlers." + handlerNames[i];
try {
- String classToLoad = this.getClass().getPackage().getName() + ".handlers." + handlerNames[i];
- try {
- handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).getConstructor().newInstance());
- } catch (IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) {
- e.printStackTrace();
- }
- Http.LOG.info("Successfully loaded " + classToLoad);
- } catch (ClassNotFoundException e) {
- Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]);
- } catch (InstantiationException e) {
- Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]);
- } catch (IllegalAccessException e) {
- Http.LOG.info("Illegal access with Handler: " + handlerNames[i]);
+ handlers[i] = InteractiveSeleniumHandler.class
+ .cast(Class.forName(classToLoad).getConstructor().newInstance());
+ } catch (IllegalArgumentException | InvocationTargetException
+ | NoSuchMethodException | SecurityException e) {
+ e.printStackTrace();
}
+ Http.LOG.info("Successfully loaded " + classToLoad);
+ } catch (ClassNotFoundException e) {
+ Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]);
+ } catch (InstantiationException e) {
+ Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]);
+ } catch (IllegalAccessException e) {
+ Http.LOG.info("Illegal access with Handler: " + handlerNames[i]);
+ }
}
}
private void readPlainContent(URL url) throws IOException {
if (handlers == null)
- loadSeleniumHandlers();
+ loadSeleniumHandlers();
String processedPage = "";
for (InteractiveSeleniumHandler handler : this.handlers) {
- if (! handler.shouldProcessURL(url.toString())) {
- continue;
- }
+ if (!handler.shouldProcessURL(url.toString())) {
+ continue;
+ }
- WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
+ WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
- processedPage += handler.processDriver(driver);
+ processedPage += handler.processDriver(driver);
- HttpWebClient.cleanUpDriver(driver);
+ HttpWebClient.cleanUpDriver(driver);
}
content = processedPage.getBytes("UTF-8");
}
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
@@ -395,13 +424,15 @@
try {
code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ throw new HttpException(
+ "bad status line '" + line + "': " + e.getMessage(), e);
}
return code;
}
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+ private void processHeaderLine(StringBuffer line)
+ throws IOException, HttpException {
int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
@@ -427,24 +458,26 @@
}
// Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private void parseHeaders(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
|| ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -457,8 +490,8 @@
}
}
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
- throws IOException {
+ private static int readLine(PushbackInputStream in, StringBuffer line,
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
@@ -491,4 +524,4 @@
in.unread(value);
return value;
}
-}
+}
\ No newline at end of file
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
index f529892..c5c09c8 100644
--- a/src/plugin/protocol-okhttp/ivy.xml
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -36,7 +36,8 @@
</publications>
<dependencies>
- <dependency org="com.squareup.okhttp3" name="okhttp" rev="3.14.2"/>
+ <dependency org="com.squareup.okhttp3" name="okhttp" rev="4.3.1"/>
+ <dependency org="com.squareup.okhttp3" name="okhttp-brotli" rev="4.3.1"/>
</dependencies>
</ivy-module>
diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml
index b843736..bbeb5da 100755
--- a/src/plugin/protocol-okhttp/plugin.xml
+++ b/src/plugin/protocol-okhttp/plugin.xml
@@ -25,8 +25,13 @@
<library name="protocol-okhttp.jar">
<export name="*"/>
</library>
- <library name="okhttp-3.14.2.jar"/>
- <library name="okio-1.17.2.jar"/>
+ <library name="okhttp-4.3.1.jar"/>
+ <library name="okio-2.4.1.jar"/>
+ <library name="kotlin-stdlib-1.3.61.jar"/>
+ <library name="kotlin-stdlib-common-1.3.61.jar"/>
+ <library name="annotations-13.0.jar"/>
+ <library name="okhttp-brotli-4.3.1.jar"/>
+ <library name="dec-0.1.2.jar"/>
</runtime>
<requires>
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index b4edb19..708863b 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -56,6 +56,7 @@
import okhttp3.OkHttpClient;
import okhttp3.Protocol;
import okhttp3.Request;
+import okhttp3.brotli.BrotliInterceptor;
public class OkHttp extends HttpBase {
@@ -216,6 +217,9 @@
builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
}
+ // enable support for Brotli compression (Content-Encoding)
+ builder.addInterceptor(BrotliInterceptor.INSTANCE);
+
client = builder.build();
}
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java
new file mode 100644
index 0000000..1eea806
--- /dev/null
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+/**
+ * Trust manager that accepts any certificate chain without validating it.
+ * Based on EasyX509TrustManager from commons-httpclient.
+ */
+public class DummyX509TrustManager implements X509TrustManager {
+  /** Default-algorithm trust manager; used only by getAcceptedIssuers(). */
+  private final X509TrustManager standardTrustManager;
+
+  /**
+   * Creates the default-algorithm trust manager for the given key store.
+   *
+   * @param keystore source of trust material for the factory; may be {@code null}
+   * @throws NoSuchAlgorithmException if no trust manager can be obtained
+   * @throws KeyStoreException if the factory cannot be initialized
+   */
+  public DummyX509TrustManager(KeyStore keystore)
+      throws NoSuchAlgorithmException, KeyStoreException {
+    String algo = TrustManagerFactory.getDefaultAlgorithm();
+    TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+    factory.init(keystore);
+    TrustManager[] trustmanagers = factory.getTrustManagers();
+    if (trustmanagers.length == 0) {
+      throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+    }
+    this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+  }
+
+  /** Not part of {@link X509TrustManager}; always returns {@code true}. */
+  public boolean isClientTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /** Not part of {@link X509TrustManager}; always returns {@code true}. */
+  public boolean isServerTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /** Returns the issuers accepted by the wrapped default trust manager. */
+  @Override
+  public X509Certificate[] getAcceptedIssuers() {
+    return this.standardTrustManager.getAcceptedIssuers();
+  }
+
+  /** Accepts every client certificate chain; never throws. */
+  @Override
+  public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // intentionally empty: certificate validation is skipped
+  }
+
+  /** Accepts every server certificate chain; never throws. */
+  @Override
+  public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // intentionally empty: certificate validation is skipped
+  }
+}
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 4a20b04..b394d02 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -25,13 +25,17 @@
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.security.KeyManagementException;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -61,10 +65,12 @@
protected enum Scheme {
HTTP, HTTPS,
}
+
/** The nutch configuration */
private Configuration conf = null;
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+ public HttpResponse(Http http, URL url, CrawlDatum datum)
+ throws ProtocolException, IOException {
this.conf = http.getConf();
this.http = http;
@@ -118,33 +124,43 @@
socket.connect(sockAddr, http.getTimeout());
if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
+
+ // Optionally skip TLS/SSL certificate validation
+ SSLSocketFactory factory;
+ if (http.isTlsCheckCertificates()) {
+ factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
+ } else {
+ SSLContext sslContext = SSLContext.getInstance("TLS");
+ sslContext.init(null,
+ new TrustManager[] { new DummyX509TrustManager(null) }, null);
+ factory = sslContext.getSocketFactory();
+ }
+
+ SSLSocket sslsocket = (SSLSocket) factory.createSocket(socket, sockHost,
+ sockPort, true);
sslsocket.setUseClientMode(true);
// Get the protocols and ciphers supported by this JVM
Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
+ Arrays.asList(sslsocket.getSupportedProtocols()));
Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
// Intersect with preferred protocols and ciphers
protocols.retainAll(http.getTlsPreferredProtocols());
ciphers.retainAll(http.getTlsPreferredCipherSuites());
sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
+ protocols.toArray(new String[protocols.size()]));
sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
+ ciphers.toArray(new String[ciphers.size()]));
sslsocket.startHandshake();
socket = sslsocket;
}
if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
+ && conf.getBoolean("store.ip.address", false) == true) {
headers.add("_ip_", sockAddr.getAddress().getHostAddress());
}
// make request
@@ -199,17 +215,17 @@
}
if (http.isCookieEnabled()
- && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+ && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
- .toString();
+ .toString();
reqStr.append("Cookie: ");
reqStr.append(cookie);
reqStr.append("\r\n");
}
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
+ reqStr.append("If-Modified-Since: "
+ + HttpDateFormat.toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
@@ -219,25 +235,24 @@
headers.add("_request_", reqStr.toString());
}
-
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE);
+ new PushbackInputStream(new BufferedInputStream(
+ socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
-
// store the http headers verbatim
if (conf.getBoolean("store.http.headers", false) == true) {
httpHeaders = new StringBuffer();
}
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+ headers.add("nutch.fetch.time",
+ Long.toString(System.currentTimeMillis()));
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
@@ -253,9 +268,10 @@
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with Selenium only if content type in HTML or XHTML
+ // handle with Selenium only if content type in HTML or XHTML
if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ if (contentType.contains("text/html")
+ || contentType.contains("application/xhtml")) {
readPlainContent(url);
} else {
try {
@@ -265,11 +281,13 @@
try {
contentLength = Integer.parseInt(contentLengthString.trim());
} catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
+ throw new HttpException(
+ "bad content length: " + contentLengthString);
}
}
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+ if (http.getMaxContent() >= 0
+ && contentLength > http.getMaxContent()) {
contentLength = http.getMaxContent();
}
@@ -298,17 +316,20 @@
if (httpHeaders != null) {
headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
}
- }
+ }
+ } catch(KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
+ throw new ProtocolException(e);
} finally {
if (socket != null)
socket.close();
}
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
public URL getUrl() {
return url;
@@ -330,9 +351,10 @@
return content;
}
- /* ------------------------- *
- * <implementation:Response> *
- * ------------------------- */
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
private void readPlainContent(URL url) throws IOException {
String page = HttpWebClient.getHtmlPage(url.toString(), conf);
@@ -340,7 +362,8 @@
content = page.getBytes("UTF-8");
}
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
@@ -355,13 +378,15 @@
try {
code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ throw new HttpException(
+ "bad status line '" + line + "': " + e.getMessage(), e);
}
return code;
}
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+ private void processHeaderLine(StringBuffer line)
+ throws IOException, HttpException {
int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
@@ -387,24 +412,26 @@
}
// Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private void parseHeaders(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+ || ((pos = line.indexOf("<HTML")) != -1)
|| ((pos = line.indexOf("<html")) != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -417,8 +444,8 @@
}
}
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
- throws IOException {
+ private static int readLine(PushbackInputStream in, StringBuffer line,
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
@@ -451,4 +478,4 @@
in.unread(value);
return value;
}
-}
+}
\ No newline at end of file