NUTCH-2576 HTTP protocol implementation based on okhttp
- port unit tests for NUTCH-2549 from protocol-http
- ignore failing unit tests
diff --git a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
index dd74c32..72776c3 100644
--- a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
+++ b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
@@ -34,4 +34,10 @@
<value>true</value>
</property>
-</configuration>
\ No newline at end of file
+<property>
+ <name>http.content.limit</name>
+ <value>65536</value>
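+ <description>Maximum size in bytes of fetched content, longer content is truncated. TestBadServerResponses.testChunkedContent relies on this value.</description>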
+</property>
+
+</configuration>
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
new file mode 100644
index 0000000..1006982
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.okhttp;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.lang.invoke.MethodHandles;
+import java.net.InetSocketAddress;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.junit.After;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test cases for protocol-okhttp - robustness regarding bad server responses:
+ * malformed HTTP header lines, etc. Ported from protocol-http, see NUTCH-2549.
+ */
+public class TestBadServerResponses {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+ // Per-test state: http and conf are assigned in setUp(), server in runServer().
+ private OkHttp http; // protocol implementation under test
+ private ServerSocket server; // listening socket of the local test server
+ private Configuration conf; // configuration handed to the OkHttp instance
+ private int port = 47505; // port on 127.0.0.1 used by test server and fetches
+ // Building blocks of the canned responses sent by the test server.
+ private static final String responseHeader = "HTTP/1.1 200 OK\r\n"; // status line
+ private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text."; // one header, empty line, body
+
+  /** Creates configuration and OkHttp instance; called explicitly at the start of each test. */
+  public void setUp() throws Exception {
+    this.conf = new Configuration();
+    this.conf.addResource("nutch-default.xml");
+    this.conf.addResource("nutch-site-test.xml");
+    this.conf.setBoolean("store.http.headers", true);
+    this.http = new OkHttp();
+    this.http.setConf(this.conf);
+  }
+  /** Closes the test server socket, which also ends the accept loop of runServer(). */
+  @After
+  public void tearDown() throws Exception {
+    if (server != null) { server.close(); } // null if no server was launched
+  }
+
+  /**
+   * Starts the test server at a specified port and constant response. Serves
+   * requests until accept() fails, e.g. because the server socket was closed.
+   * A request target not starting with '/' gets a "400 Bad request" instead.
+   *
+   * @param port     port number on 127.0.0.1 the server binds to
+   * @param response response sent for every other request
+   */
+  private void runServer(int port, String response) throws Exception {
+    server = new ServerSocket();
+    server.bind(new InetSocketAddress("127.0.0.1", port));
+    Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
+    while (true) {
+      LOG.info("Listening on port {}", port);
+      Socket socket = server.accept();
+      LOG.info("Connection received");
+      try (
+          BufferedReader in = new BufferedReader(new InputStreamReader(
+              socket.getInputStream(), StandardCharsets.UTF_8));
+          PrintWriter out = new PrintWriter(new OutputStreamWriter(
+              socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
+        String reply = response; // per-request copy, a 400 must not stick
+        String line;
+        while ((line = in.readLine()) != null) {
+          LOG.info("Request: {}", line);
+          if (line.trim().isEmpty()) {
+            break;
+          }
+          Matcher m = requestPattern.matcher(line);
+          if (m.find()) {
+            LOG.info("Requested {}", m.group(1));
+            if (!m.group(1).startsWith("/")) {
+              reply = "HTTP/1.1 400 Bad request\r\n\r\n";
+            }
+          }
+        }
+        LOG.info("Response: {}",
+            reply.substring(0, Math.min(1024, reply.length())));
+        out.print(reply);
+      } catch (Exception e) {
+        LOG.warn("Exception in test server:", e);
+      }
+    }
+  }
+
+  /** Runs runServer() with the given response in a background thread. */
+  private void launchServer(String response) throws InterruptedException {
+    new Thread(() -> {
+      try {
+        runServer(port, response);
+      } catch (Exception e) {
+        LOG.warn("Test server died:", e);
+      }
+    }).start();
+    Thread.sleep(50); // fixed grace period for the server thread to start up
+  }
+
+  /**
+   * Requests <code>page</code> from the local test server using the protocol
+   * implementation under test and asserts that the HTTP status code of the
+   * response equals <code>expectedCode</code>.
+   *
+   * @param page
+   *          file part (path and/or query) of the URL to be fetched
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page
+   * @return the fetched response, for further checks by the caller
+   */
+  private Response fetchPage(String page, int expectedCode) throws Exception {
+    URL pageUrl = new URL("http", "127.0.0.1", port, page);
+    LOG.info("Fetching {}", pageUrl);
+    Response fetched = http.getResponse(pageUrl, new CrawlDatum(), true);
+    int actualCode = fetched.getCode();
+    assertEquals("HTTP Status Code for " + pageUrl, expectedCode, actualCode);
+    return fetched;
+  }
+
+ @Test
+ public void testBadHttpServer() throws Exception {
+ setUp();
+ // test with trivial well-formed content, to make sure the server is
+ // responding
+ launchServer(responseHeader + simpleContent);
+ fetchPage("/", 200);
+ }
+
+ /**
+ * NUTCH-2555 URL normalization problem: path not starting with a '/'
+ */
+ @Test
+ public void testRequestNotStartingWithSlash() throws Exception {
+ setUp();
+ launchServer(responseHeader + simpleContent);
+ fetchPage("?171", 200);
+ }
+
+ /**
+ * NUTCH-2564 protocol-http throws an error when the content-length header is
+ * not a number
+ */
+ @Test
+ public void testContentLengthNotANumber() throws Exception {
+ setUp();
+ launchServer(
+ responseHeader + "Content-Length: thousand\r\n" + simpleContent);
+ fetchPage("/", 200);
+ }
+
+ /**
+ * NUTCH-2559 protocol-http cannot handle colons after the HTTP status code
+ */
+ @Ignore("Fails with okhttp 3.10.0")
+ @Test
+ public void testHeaderWithColon() throws Exception {
+ setUp();
+ launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
+ fetchPage("/", 200);
+ }
+
+ /**
+ * NUTCH-2563 HTTP header spellchecking issues
+ */
+ @Test
+ public void testHeaderSpellChecking() throws Exception {
+ setUp();
+ launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+ + simpleContent);
+ fetchPage("/", 200);
+ }
+
+  /**
+   * NUTCH-2557 protocol-http fails to follow redirections when an HTTP response
+   * body is invalid: status code and Location header must still be returned.
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testIgnoreErrorInRedirectPayload() throws Exception {
+    setUp();
+    String redirect = "HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+        + "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.";
+    launchServer(redirect);
+    String location = fetchPage("/", 302).getHeader("Location");
+    assertNotNull("No redirect Location.", location);
+    assertEquals("Wrong redirect Location.", "http://example.com/", location);
+  }
+
+ /**
+ * NUTCH-2558 protocol-http cannot handle a missing HTTP status line
+ */
+ @Ignore("Fails with okhttp 3.10.0")
+ @Test
+ public void testNoStatusLine() throws Exception {
+ setUp();
+ String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
+ launchServer(text);
+ Response fetched = fetchPage("/", 200);
+ assertEquals("Wrong text returned for response with no status line.", text,
+ new String(fetched.getContent(), StandardCharsets.UTF_8));
+ server.close();
+ text = "<!DOCTYPE html>\n<html>\n<head>\n"
+ + "<title>Testing no HTTP header èéâ</title>\n"
+ + "<meta charset=\"utf-8\">\n"
+ + "</head>\n<body>This is a text containing non-ASCII characters:"
+ + "\u00e4\u00f6\u00fc\u00df</body>\n</html";
+ launchServer(text);
+ fetched = fetchPage("/", 200);
+ assertEquals("Wrong text returned for response with no status line.", text,
+ new String(fetched.getContent(), StandardCharsets.UTF_8));
+ }
+
+  /**
+   * NUTCH-2560 protocol-http throws an error when an http header spans over
+   * multiple lines: the continuation lines must be part of the header value.
+   */
+  @Ignore("Fails with okhttp 3.10.0")
+  @Test
+  public void testMultiLineHeader() throws Exception {
+    setUp();
+    String foldedCookie = "Set-Cookie: UserID=JohnDoe;\r\n Max-Age=3600;\r\n Version=1\r\n";
+    launchServer(responseHeader + foldedCookie + simpleContent);
+    Response fetched = fetchPage("/", 200);
+    LOG.info("Headers: {}", fetched.getHeaders());
+    String cookie = fetched.getHeader("Set-Cookie");
+    assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", cookie);
+    assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
+        cookie.contains("Version=1"));
+  }
+
+  /**
+   * NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
+   * responses: 80 header lines of about 10 kB each, the fetch must throw.
+   */
+  @Test(expected = Exception.class)
+  public void testOverlongHeader() throws Exception {
+    setUp();
+    StringBuilder headerLine = new StringBuilder("X-Custom-Header-");
+    for (int j = 0; j < 10000; j++) {
+      headerLine.append('x');
+    }
+    headerLine.append(": hello\r\n");
+    StringBuilder response = new StringBuilder(responseHeader);
+    for (int i = 0; i < 80; i++) {
+      response.append(headerLine);
+    }
+    response.append("\r\n").append(simpleContent);
+    launchServer(response.toString());
+    // should throw exception because of overlong header
+    fetchPage("/", 200);
+  }
+
+ /**
+ * NUTCH-2562 protocol-http fails to read large chunked HTTP responses,
+ * NUTCH-2575 protocol-http does not respect the maximum content-size for
+ * chunked responses
+ */
+ @Test
+ public void testChunkedContent() throws Exception {
+ setUp();
+ StringBuilder response = new StringBuilder();
+ response.append(responseHeader);
+ response.append("Content-Type: text/html\r\n");
+ response.append("Transfer-Encoding: chunked\r\n");
+ // 81920 bytes (80 chunks, 1024 bytes each)
+ // > 65536 (http.content.limit defined in nutch-site-test.xml)
+ for (int i = 0; i < 80; i++) {
+ response.append(String.format("\r\n400\r\n%02x\r\n", i));
+ for (int j = 0; j < 1012; j++) {
+ response.append('x');
+ }
+ response.append(String.format("\r\n%02x\r\n", i));
+ response.append("\r\n");
+ }
+ response.append("\r\n0\r\n\r\n");
+ launchServer(response.toString());
+ Response fetched = fetchPage("/", 200);
+ assertEquals(
+ "Chunked content not truncated according to http.content.limit", 65536,
+ fetched.getContent().length);
+ }
+
+}