blob: 03ddfe96249be98f397b81c33e5f074d1c58438e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.http;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.junit.After;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Test cases for protocol-http - robustness regarding bad server responses:
* malformed HTTP header lines, etc. See, NUTCH-2549.
*/
public class TestBadServerResponses {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private Http http;
private ServerSocket server;
private Configuration conf;
private int port = 47505;
private static final String responseHeader = "HTTP/1.1 200 OK\r\n";
private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
public void setUp() throws Exception {
conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site-test.xml");
conf.setBoolean("store.http.headers", true);
http = new Http();
http.setConf(conf);
}
@After
public void tearDown() throws Exception {
server.close();
}
/**
* Starts the test server at a specified port and constant response.
*
* @param portno
* Port number.
* @param response
* response sent on every request
*/
private void runServer(int port, String response) throws Exception {
server = new ServerSocket();
server.bind(new InetSocketAddress("127.0.0.1", port));
Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
while (true) {
LOG.info("Listening on port {}", port);
Socket socket = server.accept();
LOG.info("Connection received");
try (
BufferedReader in = new BufferedReader(new InputStreamReader(
socket.getInputStream(), StandardCharsets.UTF_8));
PrintWriter out = new PrintWriter(new OutputStreamWriter(
socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
String line;
while ((line = in.readLine()) != null) {
LOG.info("Request: {}", line);
if (line.trim().isEmpty()) {
break;
}
Matcher m = requestPattern.matcher(line);
if (m.find()) {
LOG.info("Requested {}", m.group(1));
if (!m.group(1).startsWith("/")) {
response = "HTTP/1.1 400 Bad request\r\n\r\n";
}
}
}
LOG.info("Response: {}",
response.substring(0, Math.min(1024, response.length())));
out.print(response);
} catch (Exception e) {
LOG.warn("Exception in test server:", e);
}
}
}
private void launchServer(String response) throws InterruptedException {
Thread serverThread = new Thread(() -> {
try {
runServer(port, response);
} catch (Exception e) {
LOG.warn("Test server died:", e);
}
});
serverThread.start();
Thread.sleep(50);
}
/**
* Fetches the specified <code>page</code> from the local test server and
* checks whether the HTTP response status code matches with the expected
* code.
*
* @param page
* Page to be fetched.
* @param expectedCode
* HTTP response status code expected while fetching the page.
*/
private Response fetchPage(String page, int expectedCode) throws Exception {
URL url = new URL("http", "127.0.0.1", port, page);
LOG.info("Fetching {}", url);
CrawlDatum crawlDatum = new CrawlDatum();
Response response = http.getResponse(url, crawlDatum, true);
assertEquals("HTTP Status Code for " + url, expectedCode,
response.getCode());
return response;
}
@Test
public void testBadHttpServer() throws Exception {
setUp();
// test with trivial well-formed content, to make sure the server is
// responding
launchServer(responseHeader + simpleContent);
fetchPage("/", 200);
}
/**
* NUTCH-2555 URL normalization problem: path not starting with a '/'
*/
@Test
public void testRequestNotStartingWithSlash() throws Exception {
setUp();
launchServer(responseHeader + simpleContent);
fetchPage("?171", 200);
}
/**
* NUTCH-2564 protocol-http throws an error when the content-length header is
* not a number
*/
@Test
public void testContentLengthNotANumber() throws Exception {
setUp();
launchServer(
responseHeader + "Content-Length: thousand\r\n" + simpleContent);
fetchPage("/", 200);
}
/**
* NUTCH-2559 protocol-http cannot handle colons after the HTTP status code
*/
@Test
public void testHeaderWithColon() throws Exception {
setUp();
launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
fetchPage("/", 200);
}
/**
* NUTCH-2563 HTTP header spellchecking issues
*/
@Test
public void testHeaderSpellChecking() throws Exception {
setUp();
launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+ simpleContent);
fetchPage("/", 200);
}
/**
* NUTCH-2557 protocol-http fails to follow redirections when an HTTP response
* body is invalid
*/
@Test
public void testIgnoreErrorInRedirectPayload() throws Exception {
setUp();
launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+ "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
Response fetched = fetchPage("/", 302);
assertNotNull("No redirect Location.", fetched.getHeader("Location"));
assertEquals("Wrong redirect Location.", "http://example.com/",
fetched.getHeader("Location"));
}
/**
* NUTCH-2558 protocol-http cannot handle a missing HTTP status line
*/
@Test
public void testNoStatusLine() throws Exception {
setUp();
String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
launchServer(text);
Response fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
new String(fetched.getContent(), StandardCharsets.UTF_8));
server.close();
text = "<!DOCTYPE html>\n<html>\n<head>\n"
+ "<title>Testing no HTTP header èéâ</title>\n"
+ "<meta charset=\"utf-8\">\n"
+ "</head>\n<body>This is a text containing non-ASCII characters:"
+ "\u00e4\u00f6\u00fc\u00df</body>\n</html";
launchServer(text);
fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
new String(fetched.getContent(), StandardCharsets.UTF_8));
}
/**
* NUTCH-2560 protocol-http throws an error when an http header spans over
* multiple lines
*/
@Test
public void testMultiLineHeader() throws Exception {
setUp();
launchServer(responseHeader
+ "Set-Cookie: UserID=JohnDoe;\r\n Max-Age=3600;\r\n Version=1\r\n"
+ simpleContent);
Response fetched = fetchPage("/", 200);
LOG.info("Headers: {}", fetched.getHeaders());
assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
fetched.getHeader("Set-Cookie").contains("Version=1"));
}
/**
* NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
* responses
*/
@Test(expected = Exception.class)
public void testOverlongHeader() throws Exception {
setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
for (int i = 0; i < 80; i++) {
response.append("X-Custom-Header-");
for (int j = 0; j < 10000; j++) {
response.append('x');
}
response.append(": hello\r\n");
}
response.append("\r\n" + simpleContent);
launchServer(response.toString());
// should throw exception because of overlong header
fetchPage("/", 200);
}
/**
* NUTCH-2562 protocol-http fails to read large chunked HTTP responses,
* NUTCH-2575 protocol-http does not respect the maximum content-size for
* chunked responses
*/
@Test
public void testChunkedContent() throws Exception {
setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
response.append("Content-Type: text/html\r\n");
response.append("Transfer-Encoding: chunked\r\n");
// 81920 bytes (80 chunks, 1024 bytes each)
// > 65536 (http.content.limit defined in nutch-site-test.xml)
for (int i = 0; i < 80; i++) {
response.append(String.format("\r\n400\r\n%02x\r\n", i));
for (int j = 0; j < 1012; j++) {
response.append('x');
}
response.append(String.format("\r\n%02x\r\n", i));
response.append("\r\n");
}
response.append("\r\n0\r\n\r\n");
launchServer(response.toString());
Response fetched = fetchPage("/", 200);
assertEquals(
"Chunked content not truncated according to http.content.limit", 65536,
fetched.getContent().length);
}
}