Merge pull request #460 from sebastian-nagel/NUTCH-2727-upgrade-Hadoop-2.9.2
NUTCH-2727 Upgrade Hadoop dependencies to 2.9.2
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index a50441f..e753c6f 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -52,7 +52,7 @@
<dependency org="com.tdunning" name="t-digest" rev="3.2" />
<!-- Hadoop Dependencies -->
- <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.4" conf="*->default">
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.9.2" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -60,9 +60,9 @@
<exclude org="org.mortbay.jetty" name="jsp-*" />
<exclude org="ant" name="ant" />
</dependency>
- <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.7.4" conf="*->default"/>
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.7.4" conf="*->default"/>
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.9.2" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.9.2" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.9.2" conf="*->default"/>
<!-- End of Hadoop Dependencies -->
<dependency org="org.apache.tika" name="tika-core" rev="1.22" />
@@ -76,7 +76,9 @@
<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.0" />
- <dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
+ <dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
+ <exclude module="hadoop-client" />
+ </dependency>
<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
index 7dcc400..a545a4c 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -215,4 +215,20 @@
return false;
}
+ /**
+ * Look up a protocol plugin by its extension ID and return an instance of
+ * it.
+ *
+ * @param id
+ * protocol plugin ID, e.g.,
+ * <code>org.apache.nutch.protocol.http</code>
+ * @return protocol instance for the given ID
+ * @throws PluginRuntimeException
+ * if plugin not found or failed to instantiate
+ */
+ public Protocol getProtocolById(String id) throws PluginRuntimeException {
+ Extension extension = getExtensionById(id);
+ if (extension != null) {
+ return getProtocolInstanceByExtension(extension);
+ }
+ throw new PluginRuntimeException("ID " + id + " not found");
+ }
}
diff --git a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
index 72776c3..1e9e4a6 100644
--- a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
+++ b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
@@ -20,6 +20,11 @@
<configuration>
<property>
+ <name>plugin.includes</name>
+ <value>protocol-okhttp</value>
+</property>
+
+<property>
<name>http.agent.name</name>
<value>Nutch-Test</value>
</property>
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
index 7dcd642..bf69893 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
@@ -26,6 +26,7 @@
import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
import java.net.InetSocketAddress;
+import java.net.MalformedURLException;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.URL;
@@ -34,8 +35,14 @@
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Ignore;
import org.junit.Test;
@@ -51,7 +58,7 @@
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private OkHttp http;
+ private Protocol http;
private ServerSocket server;
private Configuration conf;
private int port = 47506;
@@ -60,13 +67,15 @@
private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
public void setUp() throws Exception {
- conf = new Configuration();
+ conf = NutchConfiguration.create();
conf.addResource("nutch-default.xml");
+ // test-specific plugin configuration: sets plugin.includes to
+ // protocol-okhttp (presumably required for the lookup by ID below - confirm)
conf.addResource("nutch-site-test.xml");
conf.setBoolean("store.http.headers", true);
- http = new OkHttp();
- http.setConf(conf);
+ http = new ProtocolFactory(conf)
+ .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
}
@After
@@ -74,6 +83,20 @@
server.close();
}
+ /**
+ * Read the value stored under {@link Response#RESPONSE_HEADERS} in the
+ * metadata of the fetched content.
+ *
+ * @param response
+ * protocol output of a fetch
+ * @return the stored response headers as a single string
+ */
+ public static String getHeaders(ProtocolOutput response) {
+ return response
+ .getContent()
+ .getMetadata()
+ .get(Response.RESPONSE_HEADERS);
+ }
+
+ /**
+ * Get the value of a single HTTP response header.
+ *
+ * @param response
+ * protocol output of a fetch
+ * @param header
+ * header name, compared case-sensitively
+ * @return value of the first header line with the given name, or null if
+ * no headers are stored or no line matches
+ */
+ public static String getHeader(ProtocolOutput response, String header) {
+ String headers = getHeaders(response);
+ if (headers == null) {
+ return null;
+ }
+ for (String line : headers.split("\r\n")) {
+ // limit 2: split into name and value only (a limit of 1 would return
+ // the whole line unsplit), the value itself may contain ": "
+ String[] parts = line.split(": ", 2);
+ if (parts.length == 2 && parts[0].equals(header)) {
+ return parts[1];
+ }
+ }
+ return null;
+ }
+
/**
* Starts the test server at a specified port and constant response.
*
@@ -141,14 +164,25 @@
* @param expectedCode
* HTTP response status code expected while fetching the page.
*/
- private Response fetchPage(String page, int expectedCode) throws Exception {
+ private ProtocolOutput fetchPage(String page, int expectedCode)
+ throws MalformedURLException {
URL url = new URL("http", "127.0.0.1", port, page);
LOG.info("Fetching {}", url);
CrawlDatum crawlDatum = new CrawlDatum();
- Response response = http.getResponse(url, crawlDatum, true);
- assertEquals("HTTP Status Code for " + url, expectedCode,
- response.getCode());
- return response;
+ ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+ crawlDatum);
+ if (expectedCode == -1) {
+ // -1 is passed when no HTTP status code is expected to be recorded:
+ // log the protocol output to ease debugging
+ LOG.info("Protocol output for {}: {}", url, out);
+ }
+ // status code as recorded in the CrawlDatum metadata, -1 if none recorded
+ int httpStatusCode = -1;
+ if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
+ .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ }
+
+ assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
+
+ return out;
}
@Test
@@ -214,10 +248,10 @@
setUp();
launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+ "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
- Response fetched = fetchPage("/", 302);
- assertNotNull("No redirect Location.", fetched.getHeader("Location"));
+ ProtocolOutput fetched = fetchPage("/", 302);
+ assertNotNull("No redirect Location.", getHeader(fetched, "Location"));
assertEquals("Wrong redirect Location.", "http://example.com/",
- fetched.getHeader("Location"));
+ getHeader(fetched, "Location"));
}
/**
@@ -229,9 +263,9 @@
setUp();
String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
launchServer(text);
- Response fetched = fetchPage("/", 200);
+ ProtocolOutput fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
- new String(fetched.getContent(), StandardCharsets.UTF_8));
+ new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
server.close();
text = "<!DOCTYPE html>\n<html>\n<head>\n"
+ "<title>Testing no HTTP header èéâ</title>\n"
@@ -241,7 +275,7 @@
launchServer(text);
fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
- new String(fetched.getContent(), StandardCharsets.UTF_8));
+ new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
}
/**
@@ -255,18 +289,18 @@
launchServer(responseHeader
+ "Set-Cookie: UserID=JohnDoe;\r\n Max-Age=3600;\r\n Version=1\r\n"
+ simpleContent);
- Response fetched = fetchPage("/", 200);
- LOG.info("Headers: {}", fetched.getHeaders());
- assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
+ ProtocolOutput fetched = fetchPage("/", 200);
+ LOG.info("Headers: {}", getHeaders(fetched));
+ assertNotNull("Failed to set multi-line \"Set-Cookie\" header.",
+ getHeader(fetched, "Set-Cookie"));
assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
- fetched.getHeader("Set-Cookie").contains("Version=1"));
+ getHeader(fetched, "Set-Cookie").contains("Version=1"));
}
/**
* NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
* responses
*/
- @Test(expected = Exception.class)
public void testOverlongHeader() throws Exception {
setUp();
StringBuilder response = new StringBuilder();
@@ -281,7 +315,7 @@
response.append("\r\n" + simpleContent);
launchServer(response.toString());
// should throw exception because of overlong header
- fetchPage("/", 200);
+ fetchPage("/", -1);
}
/**
@@ -308,10 +342,10 @@
}
response.append("\r\n0\r\n\r\n");
launchServer(response.toString());
- Response fetched = fetchPage("/", 200);
+ ProtocolOutput fetched = fetchPage("/", 200);
assertEquals(
"Chunked content not truncated according to http.content.limit", 65536,
- fetched.getContent().length);
+ fetched.getContent().getContent().length);
}
}
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
index 542fb41..3650722 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -23,10 +23,12 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.okhttp.OkHttp;
+import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Test;
import org.mortbay.jetty.Server;
@@ -40,19 +42,21 @@
public class TestProtocolOkHttp {
private static final String RES_DIR = System.getProperty("test.data", ".");
- private OkHttp http;
+ private Protocol http;
private Server server;
private Context root;
private Configuration conf;
private int port;
public void setUp(boolean redirection) throws Exception {
- conf = new Configuration();
+ conf = NutchConfiguration.create();
conf.addResource("nutch-default.xml");
+ // plugin tests specific config file - adds protocol-okhttp to
+ // plugin.includes
conf.addResource("nutch-site-test.xml");
- http = new OkHttp();
- http.setConf(conf);
+ http = new ProtocolFactory(conf)
+ .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
server = new Server();
@@ -123,12 +127,17 @@
private void fetchPage(String page, int expectedCode) throws Exception {
URL url = new URL("http", "127.0.0.1", port, page);
CrawlDatum crawlDatum = new CrawlDatum();
- Response response = http.getResponse(url, crawlDatum, true);
+
ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
crawlDatum);
+ int httpStatusCode = -1;
+ if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
+ .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ }
Content content = out.getContent();
- assertEquals("HTTP Status Code for " + url, expectedCode,
- response.getCode());
+
+ assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
if (page.compareTo("/nonexists.html") != 0
&& page.compareTo("/brokenpage.jsp") != 0