blob: 5942486f2303eb0ee8612dbe4dcb3ac8180b293e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.httpclient;
import java.lang.invoke.MethodHandles;
import java.io.InputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
/**
* <p>
* This class is a protocol plugin that configures an HTTP client for Basic,
* Digest and NTLM authentication schemes for web server as well as proxy
* server. It takes care of HTTPS protocol as well as cookies in a single fetch
* session.
* </p>
* <p>
* Documentation can be found on the Nutch
* <a href="https://cwiki.apache.org/confluence/display/NUTCH/HttpAuthenticationSchemes" >
* HttpAuthenticationSchemes</a> wiki page.
* </p>
* <p>
* The original description of the motivation to support
* <a href="https://cwiki.apache.org/confluence/display/NUTCH/HttpPostAuthentication" >
* HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
* HttpPostAuthentication development is documented at the
* <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
* issue.
*
* @author Susam Pal
*/
public class Http extends HttpBase {
protected static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
// Since the Configuration has not yet been set,
// then an unconfigured client is returned.
private static HttpClient client = new HttpClient(connectionManager);
private static String defaultUsername;
private static String defaultPassword;
private static String defaultRealm;
private static String defaultScheme;
private static String authFile;
private static String agentHost;
private static boolean authRulesRead = false;
private static Configuration conf;
private int maxThreadsTotal = 10;
private String proxyUsername;
private String proxyPassword;
private String proxyRealm;
private static HttpFormAuthConfigurer formConfigurer;
/**
* Returns the configured HTTP client.
*
* @return HTTP client
*/
static synchronized HttpClient getClient() {
return client;
}
/**
* Constructs this plugin.
*/
public Http() {
super(LOG);
}
/**
* Reads the configuration from the Nutch configuration files and sets the
* configuration.
*
* @param conf
* Configuration
*/
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
Http.conf = conf;
this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
this.proxyUsername = conf.get("http.proxy.username", "");
this.proxyPassword = conf.get("http.proxy.password", "");
this.proxyRealm = conf.get("http.proxy.realm", "");
agentHost = conf.get("http.agent.host", "");
authFile = conf.get("http.auth.file", "");
configureClient();
try {
setCredentials();
} catch (Exception ex) {
if (LOG.isErrorEnabled()) {
LOG.error("Http ", ex);
LOG.error("Could not read " + authFile + " : " + ex.getMessage());
}
}
}
/**
* Main method.
*
* @param args
* Command line arguments
* @throws Exception if a fatal error is encountered whilst running
* the program
*/
public static void main(String[] args) throws Exception {
Http http = new Http();
http.setConf(NutchConfiguration.create());
main(http, args);
}
/**
* Fetches the <code>url</code> with a configured HTTP client and gets the
* response.
*
* @param url
* URL to be fetched
* @param datum
* Crawl data
* @param redirect
* Follow redirects if and only if true
* @return HTTP response
*/
@Override
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
resolveCredentials(url);
return new HttpResponse(this, url, datum, redirect);
}
/**
* Configures the HTTP client
*/
private void configureClient() {
// Set up an HTTPS socket factory that accepts self-signed certs.
ProtocolSocketFactory factory;
if (tlsCheckCertificate) {
factory = new SSLProtocolSocketFactory();
} else {
factory = new DummySSLProtocolSocketFactory();
}
Protocol https = new Protocol("https", factory, 443);
Protocol.registerProtocol("https", https);
HttpConnectionManagerParams params = connectionManager.getParams();
params.setConnectionTimeout(timeout);
params.setSoTimeout(timeout);
params.setSendBufferSize(BUFFER_SIZE);
params.setReceiveBufferSize(BUFFER_SIZE);
// --------------------------------------------------------------------------------
// NUTCH-1836: Modification to increase the number of available connections
// for multi-threaded crawls.
// --------------------------------------------------------------------------------
params.setMaxTotalConnections(
conf.getInt("mapreduce.tasktracker.map.tasks.maximum", 5)
* conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
// Also set max connections per host to maxThreadsTotal since all threads
// might be used to fetch from the same host - otherwise timeout errors can
// occur
params.setDefaultMaxConnectionsPerHost(
conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
// executeMethod(HttpMethod) seems to ignore the connection timeout on the
// connection manager.
// set it explicitly on the HttpClient.
client.getParams().setConnectionManagerTimeout(timeout);
HostConfiguration hostConf = client.getHostConfiguration();
ArrayList<Header> headers = new ArrayList<Header>();
// Note: some header fields (e.g., "User-Agent") are set per GET request
if (!acceptLanguage.isEmpty()) {
headers.add(new Header("Accept-Language", acceptLanguage));
}
if (!acceptCharset.isEmpty()) {
headers.add(new Header("Accept-Charset", acceptCharset));
}
if (!accept.isEmpty()) {
headers.add(new Header("Accept", accept));
}
// accept gzipped content
headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
hostConf.getParams().setParameter("http.default-headers", headers);
// HTTP proxy server details
if (useProxy) {
hostConf.setProxy(proxyHost, proxyPort);
if (proxyUsername.length() > 0) {
AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
this.proxyRealm);
NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
this.proxyPassword, Http.agentHost, this.proxyRealm);
client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
}
}
}
/**
* Reads authentication configuration file (defined as 'http.auth.file' in
* Nutch configuration file) and sets the credentials for the configured
* authentication scopes in the HTTP client object.
*
* @throws ParserConfigurationException
* If a document builder can not be created.
* @throws SAXException
* If any parsing error occurs.
* @throws IOException
* If any I/O error occurs.
*/
private static synchronized void setCredentials()
throws ParserConfigurationException, SAXException, IOException {
if (authRulesRead)
return;
authRulesRead = true; // Avoid re-attempting to read
InputStream is = conf.getConfResourceAsInputStream(authFile);
if (is != null) {
Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
.parse(is);
Element rootElement = doc.getDocumentElement();
if (!"auth-configuration".equals(rootElement.getTagName())) {
if (LOG.isWarnEnabled())
LOG.warn("Bad auth conf file: root element <"
+ rootElement.getTagName() + "> found in " + authFile
+ " - must be <auth-configuration>");
}
// For each set of credentials
NodeList credList = rootElement.getChildNodes();
for (int i = 0; i < credList.getLength(); i++) {
Node credNode = credList.item(i);
if (!(credNode instanceof Element))
continue;
Element credElement = (Element) credNode;
if (!"credentials".equals(credElement.getTagName())) {
if (LOG.isWarnEnabled())
LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
+ "> not recognized in " + authFile
+ " - expected <credentials>");
continue;
}
String authMethod = credElement.getAttribute("authMethod");
// read http form post auth info
if (StringUtils.isNotBlank(authMethod)) {
formConfigurer = readFormAuthConfigurer(credElement, authMethod);
continue;
}
String username = credElement.getAttribute("username");
String password = credElement.getAttribute("password");
// For each authentication scope
NodeList scopeList = credElement.getChildNodes();
for (int j = 0; j < scopeList.getLength(); j++) {
Node scopeNode = scopeList.item(j);
if (!(scopeNode instanceof Element))
continue;
Element scopeElement = (Element) scopeNode;
if ("default".equals(scopeElement.getTagName())) {
// Determine realm and scheme, if any
String realm = scopeElement.getAttribute("realm");
String scheme = scopeElement.getAttribute("scheme");
// Set default credentials
defaultUsername = username;
defaultPassword = password;
defaultRealm = realm;
defaultScheme = scheme;
if (LOG.isTraceEnabled()) {
LOG.trace(
"Credentials - username: " + username + "; set as default"
+ " for realm: " + realm + "; scheme: " + scheme);
}
} else if ("authscope".equals(scopeElement.getTagName())) {
// Determine authentication scope details
String host = scopeElement.getAttribute("host");
int port = -1; // For setting port to AuthScope.ANY_PORT
try {
port = Integer.parseInt(scopeElement.getAttribute("port"));
} catch (Exception ex) {
// do nothing, port is already set to any port
}
String realm = scopeElement.getAttribute("realm");
String scheme = scopeElement.getAttribute("scheme");
// Set credentials for the determined scope
AuthScope authScope = getAuthScope(host, port, realm, scheme);
NTCredentials credentials = new NTCredentials(username, password,
agentHost, realm);
client.getState().setCredentials(authScope, credentials);
if (LOG.isTraceEnabled()) {
LOG.trace("Credentials - username: " + username
+ "; set for AuthScope - " + "host: " + host + "; port: "
+ port + "; realm: " + realm + "; scheme: " + scheme);
}
} else {
if (LOG.isWarnEnabled())
LOG.warn("Bad auth conf file: Element <"
+ scopeElement.getTagName() + "> not recognized in "
+ authFile + " - expected <authscope>");
}
}
is.close();
}
}
}
/**
* <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
* loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field name
* ="username" value="user1"/> </loginPostData>
* <additionalPostHeaders> <field name="header1" value="vaule1"/>
* </additionalPostHeaders>
* <removedFormFields> <field name="header1"/> </removedFormFields> <!--
* NUTCH-2280: Add <loginCookie> and it sub-node <policy> nodes into the
* <credentials> node. The <policy> will mark the POST login form cookie
* policy. The value could be CookiePolicy.<ConstantValues>.
* --> </credentials> </auth-configuration>
*/
private static HttpFormAuthConfigurer readFormAuthConfigurer(
Element credElement, String authMethod) {
if ("formAuth".equals(authMethod)) {
HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
String str = credElement.getAttribute("loginUrl");
if (StringUtils.isNotBlank(str)) {
formConfigurer.setLoginUrl(str.trim());
} else {
throw new IllegalArgumentException("Must set loginUrl.");
}
str = credElement.getAttribute("loginFormId");
if (StringUtils.isNotBlank(str)) {
formConfigurer.setLoginFormId(str.trim());
} else {
throw new IllegalArgumentException("Must set loginFormId.");
}
str = credElement.getAttribute("loginRedirect");
if (StringUtils.isNotBlank(str)) {
formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
}
NodeList nodeList = credElement.getChildNodes();
for (int j = 0; j < nodeList.getLength(); j++) {
Node node = nodeList.item(j);
if (!(node instanceof Element))
continue;
Element element = (Element) node;
if ("loginPostData".equals(element.getTagName())) {
Map<String, String> loginPostData = new HashMap<String, String>();
NodeList childNodes = element.getChildNodes();
for (int k = 0; k < childNodes.getLength(); k++) {
Node fieldNode = childNodes.item(k);
if (!(fieldNode instanceof Element))
continue;
Element fieldElement = (Element) fieldNode;
String name = fieldElement.getAttribute("name");
String value = fieldElement.getAttribute("value");
loginPostData.put(name, value);
}
formConfigurer.setLoginPostData(loginPostData);
} else if ("additionalPostHeaders".equals(element.getTagName())) {
Map<String, String> additionalPostHeaders = new HashMap<String, String>();
NodeList childNodes = element.getChildNodes();
for (int k = 0; k < childNodes.getLength(); k++) {
Node fieldNode = childNodes.item(k);
if (!(fieldNode instanceof Element))
continue;
Element fieldElement = (Element) fieldNode;
String name = fieldElement.getAttribute("name");
String value = fieldElement.getAttribute("value");
additionalPostHeaders.put(name, value);
}
formConfigurer.setAdditionalPostHeaders(additionalPostHeaders);
} else if ("removedFormFields".equals(element.getTagName())) {
Set<String> removedFormFields = new HashSet<String>();
NodeList childNodes = element.getChildNodes();
for (int k = 0; k < childNodes.getLength(); k++) {
Node fieldNode = childNodes.item(k);
if (!(fieldNode instanceof Element))
continue;
Element fieldElement = (Element) fieldNode;
String name = fieldElement.getAttribute("name");
removedFormFields.add(name);
}
formConfigurer.setRemovedFormFields(removedFormFields);
} else if ("loginCookie".equals(element.getTagName())) {
// NUTCH-2280
LOG.debug("start loginCookie");
NodeList childNodes = element.getChildNodes();
for (int k = 0; k < childNodes.getLength(); k++) {
Node fieldNode = childNodes.item(k);
if (!(fieldNode instanceof Element))
continue;
Element fieldElement = (Element) fieldNode;
if ("policy".equals(fieldElement.getTagName())) {
String policy = fieldElement.getTextContent();
formConfigurer.setCookiePolicy(policy);
LOG.debug("cookie policy is " + policy);
}
}
}
}
return formConfigurer;
} else {
throw new IllegalArgumentException(
"Unsupported authMethod: " + authMethod);
}
}
/**
* If credentials for the authentication scope determined from the specified
* <code>url</code> is not already set in the HTTP client, then this method
* sets the default credentials to fetch the specified <code>url</code>. If
* credentials are found for the authentication scope, the method returns
* without altering the client.
*
* @param url
* URL to be fetched
*/
private void resolveCredentials(URL url) {
if (formConfigurer != null) {
HttpFormAuthentication formAuther = new HttpFormAuthentication(
formConfigurer, client, this);
try {
formAuther.login();
} catch (Exception e) {
throw new RuntimeException(e);
}
return;
}
if (defaultUsername != null && defaultUsername.length() > 0) {
int port = url.getPort();
if (port == -1) {
if ("https".equals(url.getProtocol()))
port = 443;
else
port = 80;
}
AuthScope scope = new AuthScope(url.getHost(), port);
if (client.getState().getCredentials(scope) != null) {
if (LOG.isTraceEnabled())
LOG.trace("Pre-configured credentials with scope - host: "
+ url.getHost() + "; port: " + port + "; found for url: " + url);
// Credentials are already configured, so do nothing and return
return;
}
if (LOG.isTraceEnabled())
LOG.trace(
"Pre-configured credentials with scope - host: " + url.getHost()
+ "; port: " + port + "; not found for url: " + url);
AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
defaultRealm, defaultScheme);
NTCredentials serverCredentials = new NTCredentials(defaultUsername,
defaultPassword, agentHost, defaultRealm);
client.getState().setCredentials(serverAuthScope, serverCredentials);
}
}
/**
* Returns an authentication scope for the specified <code>host</code>,
* <code>port</code>, <code>realm</code> and <code>scheme</code>.
*
* @param host
* Host name or address.
* @param port
* Port number.
* @param realm
* Authentication realm.
* @param scheme
* Authentication scheme.
*/
private static AuthScope getAuthScope(String host, int port, String realm,
String scheme) {
if (host.length() == 0)
host = null;
if (port < 0)
port = -1;
if (realm.length() == 0)
realm = null;
if (scheme.length() == 0)
scheme = null;
return new AuthScope(host, port, realm, scheme);
}
/**
* Returns an authentication scope for the specified <code>host</code>,
* <code>port</code> and <code>realm</code>.
*
* @param host
* Host name or address.
* @param port
* Port number.
* @param realm
* Authentication realm.
*/
private static AuthScope getAuthScope(String host, int port, String realm) {
return getAuthScope(host, port, realm, "");
}
}