/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.http.api;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.RobotRulesParser;
import crawlercommons.robots.BaseRobotRules;
/**
 * This class is used for parsing robots.txt files for URLs fetched via the
 * HTTP protocol. It extends the generic {@link RobotRulesParser} class and
 * contains the HTTP protocol-specific implementation for obtaining the
 * robots.txt file.
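 *
 * <p>
 * A minimal usage sketch (illustrative only, not taken from the Nutch
 * sources; {@code conf} is assumed to be an initialized Hadoop
 * {@link Configuration} and {@code httpProtocol} an HTTP {@link Protocol}
 * implementation, i.e. a subclass of {@link HttpBase}):
 * </p>
 *
 * <pre>{@code
 * HttpRobotRulesParser robotsParser = new HttpRobotRulesParser(conf);
 * URL url = new URL("https://example.org/some/page.html");
 * BaseRobotRules rules = robotsParser.getRobotRulesSet(httpProtocol, url, null);
 * if (rules.isAllowed(url.toString())) {
 *   // robots.txt permits fetching this URL
 * }
 * }</pre>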
*/
public class HttpRobotRulesParser extends RobotRulesParser {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
protected boolean allowForbidden = false;
HttpRobotRulesParser() {
}
public HttpRobotRulesParser(Configuration conf) {
setConf(conf);
}
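  /**
   * Set the {@link Configuration} and read the property
   * {@code http.robots.403.allow} (default: {@code true}). If the property is
   * {@code true}, a 403 (Forbidden) response for the robots.txt file is
   * treated as "no restrictions"; if it is {@code false}, such a response is
   * treated as "disallow all".
   */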
public void setConf(Configuration conf) {
super.setConf(conf);
allowForbidden = conf.getBoolean("http.robots.403.allow", true);
}
  /**
   * Compose a unique key to store and access robot rules in the cache for a
   * given URL.
   */
protected static String getCacheKey(URL url) {
    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
    String host = url.getHost().toLowerCase(); // normalize to lower case
int port = url.getPort();
if (port == -1) {
port = url.getDefaultPort();
}
    /*
     * Robot rules apply only to the host, protocol, and port where the
     * robots.txt file is hosted (cf. NUTCH-1752). Consequently, the cache key
     * is composed of exactly these three components.
     */
String cacheKey = protocol + ":" + host + ":" + port;
return cacheKey;
}
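  /*
   * Illustrative example (not part of the original class): for the URL
   * "https://Example.COM/some/page" the cache key is "https:example.com:443",
   * i.e. the lower-cased protocol and host joined with the (default) port:
   *
   *   String key = getCacheKey(new URL("https://Example.COM/some/page"));
   *   // key is "https:example.com:443"
   */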
  /**
   * Get the rules from robots.txt which apply to the given {@code url}.
   * Robot rules are cached for a unique combination of host, protocol, and
   * port. If no rules are found in the cache, an HTTP request is sent to fetch
   * {@code protocol://host:port/robots.txt}. The robots.txt file is then
   * parsed and the rules are cached to avoid re-fetching and re-parsing it.
*
* @param http
* The {@link Protocol} object
* @param url
* URL
* @param robotsTxtContent
* container to store responses when fetching the robots.txt file for
* debugging or archival purposes. Instead of a robots.txt file, it
* may include redirects or an error page (404, etc.). Response
* {@link Content} is appended to the passed list. If null is passed
* nothing is stored.
*
   * @return a {@link BaseRobotRules} object containing the parsed rules
*/
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
List<Content> robotsTxtContent) {
if (LOG.isTraceEnabled() && isWhiteListed(url)) {
LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
}
String cacheKey = getCacheKey(url);
BaseRobotRules robotRules = CACHE.get(cacheKey);
if (robotRules != null) {
return robotRules; // cached rule
    } else if (LOG.isTraceEnabled()) {
      LOG.trace("cache miss {}", url);
    }
boolean cacheRule = true;
URL redir = null;
if (isWhiteListed(url)) {
// check in advance whether a host is whitelisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Whitelisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
url.getHost());
} else {
try {
URL robotsUrl = new URL(url, "/robots.txt");
Response response = ((HttpBase) http).getResponse(robotsUrl,
new CrawlDatum(), true);
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, robotsUrl, response);
}
        // follow at most one level of redirection
if (response.getCode() == 301 || response.getCode() == 302) {
String redirection = response.getHeader("Location");
if (redirection == null) {
// some versions of MS IIS are known to mangle this header
redirection = response.getHeader("location");
}
if (redirection != null) {
if (!redirection.startsWith("http")) {
// RFC says it should be absolute, but apparently it isn't
redir = new URL(url, redirection);
} else {
redir = new URL(redirection);
}
response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
true);
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, redir, response);
}
}
}
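        /*
         * Interpret the HTTP status code of the (possibly redirected)
         * robots.txt response:
         *   200  -> parse the returned rules
         *   403  -> forbid all, unless http.robots.403.allow is true
         *   5xx  -> temporary failure: use empty rules and do not cache them
         *   else -> empty rules (robots.txt treated as absent)
         */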
if (response.getCode() == 200) // found rules: parse them
robotRules = parseRules(url.toString(), response.getContent(),
response.getHeader("Content-Type"), agentNames);
else if ((response.getCode() == 403) && (!allowForbidden))
robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
} else
robotRules = EMPTY_RULES; // use default rules
      } catch (Throwable t) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Couldn't get robots.txt for {}: {}", url, t.toString());
        }
cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
}
}
if (cacheRule) {
CACHE.put(cacheKey, robotRules); // cache rules for host
if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())
&& "/robots.txt".equals(redir.getFile())) {
        // also cache the rules for the redirected host
        // if the path of the redirect target is /robots.txt
CACHE.put(getCacheKey(redir), robotRules);
}
}
return robotRules;
}
/**
* Append {@link Content} of robots.txt to {@literal robotsTxtContent}
*
* @param robotsTxtContent
* container to store robots.txt response content
* @param robotsUrl
* robots.txt URL
* @param robotsResponse
* response object to be stored
*/
protected void addRobotsContent(List<Content> robotsTxtContent,
URL robotsUrl, Response robotsResponse) {
byte[] robotsBytes = robotsResponse.getContent();
if (robotsBytes == null)
robotsBytes = new byte[0];
Content content = new Content(robotsUrl.toString(),
robotsUrl.toString(), robotsBytes,
robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(),
getConf());
robotsTxtContent.add(content);
}
}