| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.protocol.http.api; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.URL; |
| import java.util.List; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.net.protocols.Response; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.protocol.Protocol; |
| import org.apache.nutch.protocol.RobotRulesParser; |
| |
| import crawlercommons.robots.BaseRobotRules; |
| |
| /** |
| * This class is used for parsing robots for urls belonging to HTTP protocol. It |
| * extends the generic {@link RobotRulesParser} class and contains Http protocol |
| * specific implementation for obtaining the robots file. |
| */ |
| public class HttpRobotRulesParser extends RobotRulesParser { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| protected boolean allowForbidden = false; |
| |
| HttpRobotRulesParser() { |
| } |
| |
| public HttpRobotRulesParser(Configuration conf) { |
| setConf(conf); |
| } |
| |
  @Override
  public void setConf(Configuration conf) {
| super.setConf(conf); |
| allowForbidden = conf.getBoolean("http.robots.403.allow", true); |
| } |
| |
  /**
   * Compose a unique key to store and access robot rules in the cache for the
   * given URL.
   * 
   * @param url
   *          the URL to get the cache key for
   * @return the cache key composed of protocol, host, and port
   */
| protected static String getCacheKey(URL url) { |
    // normalize protocol and host name to lower case
    String protocol = url.getProtocol().toLowerCase();
    String host = url.getHost().toLowerCase();
| int port = url.getPort(); |
| if (port == -1) { |
| port = url.getDefaultPort(); |
| } |
| /* |
| * Robot rules apply only to host, protocol, and port where robots.txt is |
| * hosted (cf. NUTCH-1752). Consequently |
| */ |
| String cacheKey = protocol + ":" + host + ":" + port; |
| return cacheKey; |
| } |
| |
| /** |
| * Get the rules from robots.txt which applies for the given {@code url}. |
| * Robot rules are cached for a unique combination of host, protocol, and |
| * port. If no rules are found in the cache, a HTTP request is send to fetch |
| * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the |
| * rules are cached to avoid re-fetching and re-parsing it again. |
| * |
| * @param http |
| * The {@link Protocol} object |
| * @param url |
| * URL |
| * @param robotsTxtContent |
| * container to store responses when fetching the robots.txt file for |
| * debugging or archival purposes. Instead of a robots.txt file, it |
| * may include redirects or an error page (404, etc.). Response |
| * {@link Content} is appended to the passed list. If null is passed |
| * nothing is stored. |
| * |
| * @return robotRules A {@link BaseRobotRules} object for the rules |
| */ |
| @Override |
| public BaseRobotRules getRobotRulesSet(Protocol http, URL url, |
| List<Content> robotsTxtContent) { |
| |
| if (LOG.isTraceEnabled() && isWhiteListed(url)) { |
| LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); |
| } |
| |
| String cacheKey = getCacheKey(url); |
| BaseRobotRules robotRules = CACHE.get(cacheKey); |
| |
| if (robotRules != null) { |
| return robotRules; // cached rule |
| } else if (LOG.isTraceEnabled()) { |
| LOG.trace("cache miss " + url); |
| } |
| |
| boolean cacheRule = true; |
| URL redir = null; |
| |
| if (isWhiteListed(url)) { |
| // check in advance whether a host is whitelisted |
| // (we do not need to fetch robots.txt) |
| robotRules = EMPTY_RULES; |
| LOG.info("Whitelisted host found for: {}", url); |
| LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", |
| url.getHost()); |
| |
| } else { |
| try { |
| URL robotsUrl = new URL(url, "/robots.txt"); |
| Response response = ((HttpBase) http).getResponse(robotsUrl, |
| new CrawlDatum(), true); |
| if (robotsTxtContent != null) { |
| addRobotsContent(robotsTxtContent, robotsUrl, response); |
| } |
        // follow at most one level of redirection
| if (response.getCode() == 301 || response.getCode() == 302) { |
| String redirection = response.getHeader("Location"); |
| if (redirection == null) { |
| // some versions of MS IIS are known to mangle this header |
| redirection = response.getHeader("location"); |
| } |
| if (redirection != null) { |
| if (!redirection.startsWith("http")) { |
| // RFC says it should be absolute, but apparently it isn't |
| redir = new URL(url, redirection); |
| } else { |
| redir = new URL(redirection); |
| } |
| |
| response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), |
| true); |
| if (robotsTxtContent != null) { |
| addRobotsContent(robotsTxtContent, redir, response); |
| } |
| } |
| } |
| |
        if (response.getCode() == 200) {
          // found rules: parse them
          robotRules = parseRules(url.toString(), response.getContent(),
              response.getHeader("Content-Type"), agentNames);
        } else if ((response.getCode() == 403) && (!allowForbidden)) {
          robotRules = FORBID_ALL_RULES; // use forbid all
        } else if (response.getCode() >= 500) {
          cacheRule = false; // try again later to fetch robots.txt
          robotRules = EMPTY_RULES;
        } else {
          robotRules = EMPTY_RULES; // use default rules
        }
| } catch (Throwable t) { |
        LOG.info("Couldn't get robots.txt for {}: {}", url, t.toString());
| cacheRule = false; // try again later to fetch robots.txt |
| robotRules = EMPTY_RULES; |
| } |
| } |
| |
| if (cacheRule) { |
| CACHE.put(cacheKey, robotRules); // cache rules for host |
| if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost()) |
| && "/robots.txt".equals(redir.getFile())) { |
| // cache also for the redirected host |
| // if the URL path is /robots.txt |
| CACHE.put(getCacheKey(redir), robotRules); |
| } |
| } |
| |
| return robotRules; |
| } |
| |
| /** |
| * Append {@link Content} of robots.txt to {@literal robotsTxtContent} |
| * |
| * @param robotsTxtContent |
| * container to store robots.txt response content |
| * @param robotsUrl |
| * robots.txt URL |
| * @param robotsResponse |
| * response object to be stored |
| */ |
| protected void addRobotsContent(List<Content> robotsTxtContent, |
| URL robotsUrl, Response robotsResponse) { |
| byte[] robotsBytes = robotsResponse.getContent(); |
    if (robotsBytes == null) {
      robotsBytes = new byte[0];
    }
| Content content = new Content(robotsUrl.toString(), |
| robotsUrl.toString(), robotsBytes, |
| robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), |
| getConf()); |
| robotsTxtContent.add(content); |
| } |
| |
| } |