| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.protocol.ftp; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.URL; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.protocol.Protocol; |
| import org.apache.nutch.protocol.ProtocolOutput; |
| import org.apache.nutch.protocol.ProtocolStatus; |
| import org.apache.nutch.protocol.RobotRulesParser; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import crawlercommons.robots.BaseRobotRules; |
| |
| /** |
 * This class is used for parsing robots files for URLs belonging to the FTP
 * protocol. It extends the generic {@link RobotRulesParser} class and contains
 * an FTP-specific implementation for obtaining the robots file.
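 * <p>
 * A minimal usage sketch (hypothetical, not taken from this codebase; it
 * assumes a plain Hadoop {@link Configuration} and a configured {@link Ftp}
 * protocol instance, and omits exception handling):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Ftp ftp = new Ftp();
 * ftp.setConf(conf);
 * FtpRobotRulesParser parser = new FtpRobotRulesParser(conf);
 * URL url = new URL("ftp://example.org/pub/data.txt");
 * BaseRobotRules rules = parser.getRobotRulesSet(ftp, url, null);
 * if (rules.isAllowed(url.toString())) {
 *   // the URL may be fetched according to the host's robots rules
 * }
 * }</pre>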
| */ |
| public class FtpRobotRulesParser extends RobotRulesParser { |
| |
| private static final String CONTENT_TYPE = "text/plain"; |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| FtpRobotRulesParser() { |
| } |
| |
| public FtpRobotRulesParser(Configuration conf) { |
| super(conf); |
| } |
| |
| /** |
   * For hosts whose robots rules are not yet cached, this method sends an FTP
   * request to the host corresponding to the {@link URL} passed, fetches the
   * robots file, parses the rules, and caches the rules object to avoid
   * re-fetching and re-parsing in the future.
| * |
   * @param ftp
   *          The {@link Protocol} object (must be an instance of {@link Ftp})
   * @param url
   *          URL to check for robots rules
| * @param robotsTxtContent |
| * container to store responses when fetching the robots.txt file for |
| * debugging or archival purposes. Instead of a robots.txt file, it |
| * may include redirects or an error page (404, etc.). Response |
| * {@link Content} is appended to the passed list. If null is passed |
| * nothing is stored. |
| * |
   * @return A {@link BaseRobotRules} object holding the rules for this host
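   * <p>
   * A hedged sketch of capturing the raw robots.txt response for debugging or
   * archival (the list name is illustrative; {@code parser}, {@code ftp} and
   * {@code url} are assumed from the surrounding context):
   *
   * <pre>{@code
   * List<Content> responses = new ArrayList<>();
   * BaseRobotRules rules = parser.getRobotRulesSet(ftp, url, responses);
   * // responses now also holds the fetched robots.txt response, or a
   * // redirect/error page if no robots.txt was found
   * }</pre>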
| */ |
| @Override |
| public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, |
| List<Content> robotsTxtContent) { |
| |
    // normalize protocol and host name to lower case
    String protocol = url.getProtocol().toLowerCase();
    String host = url.getHost().toLowerCase();
| |
| if (LOG.isTraceEnabled() && isWhiteListed(url)) { |
| LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); |
| } |
| |
| BaseRobotRules robotRules = CACHE.get(protocol + ":" + host); |
| |
| if (robotRules != null) { |
| return robotRules; // cached rule |
    } else if (LOG.isTraceEnabled()) {
      LOG.trace("cache miss {}", url);
    }
| |
    boolean cacheRule = true; // whether fetched rules should be cached
| |
| if (isWhiteListed(url)) { |
| // check in advance whether a host is whitelisted |
| // (we do not need to fetch robots.txt) |
| robotRules = EMPTY_RULES; |
| LOG.info("Whitelisted host found for: {}", url); |
| LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host); |
| |
| } else { |
| try { |
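        // resolve "/robots.txt" against the URL, i.e. address the robots
        // file at the root path of the FTP server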
| Text robotsUrl = new Text(new URL(url, "/robots.txt").toString()); |
| ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, |
| new CrawlDatum()); |
| ProtocolStatus status = output.getStatus(); |
| |
| if (robotsTxtContent != null) { |
| robotsTxtContent.add(output.getContent()); |
| } |
| |
| if (status.getCode() == ProtocolStatus.SUCCESS) { |
          robotRules = parseRules(url.toString(),
              output.getContent().getContent(), CONTENT_TYPE, agentNames);
| } else { |
| robotRules = EMPTY_RULES; // use default rules |
| } |
      } catch (Throwable t) {
        LOG.info("Couldn't get robots.txt for {}: {}", url, t.toString());
| cacheRule = false; // try again later to fetch robots.txt |
| robotRules = EMPTY_RULES; |
| } |
| |
| } |
| |
    if (cacheRule) {
      CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
    }
| |
| return robotRules; |
| } |
| |
| } |