| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.protocol; |
| |
| // JDK imports |
| import java.io.File; |
| import java.io.FileReader; |
| import java.io.LineNumberReader; |
| import java.net.URL; |
| import java.util.Hashtable; |
| import java.util.StringTokenizer; |
| |
| // Commons Logging imports |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| // Nutch imports |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.conf.Configurable; |
| import com.google.common.io.Files; |
| |
| import crawlercommons.robots.BaseRobotRules; |
| import crawlercommons.robots.SimpleRobotRules; |
| import crawlercommons.robots.SimpleRobotRules.RobotRulesMode; |
| import crawlercommons.robots.SimpleRobotRulesParser; |
| |
| /** |
| * This class uses crawler-commons for handling the parsing of |
| * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe |
| * the download permissions as described in SimpleRobotRulesParser. |
| */ |
| public abstract class RobotRulesParser implements Configurable { |
| |
| public static final Logger LOG = LoggerFactory |
| .getLogger(RobotRulesParser.class); |
| |
| protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>(); |
| |
| /** |
| * A {@link BaseRobotRules} object appropriate for use when the |
| * {@code robots.txt} file is empty or missing; all requests are allowed. |
| */ |
| public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules( |
| RobotRulesMode.ALLOW_ALL); |
| |
| /** |
| * A {@link BaseRobotRules} object appropriate for use when the |
| * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden} |
| * response; all requests are disallowed. |
| */ |
| public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules( |
| RobotRulesMode.ALLOW_NONE); |
| |
| private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); |
| private Configuration conf; |
| protected String agentNames; |
| |
| public RobotRulesParser() { |
| } |
| |
| public RobotRulesParser(Configuration conf) { |
| setConf(conf); |
| } |
| |
| /** |
| * Set the {@link Configuration} object |
| */ |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| |
| // Grab the agent names we advertise to robots files. |
| String agentName = conf.get("http.agent.name"); |
| if (agentName == null || (agentName = agentName.trim()).isEmpty()) { |
| throw new RuntimeException("Agent name not configured!"); |
| } |
| agentNames = agentName; |
| |
| // If there are any other agents specified, append those to the list of |
| // agents |
| String otherAgents = conf.get("http.robots.agents"); |
| if (otherAgents != null && !otherAgents.trim().isEmpty()) { |
| StringTokenizer tok = new StringTokenizer(otherAgents, ","); |
| StringBuilder sb = new StringBuilder(agentNames); |
| while (tok.hasMoreTokens()) { |
| String str = tok.nextToken().trim(); |
| if (str.equals("*") || str.equals(agentName)) { |
| // skip wildcard "*" or agent name itself |
| // (required for backward compatibility, cf. NUTCH-1715 and |
| // NUTCH-1718) |
| } else { |
| sb.append(",").append(str); |
| } |
| } |
| |
| agentNames = sb.toString(); |
| } |
| } |
| |
| /** |
| * Get the {@link Configuration} object |
| */ |
| public Configuration getConf() { |
| return conf; |
| } |
| |
| /** |
| * Parses the robots content using the {@link SimpleRobotRulesParser} from |
| * crawler commons |
| * |
| * @param url |
| * A string containing url |
| * @param content |
| * Contents of the robots file in a byte array |
| * @param contentType |
| * The content type of the robots file |
| * @param robotName |
| * A string containing all the robots agent names used by parser for |
| * matching |
| * @return BaseRobotRules object |
| */ |
| public BaseRobotRules parseRules(String url, byte[] content, |
| String contentType, String robotName) { |
| return robotParser.parseContent(url, content, contentType, robotName); |
| } |
| |
| public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) { |
| URL u = null; |
| try { |
| u = new URL(url); |
| } catch (Exception e) { |
| return EMPTY_RULES; |
| } |
| return getRobotRulesSet(protocol, u); |
| } |
| |
| public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url); |
| |
| /** command-line main for testing */ |
| public static void main(String[] argv) { |
| |
| if (argv.length != 3) { |
| System.err |
| .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n"); |
| System.err |
| .println(" <robots-file> - Input robots.txt file which will be parsed."); |
| System.err |
| .println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules."); |
| System.err |
| .println(" <agent-names> - Input agent names. Multiple agent names can be provided using"); |
| System.err |
| .println(" comma as a delimiter without any spaces."); |
| System.exit(-1); |
| } |
| |
| try { |
| byte[] robotsBytes = Files.toByteArray(new File(argv[0])); |
| BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, |
| "text/plain", argv[2]); |
| |
| LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1])); |
| String testPath = testsIn.readLine().trim(); |
| while (testPath != null) { |
| System.out.println((rules.isAllowed(testPath) ? "allowed" |
| : "not allowed") + ":\t" + testPath); |
| testPath = testsIn.readLine(); |
| } |
| testsIn.close(); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
| } |