| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.protocol.http.api; |
| |
| import org.junit.Assert; |
| import org.junit.Test; |
| |
| import crawlercommons.robots.BaseRobotRules; |
| |
| /** |
| * JUnit test case which tests 1. that robots filtering is performed correctly |
| * as per the agent name 2. that crawl delay is extracted correctly from the |
| * robots file |
| * |
| */ |
| public class TestRobotRulesParser { |
| |
| private static final String CONTENT_TYPE = "text/plain"; |
| private static final String SINGLE_AGENT = "Agent1"; |
| private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; |
| private static final String UNKNOWN_AGENT = "AgentABC"; |
| private static final String CR = "\r"; |
| |
| private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR |
| + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" |
| + CR |
| + "Crawl-delay: 10" |
| + CR // set crawl delay for Agent1 as 10 sec |
| + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh" |
| + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20" |
| + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no |
| // crawl |
| // delay |
| // for |
| // other |
| // agents |
| |
| private static final String[] TEST_PATHS = new String[] { |
| "http://example.com/a", "http://example.com/a/bloh/foo.html", |
| "http://example.com/b", "http://example.com/c", |
| "http://example.com/b/a/index.html", |
| "http://example.com/foo/bar/baz.html" }; |
| |
| private static final boolean[] RESULTS = new boolean[] { false, // /a |
| false, // /a/bloh/foo.html |
| true, // /b |
| true, // /c |
| false, // /b/a/index.html |
| true // /foo/bar/baz.html |
| }; |
| |
| private HttpRobotRulesParser parser; |
| private BaseRobotRules rules; |
| |
| public TestRobotRulesParser() { |
| parser = new HttpRobotRulesParser(); |
| } |
| |
| /** |
| * Test that the robots rules are interpreted correctly by the robots rules |
| * parser. |
| */ |
| @Test |
| public void testRobotsAgent() { |
| rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), |
| CONTENT_TYPE, SINGLE_AGENT); |
| |
| for (int counter = 0; counter < TEST_PATHS.length; counter++) { |
| Assert.assertTrue( |
| "testing on agent (" + SINGLE_AGENT + "), and " + "path " |
| + TEST_PATHS[counter] + " got " |
| + rules.isAllowed(TEST_PATHS[counter]), |
| rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); |
| } |
| |
| rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), |
| CONTENT_TYPE, MULTIPLE_AGENTS); |
| |
| for (int counter = 0; counter < TEST_PATHS.length; counter++) { |
| Assert.assertTrue( |
| "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path " |
| + TEST_PATHS[counter] + " got " |
| + rules.isAllowed(TEST_PATHS[counter]), |
| rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); |
| } |
| } |
| |
| /** |
| * Test that the crawl delay is extracted from the robots file for respective |
| * agent. If its not specified for a given agent, default value must be |
| * returned. |
| */ |
| @Test |
| public void testCrawlDelay() { |
| // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be |
| // returned by the parser |
| rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), |
| CONTENT_TYPE, SINGLE_AGENT); |
| Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", |
| (rules.getCrawlDelay() == 10000)); |
| |
| // for UNKNOWN_AGENT, the default crawl delay must be returned. |
| rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), |
| CONTENT_TYPE, UNKNOWN_AGENT); |
| Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", |
| (rules.getCrawlDelay() == Long.MIN_VALUE)); |
| } |
| } |