// blob: 93bb51b22f870fe629be8b742eece9dd60b1e5fa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.http.api;
import java.nio.charset.StandardCharsets;

import org.junit.Assert;
import org.junit.Test;

import crawlercommons.robots.BaseRobotRules;
/**
 * JUnit test case for {@link HttpRobotRulesParser}. It verifies that
 * <ol>
 * <li>robots filtering is performed correctly as per the agent name, and</li>
 * <li>the crawl delay is extracted correctly from the robots file.</li>
 * </ol>
 * All tests parse the same in-memory robots.txt fixture
 * ({@code ROBOTS_STRING}).
 */
public class TestRobotRulesParser {

  private static final String CONTENT_TYPE = "text/plain";
  private static final String SINGLE_AGENT = "Agent1";
  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
  private static final String UNKNOWN_AGENT = "AgentABC";

  /** Line terminator used inside the fixture (a bare carriage return). */
  private static final String CR = "\r";

  /**
   * The robots.txt fixture: one rule group for Agent1 (crawl delay 10 sec),
   * one for Agent2 (crawl delay 20 sec) and a wildcard group without any
   * crawl delay.
   */
  private static final String ROBOTS_STRING =
      "User-Agent: Agent1 #foo" + CR
      + "Disallow: /a" + CR
      + "Disallow: /b/a" + CR
      + "#Disallow: /c" + CR
      + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
      + "" + CR
      + "" + CR
      + "User-Agent: Agent2" + CR
      + "Disallow: /a/bloh" + CR
      + "Disallow: /c" + CR
      + "Disallow: /foo" + CR
      + "Crawl-delay: 20" + CR
      + "" + CR
      + "User-Agent: *" + CR
      + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents

  /** URLs checked against the parsed rules; parallel to {@link #RESULTS}. */
  private static final String[] TEST_PATHS = new String[] {
      "http://example.com/a",
      "http://example.com/a/bloh/foo.html",
      "http://example.com/b",
      "http://example.com/c",
      "http://example.com/b/a/index.html",
      "http://example.com/foo/bar/baz.html" };

  /** Expected {@code isAllowed} outcome for each entry of {@link #TEST_PATHS}. */
  private static final boolean[] RESULTS = new boolean[] {
      false, // /a
      false, // /a/bloh/foo.html
      true, // /b
      true, // /c
      false, // /b/a/index.html
      true // /foo/bar/baz.html
  };

  private final HttpRobotRulesParser parser;

  public TestRobotRulesParser() {
    parser = new HttpRobotRulesParser();
  }

  /**
   * Test that the robots rules are interpreted correctly by the robots rules
   * parser, both for a single agent name and for a comma-separated list of
   * agent names. The same expected results apply to both cases.
   */
  @Test
  public void testRobotsAgent() {
    assertPathsFiltered("testRobotsAgent", SINGLE_AGENT);
    assertPathsFiltered("testRobotsAgent", MULTIPLE_AGENTS);
  }

  /**
   * Test that the crawl delay is extracted from the robots file for the
   * respective agent. If it is not specified for a given agent, the default
   * value must be returned.
   */
  @Test
  public void testCrawlDelay() {
    // for SINGLE_AGENT, the crawl delay of 10 sec i.e. 10000 msec must be
    // returned by the parser
    BaseRobotRules rules = parseFixture("testCrawlDelay", SINGLE_AGENT);
    Assert.assertEquals("testing crawl delay for agent " + SINGLE_AGENT + " : ",
        10000L, rules.getCrawlDelay());

    // for UNKNOWN_AGENT, the default crawl delay must be returned; this test
    // expects that default to be Long.MIN_VALUE.
    rules = parseFixture("testCrawlDelay", UNKNOWN_AGENT);
    Assert.assertEquals("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
        Long.MIN_VALUE, rules.getCrawlDelay());
  }

  /**
   * Parses the shared robots.txt fixture on behalf of the given agent name(s).
   * The fixture is encoded explicitly as UTF-8 so that the bytes handed to the
   * parser do not depend on the platform default charset.
   *
   * @param url
   *          identifier passed to the parser as the robots.txt location
   * @param agentNames
   *          agent name, or comma-separated agent names, to parse the rules
   *          for
   * @return the rules produced by the parser
   */
  private BaseRobotRules parseFixture(String url, String agentNames) {
    return parser.parseRules(url,
        ROBOTS_STRING.getBytes(StandardCharsets.UTF_8), CONTENT_TYPE,
        agentNames);
  }

  /**
   * Asserts that, for the given agent name(s), every URL in
   * {@link #TEST_PATHS} is allowed or disallowed as recorded in
   * {@link #RESULTS}.
   *
   * @param url
   *          identifier passed to the parser as the robots.txt location
   * @param agentNames
   *          agent name, or comma-separated agent names, under test
   */
  private void assertPathsFiltered(String url, String agentNames) {
    // Guard against the two parallel arrays drifting apart.
    Assert.assertEquals("TEST_PATHS and RESULTS must have the same length",
        TEST_PATHS.length, RESULTS.length);

    BaseRobotRules rules = parseFixture(url, agentNames);
    for (int i = 0; i < TEST_PATHS.length; i++) {
      String path = TEST_PATHS[i];
      boolean allowed = rules.isAllowed(path);
      Assert.assertEquals("testing on agent(s) (" + agentNames + "), and path "
          + path + " got " + allowed, RESULTS[i], allowed);
    }
  }
}