/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.fetcher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.mortbay.jetty.Server;
/**
 * Basic fetcher test: 1. generate seed list 2. inject 3. generate 4. fetch
 * 5. verify contents
 */
public class TestFetcher {
final static Path testdir = new Path("build/test/fetch-test");
Configuration conf;
FileSystem fs;
Path crawldbPath;
Path segmentsPath;
Path urlPath;
Server server;
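  /**
   * Sets up a fresh test directory and starts a local Jetty server on the
   * port given by content.server.port (default 50000), serving the static
   * pages under build/test/data/fetch-test-site.
   */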
@Before
public void setUp() throws Exception {
conf = CrawlDBTestUtil.createContext().getConfiguration();
fs = FileSystem.get(conf);
fs.delete(testdir, true);
urlPath = new Path(testdir, "urls");
crawldbPath = new Path(testdir, "crawldb");
segmentsPath = new Path(testdir, "segments");
server = CrawlDBTestUtil.getServer(
conf.getInt("content.server.port", 50000),
"build/test/data/fetch-test-site");
server.start();
}
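  /**
   * Stops the test server, waiting up to five seconds for it to shut down,
   * and removes the test directory.
   */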
@After
public void tearDown() throws Exception {
server.stop();
for (int i = 0; i < 5; i++) {
if (!server.isStopped()) {
Thread.sleep(1000);
}
}
fs.delete(testdir, true);
}
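  /**
   * End-to-end fetch test: seed a list of local pages, inject them into the
   * crawldb, generate a segment, fetch it with parsing enabled, and verify
   * both the fetched content and the parse data written to the segment.
   */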
@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
// generate seedlist
ArrayList<String> urls = new ArrayList<String>();
addUrl(urls, "index.html");
addUrl(urls, "pagea.html");
addUrl(urls, "pageb.html");
addUrl(urls, "dup_of_pagea.html");
addUrl(urls, "nested_spider_trap.html");
addUrl(urls, "exception.html");
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
// inject
Injector injector = new Injector(conf);
injector.inject(crawldbPath, urlPath);
// generate
Generator g = new Generator(conf);
Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
Long.MAX_VALUE, Long.MAX_VALUE, false, false);
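    // the Generator returns the paths of the segments it created; the test
    // fetches the first one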
long time = System.currentTimeMillis();
// fetch
Fetcher fetcher = new Fetcher(conf);
    // enable parsing during fetch so parse data is written to the segment
conf.setBoolean("fetcher.parse", true);
fetcher.fetch(generatedSegment[0], 1);
time = System.currentTimeMillis() - time;
    // verify politeness: elapsed time should exceed (num_of_pages + 1) * fetcher.server.delay
int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
"fetcher.server.delay", 5));
Assert.assertTrue(time > minimumTime);
// verify content
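    // fetched content is written to the segment's Content.DIR_NAME
    // subdirectory as a SequenceFile of <Text url, Content> records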
Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
"part-r-00000/data");
@SuppressWarnings("resource")
SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
ArrayList<String> handledurls = new ArrayList<String>();
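    // collect every URL whose fetched body contains the test page marker text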
READ_CONTENT: do {
Text key = new Text();
Content value = new Content();
if (!reader.next(key, value))
break READ_CONTENT;
String contentString = new String(value.getContent());
if (contentString.indexOf("Nutch fetcher test page") != -1) {
handledurls.add(key.toString());
}
} while (true);
reader.close();
Collections.sort(urls);
Collections.sort(handledurls);
// verify that enough pages were handled
Assert.assertEquals(urls.size(), handledurls.size());
// verify that correct pages were handled
Assert.assertTrue(handledurls.containsAll(urls));
Assert.assertTrue(urls.containsAll(handledurls));
handledurls.clear();
// verify parse data
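    // every ParseData record is expected to carry the segment name and the
    // content signature in its content metadata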
Path parseData = new Path(
new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
READ_PARSE_DATA: do {
Text key = new Text();
ParseData value = new ParseData();
if (!reader.next(key, value))
break READ_PARSE_DATA;
// make sure they all contain "nutch.segment.name" and
// "nutch.content.digest"
// keys in parse metadata
Metadata contentMeta = value.getContentMeta();
if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
&& contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
handledurls.add(key.toString());
}
    } while (true);
    reader.close();
Collections.sort(handledurls);
Assert.assertEquals(urls.size(), handledurls.size());
Assert.assertTrue(handledurls.containsAll(urls));
Assert.assertTrue(urls.containsAll(handledurls));
}
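  /** Builds an absolute URL to a page served by the embedded test server. */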
private void addUrl(ArrayList<String> urls, String page) {
urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+ page);
}
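  /**
   * Fetcher must refuse to run when http.agent.name is empty; the test
   * expects the exact IllegalArgumentException message.
   */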
@Test
public void testAgentNameCheck() {
boolean failedNoAgentName = false;
conf.set("http.agent.name", "");
try {
conf.setBoolean("fetcher.parse", false);
Fetcher fetcher = new Fetcher(conf);
fetcher.fetch(null, 1);
} catch (IllegalArgumentException iae) {
String message = iae.getMessage();
failedNoAgentName = message.equals("Fetcher: No agents listed in "
+ "'http.agent.name' property.");
} catch (Exception e) {
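      // any other exception is swallowed here; failedNoAgentName stays
      // false and the assertion below fails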
}
Assert.assertTrue(failedNoAgentName);
}
}