/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.fetcher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.mortbay.jetty.Server;
/**
 * Basic fetcher test: 1. generate seed list 2. inject 3. generate 4. fetch
 * 5. verify contents
 */
public class TestFetcher {
final static Path testdir = new Path("build/test/fetch-test");
Configuration conf;
FileSystem fs;
Path crawldbPath;
Path segmentsPath;
Path urlPath;
Server server;
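  /**
   * Sets up a fresh test directory and starts a local Jetty server on the
   * port given by content.server.port (default 50000), serving the static
   * pages under build/test/data/fetch-test-site.
   */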
@Before
public void setUp() throws Exception {
conf = CrawlDBTestUtil.createContext().getConfiguration();
fs = FileSystem.get(conf);
fs.delete(testdir, true);
urlPath = new Path(testdir, "urls");
crawldbPath = new Path(testdir, "crawldb");
segmentsPath = new Path(testdir, "segments");
server = CrawlDBTestUtil.getServer(
conf.getInt("content.server.port", 50000),
"build/test/data/fetch-test-site");
server.start();
}
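  /**
   * Stops the test server, waiting up to five seconds for it to shut down,
   * and removes the test directory.
   */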
@After
public void tearDown() throws Exception {
server.stop();
for (int i = 0; i < 5; i++) {
if (!server.isStopped()) {
Thread.sleep(1000);
}
}
fs.delete(testdir, true);
}
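  /**
   * End-to-end fetch test: seed a list of local pages, inject them into the
   * crawldb, generate a segment, fetch it with parsing enabled, and verify
   * both the fetched content and the parse data written to the segment.
   */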
@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
// generate seedlist
ArrayList<String> urls = new ArrayList<String>();
addUrl(urls, "index.html");
addUrl(urls, "pagea.html");
addUrl(urls, "pageb.html");
addUrl(urls, "dup_of_pagea.html");
addUrl(urls, "nested_spider_trap.html");
addUrl(urls, "exception.html");
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
// inject
Injector injector = new Injector(conf);
injector.inject(crawldbPath, urlPath);
// generate
Generator g = new Generator(conf);
Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
Long.MAX_VALUE, Long.MAX_VALUE, false, false);
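    // the Generator returns the paths of the segments it created; the test
    // fetches the first one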
long time = System.currentTimeMillis();
// fetch
Fetcher fetcher = new Fetcher(conf);
    // enable parsing during fetch so parse data is written to the segment
conf.setBoolean("fetcher.parse", true);
fetcher.fetch(generatedSegment[0], 1);
time = System.currentTimeMillis() - time;
    // verify politeness: elapsed time should exceed (num_of_pages + 1) * fetcher.server.delay
int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
"fetcher.server.delay", 5));
Assert.assertTrue(time > minimumTime);
// verify content
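    // fetched content is written to the segment's Content.DIR_NAME
    // subdirectory as a SequenceFile of <Text url, Content> records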
Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
"part-r-00000/data");
@SuppressWarnings("resource")
SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
ArrayList<String> handledurls = new ArrayList<String>();
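    // collect every URL whose fetched body contains the test page marker text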
READ_CONTENT: do {
Text key = new Text();
Content value = new Content();
if (!reader.next(key, value))
break READ_CONTENT;
String contentString = new String(value.getContent());
if (contentString.indexOf("Nutch fetcher test page") != -1) {
handledurls.add(key.toString());
}
} while (true);
reader.close();
Collections.sort(urls);
Collections.sort(handledurls);
// verify that enough pages were handled
Assert.assertEquals(urls.size(), handledurls.size());
// verify that correct pages were handled
Assert.assertTrue(handledurls.containsAll(urls));
Assert.assertTrue(urls.containsAll(handledurls));
handledurls.clear();
// verify parse data
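    // every ParseData record is expected to carry the segment name and the
    // content signature in its content metadata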
Path parseData = new Path(
new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
READ_PARSE_DATA: do {
Text key = new Text();
ParseData value = new ParseData();
if (!reader.next(key, value))
break READ_PARSE_DATA;
// make sure they all contain "nutch.segment.name" and
// "nutch.content.digest"
// keys in parse metadata
Metadata contentMeta = value.getContentMeta();
if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
&& contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
handledurls.add(key.toString());
}
    } while (true);
    reader.close();
Collections.sort(handledurls);
Assert.assertEquals(urls.size(), handledurls.size());
Assert.assertTrue(handledurls.containsAll(urls));
Assert.assertTrue(urls.containsAll(handledurls));
}
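  /** Builds an absolute URL to a page served by the embedded test server. */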
private void addUrl(ArrayList<String> urls, String page) {
urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+ page);
}
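  /**
   * Fetcher must refuse to run when http.agent.name is empty; the test
   * expects the exact IllegalArgumentException message.
   */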
@Test
public void testAgentNameCheck() {
boolean failedNoAgentName = false;
conf.set("http.agent.name", "");
try {
conf.setBoolean("fetcher.parse", false);
Fetcher fetcher = new Fetcher(conf);
fetcher.fetch(null, 1);
} catch (IllegalArgumentException iae) {
String message = iae.getMessage();
failedNoAgentName = message.equals("Fetcher: No agents listed in "
+ "'http.agent.name' property.");
} catch (Exception e) {
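      // any other exception is swallowed here; failedNoAgentName stays
      // false and the assertion below fails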
}
Assert.assertTrue(failedNoAgentName);
}
}