| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.crawl; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
| import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; |
| import org.junit.After; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| /** |
| * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to |
| * fetch 3. Verifies that number of generated urls match 4. Verifies that |
| * highest scoring urls are generated |
| * |
| */ |
| public class TestGenerator { |
| |
| Configuration conf; |
| |
| Path dbDir; |
| |
| Path segmentsDir; |
| |
| FileSystem fs; |
| |
  static final Path testdir = new Path("build/test/generator-test");
| |
| @Before |
| public void setUp() throws Exception { |
| conf = CrawlDBTestUtil.createContext().getConfiguration(); |
| fs = FileSystem.get(conf); |
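    // remove leftovers of earlier runs so each test starts from a clean state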
| fs.delete(testdir, true); |
| } |
| |
| @After |
| public void tearDown() { |
| delete(testdir); |
| } |
| |
  private void delete(Path p) {
    try {
      fs.delete(p, true);
    } catch (IOException e) {
      // best-effort cleanup; ignore failures
    }
  }
| |
| /** |
| * Test that generator generates fetchlish ordered by score (desc). |
| * |
| * @throws Exception |
| */ |
| @Test |
| public void testGenerateHighest() throws Exception { |
| |
| final int NUM_RESULTS = 2; |
| |
| ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); |
| |
    // seed the crawldb with 101 URLs whose score equals their index, making
    // the highest-scoring entries easy to predict
    for (int i = 0; i <= 100; i++) {
      list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i));
    }
| |
| createCrawlDB(list); |
| |
| Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false); |
| |
| Path fetchlist = new Path(new Path(generatedSegment, |
| CrawlDatum.GENERATE_DIR_NAME), "part-r-00000"); |
| |
| ArrayList<URLCrawlDatum> l = readContents(fetchlist); |
| |
| // sort urls by score desc |
| Collections.sort(l, new ScoreComparator()); |
| |
    // verify we got the right number of records
| Assert.assertEquals(NUM_RESULTS, l.size()); |
| |
| // verify we have the highest scoring urls |
| Assert.assertEquals("http://aaa/100", (l.get(0).url.toString())); |
| Assert.assertEquals("http://aaa/099", (l.get(1).url.toString())); |
| } |
| |
  private String pad(int i) {
    // zero-pad to three digits, e.g. 7 -> "007"
    return String.format("%03d", i);
  }
| |
| /** |
| * Comparator that sorts by score desc. |
| */ |
| public class ScoreComparator implements Comparator<URLCrawlDatum> { |
| |
| public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) { |
| if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) { |
| return -1; |
| } |
| if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) { |
| return 1; |
| } |
| return 0; |
| } |
| } |
| |
| /** |
| * Test that generator obeys the property "generate.max.count". |
| * |
| * @throws Exception |
| */ |
| @Test |
  public void testGenerateHostLimit() throws Exception {
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();

    list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1));

    createCrawlDB(list);

    // all three URLs live on one host, so the fetchlist size must equal
    // min(generate.max.count, number of crawldb entries) for every limit
    for (int maxPerHost = 1; maxPerHost <= 3; maxPerHost++) {
      Configuration myConfiguration = new Configuration(conf);
      myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
      Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
          myConfiguration, false);

      Path fetchlistPath = new Path(new Path(generatedSegment,
          CrawlDatum.GENERATE_DIR_NAME), "part-r-00000");

      ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);

      // verify we got the right number of records
      int expectedFetchListSize = Math.min(maxPerHost, list.size());
      Assert.assertEquals("Failed to apply generate.max.count by host",
          expectedFetchListSize, fetchList.size());
    }
  }
| |
| /** |
| * Test that generator obeys the property "generate.max.count" and |
| * "generate.count.mode". |
| * |
| * @throws Exception |
| */ |
| @Test |
  public void testGenerateDomainLimit() throws Exception {
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();

    list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1));

    createCrawlDB(list);

    // the three hosts share one domain, so counting per domain must cap the
    // fetchlist at min(generate.max.count, number of crawldb entries)
    for (int maxPerDomain = 1; maxPerDomain <= 3; maxPerDomain++) {
      Configuration myConfiguration = new Configuration(conf);
      myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
      myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
          Generator.GENERATOR_COUNT_VALUE_DOMAIN);

      Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
          myConfiguration, false);

      Path fetchlistPath = new Path(new Path(generatedSegment,
          CrawlDatum.GENERATE_DIR_NAME), "part-r-00000");

      ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);

      // verify we got the right number of records
      int expectedFetchListSize = Math.min(maxPerDomain, list.size());
      Assert.assertEquals("Failed to apply generate.max.count by domain",
          expectedFetchListSize, fetchList.size());
    }
  }
| |
| /** |
| * Test generator obeys the filter setting. |
| * |
| * @throws Exception |
| * @throws IOException |
| */ |
| @Test |
| public void testFilter() throws IOException, Exception { |
| |
| ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); |
| |
| list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1)); |
| list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1)); |
| list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1)); |
| |
| createCrawlDB(list); |
| |
| Configuration myConfiguration = new Configuration(conf); |
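    // point the suffix URL filter at a rule file that rejects every URL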
| myConfiguration.set("urlfilter.suffix.file", "filter-all.txt"); |
| |
    // with filtering enabled, every URL is rejected and no segment is created
    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
        myConfiguration, true);

    Assert.assertNull("should be null (0 entries)", generatedSegment);
| |
| generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, |
| false); |
| |
| Path fetchlistPath = new Path(new Path(generatedSegment, |
| CrawlDatum.GENERATE_DIR_NAME), "part-r-00000"); |
| |
| ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath); |
| |
    // filtering disabled: verify that all URLs made it into the fetchlist
| Assert.assertEquals(list.size(), fetchList.size()); |
| |
| } |
| |
| /** |
| * Read contents of fetchlist. |
| * |
| * @param fetchlist |
| * path to Generated fetchlist |
| * @return Generated {@link URLCrawlDatum} objects |
| * @throws IOException |
| */ |
| private ArrayList<URLCrawlDatum> readContents(Path fetchlist) |
| throws IOException { |
    // open the generated fetchlist as a SequenceFile
| Option rFile = SequenceFile.Reader.file(fetchlist); |
| SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); |
| |
| ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); |
| |
    // allocate fresh Writable instances for every record: URLCrawlDatum keeps
    // references to them, so reusing the same objects would overwrite
    // previously collected entries
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    while (reader.next(key, value)) {
      l.add(new URLCrawlDatum(key, value));
      key = new Text();
      value = new CrawlDatum();
    }
| |
| reader.close(); |
| return l; |
| } |
| |
| /** |
| * Generate Fetchlist. |
| * |
| * @param numResults |
| * number of results to generate |
| * @param config |
| * Configuration to use |
| * @return path to generated segment |
| * @throws IOException |
| * @throws InterruptedException |
| * @throws ClassNotFoundException |
| */ |
| private Path generateFetchlist(int numResults, Configuration config, |
      boolean filter) throws IOException, ClassNotFoundException,
      InterruptedException {
| // generate segment |
| Generator g = new Generator(config); |
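    // numLists = -1 (use the default number of fetch lists); passing
    // Long.MAX_VALUE as curTime makes every crawldb entry due for fetching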
| Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults, |
| Long.MAX_VALUE, filter, false); |
| if (generatedSegment == null) |
| return null; |
| return generatedSegment[0]; |
| } |
| |
| /** |
| * Creates CrawlDB. |
| * |
| * @param list |
| * database contents |
   * @throws Exception
   */
  private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws Exception {
| dbDir = new Path(testdir, "crawldb"); |
| segmentsDir = new Path(testdir, "segments"); |
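    // lay out the crawldb and segments directories used by the generator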
| fs.mkdirs(dbDir); |
| fs.mkdirs(segmentsDir); |
| |
| // create crawldb |
| CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list); |
| } |
| |
| /** |
   * Constructs a new {@link URLCrawlDatum} from the submitted parameters.
| * |
| * @param url |
| * url to use |
| * @param fetchInterval |
| * {@link CrawlDatum#setFetchInterval(float)} |
| * @param score |
| * {@link CrawlDatum#setScore(float)} |
| * @return Constructed object |
| */ |
| private URLCrawlDatum createURLCrawlDatum(final String url, |
| final int fetchInterval, final float score) { |
| return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum( |
| CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score)); |
| } |
| } |