/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Basic generator test:
 * 1. insert entries into the crawldb,
 * 2. generate entries to fetch,
 * 3. verify that the number of generated URLs matches,
 * 4. verify that the highest scoring URLs are generated.
 */
public class TestGenerator {

  Configuration conf;
  Path dbDir;
  Path segmentsDir;
  FileSystem fs;
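
  // scratch directory for all test data; wiped in setUp() and tearDown()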
  static final Path testdir = new Path("build/test/generator-test");

  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createContext().getConfiguration();
    fs = FileSystem.get(conf);
    fs.delete(testdir, true);
  }

  @After
  public void tearDown() {
    delete(testdir);
  }

  private void delete(Path p) {
    try {
      fs.delete(p, true);
    } catch (IOException e) {
      // best-effort cleanup: ignore failures
    }
  }

  /**
   * Test that the generator produces a fetchlist ordered by score
   * (descending).
   *
   * @throws Exception if the test fails
   */
  @Test
  public void testGenerateHighest() throws Exception {
    final int NUM_RESULTS = 2;
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
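    // seed the crawldb with 101 URLs ("http://aaa/000" to "http://aaa/100")
    // whose score equals their zero-padded index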
    for (int i = 0; i <= 100; i++) {
      list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i));
    }
    createCrawlDB(list);

    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
    Path fetchlist = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    ArrayList<URLCrawlDatum> l = readContents(fetchlist);

    // sort urls by score (descending)
    Collections.sort(l, new ScoreComparator());

    // verify we got the right number of records
    Assert.assertEquals(NUM_RESULTS, l.size());

    // verify we have the highest scoring urls
    Assert.assertEquals("http://aaa/100", l.get(0).url.toString());
    Assert.assertEquals("http://aaa/099", l.get(1).url.toString());
  }

  /** Left-pad {@code i} with zeros to three digits, e.g. 7 becomes "007". */
  private String pad(int i) {
    String s = Integer.toString(i);
    while (s.length() < 3) {
      s = "0" + s;
    }
    return s;
  }

  /**
   * Comparator that sorts {@link URLCrawlDatum} objects by score, descending.
   */
  public class ScoreComparator implements Comparator<URLCrawlDatum> {
    @Override
    public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
      // reverse the argument order so that higher scores sort first
      return Float.compare(tuple2.datum.getScore(), tuple1.datum.getScore());
    }
  }

  /**
   * Test that the generator obeys the property "generate.max.count", counting
   * URLs per host (the default count mode).
   *
   * @throws Exception if the test fails
   */
  @Test
  public void testGenerateHostLimit() throws Exception {
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
    list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1));
    createCrawlDB(list);
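
    // all three URLs live on the same host (www.example.com), so the size of
    // the fetchlist is capped by the per-host limit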
    int maxPerHost = 1;
    Configuration myConfiguration = new Configuration(conf);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
        myConfiguration, false);
    Path fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    int expectedFetchListSize = Math.min(maxPerHost, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by host",
        expectedFetchListSize, fetchList.size());

    maxPerHost = 2;
    myConfiguration = new Configuration(conf);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
        false);
    fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    expectedFetchListSize = Math.min(maxPerHost, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by host",
        expectedFetchListSize, fetchList.size());

    maxPerHost = 3;
    myConfiguration = new Configuration(conf);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
        false);
    fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    expectedFetchListSize = Math.min(maxPerHost, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by host",
        expectedFetchListSize, fetchList.size());
  }

  /**
   * Test that the generator obeys the properties "generate.max.count" and
   * "generate.count.mode".
   *
   * @throws Exception if the test fails
   */
  @Test
  public void testGenerateDomainLimit() throws Exception {
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
    list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1));
    createCrawlDB(list);
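
    // the three URLs are on distinct hosts but share the domain example.com,
    // so the limit applies across all of them in domain count mode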
    int maxPerDomain = 1;
    Configuration myConfiguration = new Configuration(conf);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
    myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
        Generator.GENERATOR_COUNT_VALUE_DOMAIN);
    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
        myConfiguration, false);
    Path fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    int expectedFetchListSize = Math.min(maxPerDomain, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by domain",
        expectedFetchListSize, fetchList.size());
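
    // the remaining runs copy the previous configuration so that the domain
    // count mode set above is retained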
    maxPerDomain = 2;
    myConfiguration = new Configuration(myConfiguration);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
        false);
    fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    expectedFetchListSize = Math.min(maxPerDomain, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by domain",
        expectedFetchListSize, fetchList.size());

    maxPerDomain = 3;
    myConfiguration = new Configuration(myConfiguration);
    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, maxPerDomain);
    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
        false);
    fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    fetchList = readContents(fetchlistPath);

    // verify we got the right number of records
    expectedFetchListSize = Math.min(maxPerDomain, list.size());
    Assert.assertEquals("Failed to apply generate.max.count by domain",
        expectedFetchListSize, fetchList.size());
  }

  /**
   * Test that the generator obeys the filter setting.
   *
   * @throws Exception if the test fails
   */
  @Test
  public void testFilter() throws Exception {
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
    list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
    list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
    createCrawlDB(list);

    Configuration myConfiguration = new Configuration(conf);
    myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
        myConfiguration, true);
    Assert.assertNull("should be null (0 entries)", generatedSegment);

    // with filtering disabled, all three URLs should be generated
    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
        false);
    Path fetchlistPath = new Path(
        new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME),
        "part-r-00000");
    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);

    // verify nothing got filtered
    Assert.assertEquals(list.size(), fetchList.size());
  }

  /**
   * Read the contents of a fetchlist.
   *
   * @param fetchlist
   *          path to the generated fetchlist
   * @return the generated {@link URLCrawlDatum} objects
   * @throws IOException if the fetchlist cannot be read
   */
  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
      throws IOException {
    Option rFile = SequenceFile.Reader.file(fetchlist);
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
    while (true) {
      // allocate fresh instances per record: URLCrawlDatum holds on to them
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      if (!reader.next(key, value)) {
        break;
      }
      l.add(new URLCrawlDatum(key, value));
    }
    reader.close();
    return l;
  }

  /**
   * Generate a fetchlist.
   *
   * @param numResults
   *          number of results to generate
   * @param config
   *          configuration to use
   * @param filter
   *          whether to apply URL filters
   * @return path to the generated segment, or null if nothing was generated
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  private Path generateFetchlist(int numResults, Configuration config,
      boolean filter) throws IOException, ClassNotFoundException,
      InterruptedException {
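    // -1 lets the Generator choose the number of fetch lists, and passing
    // Long.MAX_VALUE as the current time makes every crawldb entry due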
    Generator g = new Generator(config);
    Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
        Long.MAX_VALUE, filter, false);
    if (generatedSegment == null) {
      return null;
    }
    return generatedSegment[0];
  }

  /**
   * Create the crawldb with the given contents, plus an empty segments
   * directory.
   *
   * @param list
   *          database contents
   * @throws Exception if the crawldb cannot be created
   */
  private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws Exception {
    dbDir = new Path(testdir, "crawldb");
    segmentsDir = new Path(testdir, "segments");
    fs.mkdirs(dbDir);
    fs.mkdirs(segmentsDir);
    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
  }

  /**
   * Construct a new {@link URLCrawlDatum} from the submitted parameters.
   *
   * @param url
   *          URL to use
   * @param fetchInterval
   *          fetch interval, see {@link CrawlDatum#setFetchInterval(float)}
   * @param score
   *          score, see {@link CrawlDatum#setScore(float)}
   * @return the constructed object
   */
  private URLCrawlDatum createURLCrawlDatum(final String url,
      final int fetchInterval, final float score) {
    return new CrawlDBTestUtil.URLCrawlDatum(new Text(url),
        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
  }
}