/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * CrawlDbFilter test which checks for correct, error-free URL normalization
 * when the CrawlDb includes URLs with <code>STATUS_DB_GONE</code> status and
 * <code>CRAWLDB_PURGE_404</code> is set to true.
 *
 * @author lufeng
 */
public class TestCrawlDbFilter {

  Configuration conf;
  Path dbDir;
  Path newCrawlDb;
  final static Path testdir = new Path("build/test/crawldbfilter-test");
  FileSystem fs;

  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createContext().getConfiguration();
    fs = FileSystem.get(conf);
    fs.delete(testdir, true);
  }

  @After
  public void tearDown() {
    delete(testdir);
  }

  private void delete(Path p) {
    try {
      fs.delete(p, true);
    } catch (IOException e) {
      // best-effort cleanup; ignore failures while removing test data
    }
  }
  /**
   * Checks that 404 (<code>STATUS_DB_GONE</code>) entries are purged from the
   * CrawlDb when <code>CRAWLDB_PURGE_404</code> is enabled.
   *
   * @throws Exception
   */
  @Test
  public void testUrl404Purging() throws Exception {
    // create three CrawlDatum records, one of them with DB GONE status
    ArrayList<URLCrawlDatum> list = new ArrayList<>();
    list.add(new URLCrawlDatum(new Text("http://www.example.com"),
        new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example1.com"),
        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example2.com"),
        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
    dbDir = new Path(testdir, "crawldb");
    newCrawlDb = new Path(testdir, "newcrawldb");
    // create crawldb
    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
    // set CRAWLDB_PURGE_404 to true so GONE records are dropped
    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true);
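    // normalization on, filtering off: the test exercises the normalizer and
    // purge paths without URL filters interfering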
    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
    conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
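    // run the normalizer chain twice per URL so that chained transformations
    // are fully applied before the record is written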
    conf.setInt("urlnormalizer.loop.count", 2);
    Job job = NutchJob.getInstance(conf);
    job.setJobName("Test CrawlDbFilter");
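    // the CrawlDb keeps its data in a "current" subdirectory; feed it to the
    // job as input if it exists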
    Path current = new Path(dbDir, "current");
    if (FileSystem.get(conf).exists(current)) {
      FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);
    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setJarByClass(CrawlDbFilter.class);
    boolean success = job.waitForCompletion(true);
    Assert.assertTrue("CrawlDbFilter job did not complete successfully",
        success);
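    // MapFileOutputFormat writes one MapFile per reducer; the records live in
    // the "data" file inside the part directory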
    Path fetchlist = new Path(new Path(newCrawlDb, "part-r-00000"), "data");
    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
    // verify we got the right number of records: the GONE record is purged
    Assert.assertEquals(2, l.size());
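    // A stronger check (a sketch, assuming URLCrawlDatum exposes its datum
    // field as in CrawlDBTestUtil): none of the surviving records should
    // still carry the GONE status.
    for (URLCrawlDatum u : l) {
      Assert.assertTrue("purged GONE record still present",
          u.datum.getStatus() != CrawlDatum.STATUS_DB_GONE);
    }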
  }
  /**
   * Read the contents of a generated fetchlist.
   *
   * @param fetchlist
   *          path to the generated fetchlist
   * @return the {@link URLCrawlDatum} objects read from it
   * @throws IOException
   */
  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
      throws IOException {
    // read the results back from the SequenceFile
    Option fFile = SequenceFile.Reader.file(fetchlist);
    ArrayList<URLCrawlDatum> l = new ArrayList<>();
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile)) {
      while (true) {
        // fresh instances per record: URLCrawlDatum keeps references to them
        Text key = new Text();
        CrawlDatum value = new CrawlDatum();
        if (!reader.next(key, value)) {
          break;
        }
        l.add(new URLCrawlDatum(key, value));
      }
    }
    return l;
  }
}