| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.crawl; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.io.SequenceFile; |
| import org.apache.hadoop.io.SequenceFile.Reader.Option; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.mapreduce.Job; |
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
| import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; |
| import org.apache.nutch.util.NutchJob; |
| import org.junit.After; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| /** |
| * CrawlDbFiltering test which tests for correct, error free url normalization |
| * when the CrawlDB includes urls with <code>DB GONE</code> status and |
| * <code>CRAWLDB_PURGE_404</code> is set to true. |
| * |
| * @author lufeng |
| */ |
| public class TestCrawlDbFilter { |
| Configuration conf; |
| Path dbDir; |
| Path newCrawlDb; |
  static final Path testdir = new Path("build/test/crawldbfilter-test");
| FileSystem fs; |
| |
| @Before |
| public void setUp() throws Exception { |
| conf = CrawlDBTestUtil.createContext().getConfiguration(); |
| fs = FileSystem.get(conf); |
| fs.delete(testdir, true); |
| } |
| |
| @After |
| public void tearDown() { |
| delete(testdir); |
| } |
| |
| private void delete(Path p) { |
| try { |
| fs.delete(p, true); |
| } catch (IOException e) { |
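      // best-effort cleanup; ignore failures while deleting the test dir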
| } |
| } |
| |
| /** |
| * Test url404Purging |
| * |
| * @throws Exception |
| */ |
| @Test |
| public void testUrl404Purging() throws Exception { |
| // create a CrawlDatum with DB GONE status |
| ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); |
| list.add(new URLCrawlDatum(new Text("http://www.example.com"), |
| new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f))); |
| list.add(new URLCrawlDatum(new Text("http://www.example1.com"), |
| new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f))); |
| list.add(new URLCrawlDatum(new Text("http://www.example2.com"), |
| new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f))); |
| dbDir = new Path(testdir, "crawldb"); |
| newCrawlDb = new Path(testdir, "newcrawldb"); |
| // create crawldb |
| CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list); |
| // set CRAWLDB_PURGE_404 to true |
| conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true); |
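    // enable URL normalization but disable URL filtering so that only the
    // 404 purge and the normalizers influence the result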
| conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true); |
| conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); |
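    // let the normalizer chain run up to twice per URL (normalization is
    // repeated until the URL is stable or the loop count is reached)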
| conf.setInt("urlnormalizer.loop.count", 2); |
| Job job = NutchJob.getInstance(conf); |
| job.setJobName("Test CrawlDbFilter"); |
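    // the active CrawlDb data lives in the 'current' subdirectory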
| Path current = new Path(dbDir, "current"); |
    if (fs.exists(current)) {
| FileInputFormat.addInputPath(job, current); |
| } |
| job.setInputFormatClass(SequenceFileInputFormat.class); |
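    // CrawlDbFilter (the mapper) drops DB_GONE records when
    // CRAWLDB_PURGE_404 is set; CrawlDbReducer merges the surviving
    // CrawlDatum entries per URL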
| job.setMapperClass(CrawlDbFilter.class); |
| job.setReducerClass(CrawlDbReducer.class); |
| FileOutputFormat.setOutputPath(job, newCrawlDb); |
| job.setOutputFormatClass(MapFileOutputFormat.class); |
| job.setOutputKeyClass(Text.class); |
| job.setOutputValueClass(CrawlDatum.class); |
| job.setJarByClass(CrawlDbFilter.class); |
    Assert.assertTrue("CrawlDbFilter job failed", job.waitForCompletion(true));
| |
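    // MapFileOutputFormat writes one MapFile per reducer (part-r-00000);
    // the key/value records live in the MapFile's 'data' file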
| Path fetchlist = new Path(new Path(newCrawlDb, "part-r-00000"), "data"); |
| |
| ArrayList<URLCrawlDatum> l = readContents(fetchlist); |
| |
    // the DB_GONE record must have been purged, leaving only the fetched
    // and unfetched records
    Assert.assertEquals(2, l.size());
| } |
| |
| /** |
| * Read contents of fetchlist. |
| * |
| * @param fetchlist |
| * path to Generated fetchlist |
| * @return Generated {@link URLCrawlDatum} objects |
| * @throws IOException |
| */ |
| private ArrayList<URLCrawlDatum> readContents(Path fetchlist) |
| throws IOException { |
    // open the MapFile's data file as a plain SequenceFile
    Option fFile = SequenceFile.Reader.file(fetchlist);
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile);
| |
| ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>(); |
| |
    // allocate fresh key/value instances on every iteration because
    // URLCrawlDatum keeps references to the objects it is given
    while (true) {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      if (!reader.next(key, value)) {
        break;
      }
      l.add(new URLCrawlDatum(key, value));
    }
| |
| reader.close(); |
| return l; |
| } |
| } |