| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.crawl; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.io.SequenceFile; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.io.SequenceFile.Reader.Option; |
| import org.junit.After; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| /** |
| * Basic injector test: 1. Creates a text file with urls 2. Injects them into |
| * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls |
| * into webdb 5. Reads crawldb entries and verifies contents |
| * |
| */ |
| public class TestInjector { |
| |
| private Configuration conf; |
| private FileSystem fs; |
| final static Path testdir = new Path("build/test/inject-test"); |
| Path crawldbPath; |
| Path urlPath; |
| |
| @Before |
| public void setUp() throws Exception { |
| conf = CrawlDBTestUtil.createContext().getConfiguration(); |
| urlPath = new Path(testdir, "urls"); |
| crawldbPath = new Path(testdir, "crawldb"); |
| fs = FileSystem.get(conf); |
| if (fs.exists(urlPath)) |
| fs.delete(urlPath, false); |
| if (fs.exists(crawldbPath)) |
| fs.delete(crawldbPath, true); |
| } |
| |
| @After |
| public void tearDown() throws IOException { |
| fs.delete(testdir, true); |
| } |
| |
| @Test |
| public void testInject() |
| throws IOException, ClassNotFoundException, InterruptedException { |
| ArrayList<String> urls = new ArrayList<String>(); |
| // We'll use a separate list for MD so we can still compare url with |
| // containsAll |
| ArrayList<String> metadata = new ArrayList<String>(); |
| for (int i = 0; i < 100; i++) { |
| urls.add("http://zzz.com/" + i + ".html"); |
| metadata.add("\tnutch.score=2." + i |
| + "\tnutch.fetchInterval=171717\tkey=value"); |
| } |
| CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata); |
| |
| Injector injector = new Injector(conf); |
| injector.inject(crawldbPath, urlPath); |
| |
| // verify results |
| List<String> read = readCrawldb(); |
| |
| Collections.sort(read); |
| Collections.sort(urls); |
| |
| Assert.assertEquals(urls.size(), read.size()); |
| |
| Assert.assertTrue(read.containsAll(urls)); |
| Assert.assertTrue(urls.containsAll(read)); |
| |
| // inject more urls |
| ArrayList<String> urls2 = new ArrayList<String>(); |
| for (int i = 0; i < 100; i++) { |
| urls2.add("http://xxx.com/" + i + ".html"); |
| // We'll overwrite previously injected records but preserve their original |
| // MD |
| urls2.add("http://zzz.com/" + i + ".html"); |
| } |
| CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2); |
| injector = new Injector(conf); |
| conf.setBoolean("db.injector.update", true); |
| injector.inject(crawldbPath, urlPath); |
| urls.addAll(urls2); |
| |
| // verify results |
| read = readCrawldb(); |
| |
| Collections.sort(read); |
| Collections.sort(urls); |
| |
| // We should have 100 less records because we've overwritten |
| Assert.assertEquals(urls.size() - 100, read.size()); |
| |
| Assert.assertTrue(read.containsAll(urls)); |
| Assert.assertTrue(urls.containsAll(read)); |
| |
| // Check if we correctly preserved MD |
| Map<String, CrawlDatum> records = readCrawldbRecords(); |
| |
| // Iterate over the urls, we're looking for http://zzz.com/ prefixed URLs |
| // so we can check for MD and score and interval |
| Text writableKey = new Text("key"); |
| Text writableValue = new Text("value"); |
| for (String url : urls) { |
| if (url.indexOf("http://zzz") == 0) { |
| // Check for fetch interval |
| Assert.assertTrue(records.get(url).getFetchInterval() == 171717); |
| // Check for default score |
| Assert.assertTrue(records.get(url).getScore() != 1.0); |
| // Check for MD key=value |
| Assert.assertEquals(writableValue, |
| records.get(url).getMetaData().get(writableKey)); |
| } |
| } |
| } |
| |
| private List<String> readCrawldb() throws IOException { |
| Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME |
| + "/part-r-00000/data"); |
| System.out.println("reading:" + dbfile); |
| Option rFile = SequenceFile.Reader.file(dbfile); |
| @SuppressWarnings("resource") |
| SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); |
| ArrayList<String> read = new ArrayList<String>(); |
| |
| READ: do { |
| Text key = new Text(); |
| CrawlDatum value = new CrawlDatum(); |
| if (!reader.next(key, value)) |
| break READ; |
| read.add(key.toString()); |
| } while (true); |
| |
| return read; |
| } |
| |
| private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException { |
| Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME |
| + "/part-r-00000/data"); |
| System.out.println("reading:" + dbfile); |
| Option rFile = SequenceFile.Reader.file(dbfile); |
| @SuppressWarnings("resource") |
| SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); |
| HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>(); |
| |
| READ: do { |
| Text key = new Text(); |
| CrawlDatum value = new CrawlDatum(); |
| if (!reader.next(key, value)) |
| break READ; |
| read.put(key.toString(), value); |
| } while (true); |
| |
| return read; |
| } |
| } |