/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
 * Basic injector test:
 * <ol>
 * <li>Creates a text file with URLs</li>
 * <li>Injects them into the crawldb</li>
 * <li>Reads the crawldb entries and verifies their contents</li>
 * <li>Injects more URLs into the crawldb</li>
 * <li>Reads the crawldb entries and verifies their contents</li>
 * </ol>
 */
public class TestInjector {

  private Configuration conf;
  private FileSystem fs;
  static final Path testdir = new Path("build/test/inject-test");
  Path crawldbPath;
  Path urlPath;
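
  /**
   * Builds a fresh test configuration via {@link CrawlDBTestUtil} and removes
   * leftover seed and crawldb data from any previous run.
   */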
  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createContext().getConfiguration();
    urlPath = new Path(testdir, "urls");
    crawldbPath = new Path(testdir, "crawldb");
    fs = FileSystem.get(conf);
    if (fs.exists(urlPath)) {
      fs.delete(urlPath, false);
    }
    if (fs.exists(crawldbPath)) {
      fs.delete(crawldbPath, true);
    }
  }
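
  /** Removes the whole test directory after each test. */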
  @After
  public void tearDown() throws IOException {
    fs.delete(testdir, true);
  }
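
  /**
   * Injects 100 seed URLs with per-URL metadata, then re-injects an
   * overlapping seed list with "db.injector.update" enabled and checks that
   * the original metadata, score and fetch interval survive the update.
   * nutch.score and nutch.fetchInterval are special seed metadata keys
   * interpreted by the {@link Injector}.
   */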
  @Test
  public void testInject()
      throws IOException, ClassNotFoundException, InterruptedException {
    ArrayList<String> urls = new ArrayList<String>();
    // We'll use a separate list for the metadata so we can still compare the
    // urls with containsAll
    ArrayList<String> metadata = new ArrayList<String>();
    for (int i = 0; i < 100; i++) {
      urls.add("http://zzz.com/" + i + ".html");
      metadata.add("\tnutch.score=2." + i
          + "\tnutch.fetchInterval=171717\tkey=value");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);

    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);

    // verify results
    List<String> read = readCrawldb();
    Collections.sort(read);
    Collections.sort(urls);
    Assert.assertEquals(urls.size(), read.size());
    Assert.assertTrue(read.containsAll(urls));
    Assert.assertTrue(urls.containsAll(read));
    // inject more urls
    ArrayList<String> urls2 = new ArrayList<String>();
    for (int i = 0; i < 100; i++) {
      urls2.add("http://xxx.com/" + i + ".html");
      // We'll overwrite previously injected records but preserve their
      // original metadata
      urls2.add("http://zzz.com/" + i + ".html");
    }
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);

    // with db.injector.update set, existing crawldb entries are updated in
    // place rather than replaced, so their metadata is preserved
    conf.setBoolean("db.injector.update", true);
    injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    urls.addAll(urls2);
    // verify results
    read = readCrawldb();
    Collections.sort(read);
    Collections.sort(urls);

    // We expect 100 records less than urls.size() because the zzz.com seeds
    // were injected twice and the second pass overwrites the first
    Assert.assertEquals(urls.size() - 100, read.size());
    Assert.assertTrue(read.containsAll(urls));
    Assert.assertTrue(urls.containsAll(read));

    // Check whether the metadata of the http://zzz.com/ URLs survived the
    // update: fetch interval, score and the key=value pair
    Map<String, CrawlDatum> records = readCrawldbRecords();
    Text writableKey = new Text("key");
    Text writableValue = new Text("value");
    for (String url : urls) {
      if (url.startsWith("http://zzz")) {
        // Check for the fetch interval set through the seed metadata
        Assert.assertEquals(171717, records.get(url).getFetchInterval());
        // The score must differ from the default 1.0 because the seeds set
        // nutch.score explicitly
        Assert.assertTrue(records.get(url).getScore() != 1.0);
        // Check for the metadata key=value
        Assert.assertEquals(writableValue,
            records.get(url).getMetaData().get(writableKey));
      }
    }
  }
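
  /**
   * Reads all keys (URLs) from the single reduce output file of the current
   * crawldb.
   */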
  private List<String> readCrawldb() throws IOException {
    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
        + "/part-r-00000/data");
    System.out.println("reading:" + dbfile);
    Option rFile = SequenceFile.Reader.file(dbfile);
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
    ArrayList<String> read = new ArrayList<String>();
    try {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      while (reader.next(key, value)) {
        read.add(key.toString());
      }
    } finally {
      reader.close();
    }
    return read;
  }
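
  /**
   * Reads the complete {@link CrawlDatum} records from the current crawldb,
   * keyed by URL.
   */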
  private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
        + "/part-r-00000/data");
    System.out.println("reading:" + dbfile);
    Option rFile = SequenceFile.Reader.file(dbfile);
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
    HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
    try {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      while (reader.next(key, value)) {
        read.put(key.toString(), value);
        // the datum is stored in the map, so a fresh instance is needed
        // before the next call to reader.next()
        value = new CrawlDatum();
      }
    } finally {
      reader.close();
    }
    return read;
  }
}