/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.TreeSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import org.apache.nutch.util.NutchConfiguration;

import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
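
/**
 * JUnit test for {@link CrawlDbMerger}: builds two small CrawlDbs that share
 * one URL, merges them, and verifies the merged records.
 */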
public class TestCrawlDbMerger {
  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());
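  // url10 and url20 are intentionally identical, so the two CrawlDbs overlap
  // on exactly one URL.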
  String url10 = "http://example.com/";
  String url11 = "http://example.com/foo";
  String url20 = "http://example.com/";
  String url21 = "http://example.com/bar";
  String[] urls_expected = new String[] { url10, url11, url21 };
  TreeSet<String> init1 = new TreeSet<String>();
  TreeSet<String> init2 = new TreeSet<String>();
  HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
  CrawlDatum cd1, cd2, cd3;
  Configuration conf;
  FileSystem fs;
  Path testDir;
  CrawlDbReader reader;

  @Before
  public void setUp() throws Exception {
    init1.add(url10);
    init1.add(url11);
    init2.add(url20);
    init2.add(url21);
    long time = System.currentTimeMillis();
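    // cd1 and cd2 are the datum values stored in the two source CrawlDbs; cd2
    // is newer (fetch time + 10s). cd3 models the expected merge result for
    // the shared URL: cd2's fetch time and the combined metadata of cd1 and
    // cd2 (cd2 wins on duplicate keys).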
    cd1 = new CrawlDatum();
    cd1.setFetchInterval(1.0f);
    cd1.setFetchTime(time);
    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
    cd2 = new CrawlDatum();
    cd2.setFetchInterval(1.0f);
    cd2.setFetchTime(time + 10000);
    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
    cd3 = new CrawlDatum();
    cd3.setFetchInterval(1.0f);
    cd3.setFetchTime(time + 10000);
    cd3.getMetaData().putAll(cd1.getMetaData());
    cd3.getMetaData().putAll(cd2.getMetaData());
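    // Expected contents of the merged CrawlDb: the shared URL maps to cd3,
    // the URLs unique to one CrawlDb keep their original datum.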
    expected.put(url10, cd3);
    expected.put(url11, cd1);
    expected.put(url21, cd2);
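    // Run against a throwaway directory on the default filesystem.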
    conf = NutchConfiguration.create();
    fs = FileSystem.get(conf);
    testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
    fs.mkdirs(testDir);
  }

  @After
  public void tearDown() {
    // Best-effort cleanup: remove the test directory and close the reader,
    // ignoring any failures.
    try {
      if (fs.exists(testDir))
        fs.delete(testDir, true);
    } catch (Exception e) {
      // ignore
    }
    try {
      if (reader != null)
        reader.close();
    } catch (Exception e) {
      // ignore
    }
  }

  /**
   * Creates two sample {@link org.apache.nutch.crawl.CrawlDb}s, populating
   * each with {@link org.apache.hadoop.io.Text} keys (URLs) and
   * {@link org.apache.nutch.crawl.CrawlDatum} values (record data). It then
   * merges the two CrawlDbs with the {@link org.apache.nutch.crawl.CrawlDbMerger}
   * tool, writes the merged CrawlDb to an output location, and reads the
   * result back with the {@link org.apache.nutch.crawl.CrawlDbReader} tool.
   * The test asserts that each expected (URL, CrawlDatum) pair matches the
   * entry produced by the merge.
   * @throws Exception if creating, merging, or reading the CrawlDbs fails
   */
  @Test
  public void testMerge() throws Exception {
    Path crawldb1 = new Path(testDir, "crawldb1");
    Path crawldb2 = new Path(testDir, "crawldb2");
    Path output = new Path(testDir, "output");
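    // Build the two source CrawlDbs: crawldb1 holds init1 with cd1, crawldb2
    // holds init2 with cd2.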
    createCrawlDb(conf, fs, crawldb1, init1, cd1);
    createCrawlDb(conf, fs, crawldb2, init2, cd2);
    CrawlDbMerger merger = new CrawlDbMerger(conf);
    LOG.debug("* merging crawldbs to {}", output);
    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
    LOG.debug("* reading crawldb: {}", output);
    reader = new CrawlDbReader();
    String crawlDb = output.toString();
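    // Look up every expected URL in the merged CrawlDb and compare the
    // returned CrawlDatum with the expected one.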
    for (String url : expected.keySet()) {
      LOG.debug("url={}", url);
      CrawlDatum cd = expected.get(url);
      CrawlDatum res = reader.get(crawlDb, url, conf);
      LOG.debug(" -> {}", res);
      System.out.println("url=" + url);
      System.out.println(" cd " + cd);
      System.out.println(" res " + res);
      // every expected URL must be present in the merged CrawlDb
      Assert.assertNotNull(res);
      Assert.assertEquals(cd, res);
    }
    reader.close();
    fs.delete(testDir, true);
  }
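
  /**
   * Writes a minimal CrawlDb: a single MapFile named "part-r-00000" under the
   * db's {@link CrawlDb#CURRENT_NAME} directory, mapping every URL in init to
   * the same CrawlDatum.
   */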
  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
      TreeSet<String> init, CrawlDatum cd) throws Exception {
    LOG.debug("* creating crawldb: {}", crawldb);
    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer
        .valueClass(CrawlDatum.class);
    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
        "part-r-00000"), wKeyOpt, wValueOpt);
    // TreeSet iterates in sorted order, which MapFile.Writer requires
    for (String key : init) {
      writer.append(new Text(key), cd);
    }
    writer.close();
  }
}