/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.crawl;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
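
/**
 * Unit test for {@link LinkDbMerger}: builds two small LinkDbs whose key sets
 * overlap on a single URL, merges them, and verifies that the inlinks of the
 * overlapping entry are combined while all other entries pass through
 * unchanged.
 */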
public class TestLinkDbMerger {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
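
  // Fixture data for two input LinkDbs. url11 and url20 refer to the same URL
  // ("http://example.com/"), so its merged entry should contain the union of
  // urls11 and urls20; all other entries are expected to remain unchanged.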
String url10 = "http://example.com/foo";
String[] urls10 = new String[] { "http://example.com/100",
"http://example.com/101" };
String url11 = "http://example.com/";
String[] urls11 = new String[] { "http://example.com/110",
"http://example.com/111" };
String url20 = "http://example.com/";
String[] urls20 = new String[] { "http://foo.com/200", "http://foo.com/201" };
String url21 = "http://example.com/bar";
String[] urls21 = new String[] { "http://foo.com/210", "http://foo.com/211" };
String[] urls10_expected = urls10;
String[] urls11_expected = new String[] { urls11[0], urls11[1], urls20[0],
urls20[1] };
String[] urls20_expected = urls11_expected;
String[] urls21_expected = urls21;
TreeMap<String, String[]> init1 = new TreeMap<String, String[]>();
TreeMap<String, String[]> init2 = new TreeMap<String, String[]>();
HashMap<String, String[]> expected = new HashMap<String, String[]>();
Configuration conf;
Path testDir;
FileSystem fs;
  LinkDbReader reader;

@Before
public void setUp() throws Exception {
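    // Populate the two input LinkDbs and the expected post-merge contents.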
init1.put(url10, urls10);
init1.put(url11, urls11);
init2.put(url20, urls20);
init2.put(url21, urls21);
expected.put(url10, urls10_expected);
expected.put(url11, urls11_expected);
expected.put(url20, urls20_expected);
expected.put(url21, urls21_expected);
conf = NutchConfiguration.create();
fs = FileSystem.get(conf);
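    // Create a unique temporary directory for this test run.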
testDir = new Path("build/test/test-linkdb-"
+ new java.util.Random().nextInt());
fs.mkdirs(testDir);
}
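
  // Best-effort cleanup: remove the temporary directory and close the reader.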
@After
public void tearDown() {
try {
if (fs.exists(testDir))
fs.delete(testDir, true);
    } catch (Exception e) {
      // ignore: best-effort cleanup of the temporary test directory
    }
try {
reader.close();
    } catch (Exception e) {
      // ignore: reader may be null if the test failed before opening it
    }
}
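
  // Merge two LinkDbs and verify the merged output via LinkDbReader.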
@Test
public void testMerge() throws Exception {
    // conf, fs and testDir are already initialized in setUp()
Path linkdb1 = new Path(testDir, "linkdb1");
Path linkdb2 = new Path(testDir, "linkdb2");
Path output = new Path(testDir, "output");
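    // Build the two input LinkDbs from the fixture maps.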
createLinkDb(conf, fs, linkdb1, init1);
createLinkDb(conf, fs, linkdb2, init2);
LinkDbMerger merger = new LinkDbMerger(conf);
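    // Merge both LinkDbs into a single output (URL normalization and
    // filtering are disabled for this test).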
LOG.debug("* merging linkdbs to " + output);
merger.merge(output, new Path[] { linkdb1, linkdb2 }, false, false);
LOG.debug("* reading linkdb: " + output);
reader = new LinkDbReader(conf, output);
    for (String url : expected.keySet()) {
      LOG.debug("url={}", url);
      String[] vals = expected.get(url);
      Inlinks inlinks = reader.getInlinks(new Text(url));
      // every expected URL must be present in the merged LinkDb
      Assert.assertNotNull(inlinks);
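      // Collect the source URLs of all inlinks recorded for this URL.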
ArrayList<String> links = new ArrayList<String>();
Iterator<?> it2 = inlinks.iterator();
while (it2.hasNext()) {
Inlink in = (Inlink) it2.next();
links.add(in.getFromUrl());
}
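      // Every expected inlink source must be present in the merged LinkDb.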
      for (String fromUrl : vals) {
        LOG.debug(" -> {}", fromUrl);
        Assert.assertTrue(links.contains(fromUrl));
      }
}
reader.close();
fs.delete(testDir, true);
}
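
  /**
   * Writes a single-part LinkDb at the given location: a MapFile named
   * "part-00000" under {@link LinkDb#CURRENT_NAME}, keyed by URL, with an
   * {@link Inlinks} value built from the supplied map.
   */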
private void createLinkDb(Configuration config, FileSystem fs, Path linkdb,
TreeMap<String, String[]> init) throws Exception {
LOG.debug("* creating linkdb: " + linkdb);
Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
"part-00000"), wKeyOpt, wValueOpt);
    for (String key : init.keySet()) {
      Inlinks inlinks = new Inlinks();
      String[] vals = init.get(key);
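      // Each inlink records the URL both as the source URL and as the anchor.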
      for (String fromUrl : vals) {
        inlinks.add(new Inlink(fromUrl, fromUrl));
      }
writer.append(new Text(key), inlinks);
}
writer.close();
}
}