/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.TreeSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import org.apache.nutch.util.NutchConfiguration;

import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
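
/**
 * JUnit test for {@link CrawlDbMerger}: builds two small CrawlDbs that share
 * one URL, merges them, and verifies the merged records.
 */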
public class TestCrawlDbMerger {
  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());
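  // url10 and url20 are intentionally identical, so the two CrawlDbs overlap
  // on exactly one URL.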
  String url10 = "http://example.com/";
  String url11 = "http://example.com/foo";
  String url20 = "http://example.com/";
  String url21 = "http://example.com/bar";
  String[] urls_expected = new String[] { url10, url11, url21 };
  TreeSet<String> init1 = new TreeSet<String>();
  TreeSet<String> init2 = new TreeSet<String>();
  HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
  CrawlDatum cd1, cd2, cd3;
  Configuration conf;
  FileSystem fs;
  Path testDir;
  CrawlDbReader reader;

  @Before
  public void setUp() throws Exception {
    init1.add(url10);
    init1.add(url11);
    init2.add(url20);
    init2.add(url21);
    long time = System.currentTimeMillis();
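    // cd1 and cd2 are the datum values stored in the two source CrawlDbs; cd2
    // is newer (fetch time + 10s). cd3 models the expected merge result for
    // the shared URL: cd2's fetch time and the combined metadata of cd1 and
    // cd2 (cd2 wins on duplicate keys).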
    cd1 = new CrawlDatum();
    cd1.setFetchInterval(1.0f);
    cd1.setFetchTime(time);
    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
    cd2 = new CrawlDatum();
    cd2.setFetchInterval(1.0f);
    cd2.setFetchTime(time + 10000);
    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
    cd3 = new CrawlDatum();
    cd3.setFetchInterval(1.0f);
    cd3.setFetchTime(time + 10000);
    cd3.getMetaData().putAll(cd1.getMetaData());
    cd3.getMetaData().putAll(cd2.getMetaData());
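    // Expected contents of the merged CrawlDb: the shared URL maps to cd3,
    // the URLs unique to one CrawlDb keep their original datum.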
    expected.put(url10, cd3);
    expected.put(url11, cd1);
    expected.put(url21, cd2);
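    // Run against a throwaway directory on the default filesystem.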
    conf = NutchConfiguration.create();
    fs = FileSystem.get(conf);
    testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
    fs.mkdirs(testDir);
  }

  @After
  public void tearDown() {
    // Best-effort cleanup: remove the test directory and close the reader,
    // ignoring any failures.
    try {
      if (fs.exists(testDir))
        fs.delete(testDir, true);
    } catch (Exception e) {
      // ignore
    }
    try {
      if (reader != null)
        reader.close();
    } catch (Exception e) {
      // ignore
    }
  }

  /**
   * Creates two sample {@link org.apache.nutch.crawl.CrawlDb}s, populating
   * each with {@link org.apache.hadoop.io.Text} keys (URLs) and
   * {@link org.apache.nutch.crawl.CrawlDatum} values (record data). It then
   * merges the two CrawlDbs with the {@link org.apache.nutch.crawl.CrawlDbMerger}
   * tool, writes the merged CrawlDb to an output location, and reads the
   * result back with the {@link org.apache.nutch.crawl.CrawlDbReader} tool.
   * The test asserts that each expected (URL, CrawlDatum) pair matches the
   * entry produced by the merge.
   * @throws Exception if creating, merging, or reading the CrawlDbs fails
   */
  @Test
  public void testMerge() throws Exception {
    Path crawldb1 = new Path(testDir, "crawldb1");
    Path crawldb2 = new Path(testDir, "crawldb2");
    Path output = new Path(testDir, "output");
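    // Build the two source CrawlDbs: crawldb1 holds init1 with cd1, crawldb2
    // holds init2 with cd2.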
    createCrawlDb(conf, fs, crawldb1, init1, cd1);
    createCrawlDb(conf, fs, crawldb2, init2, cd2);
    CrawlDbMerger merger = new CrawlDbMerger(conf);
    LOG.debug("* merging crawldbs to {}", output);
    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
    LOG.debug("* reading crawldb: {}", output);
    reader = new CrawlDbReader();
    String crawlDb = output.toString();
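    // Look up every expected URL in the merged CrawlDb and compare the
    // returned CrawlDatum with the expected one.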
    for (String url : expected.keySet()) {
      LOG.debug("url={}", url);
      CrawlDatum cd = expected.get(url);
      CrawlDatum res = reader.get(crawlDb, url, conf);
      LOG.debug(" -> {}", res);
      System.out.println("url=" + url);
      System.out.println(" cd " + cd);
      System.out.println(" res " + res);
      // every expected URL must be present in the merged CrawlDb
      Assert.assertNotNull(res);
      Assert.assertEquals(cd, res);
    }
    reader.close();
    fs.delete(testDir, true);
  }
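
  /**
   * Writes a minimal CrawlDb: a single MapFile named "part-r-00000" under the
   * db's {@link CrawlDb#CURRENT_NAME} directory, mapping every URL in init to
   * the same CrawlDatum.
   */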
  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
      TreeSet<String> init, CrawlDatum cd) throws Exception {
    LOG.debug("* creating crawldb: {}", crawldb);
    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer
        .valueClass(CrawlDatum.class);
    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
        "part-r-00000"), wKeyOpt, wValueOpt);
    // TreeSet iterates in sorted order, which MapFile.Writer requires
    for (String key : init) {
      writer.append(new Text(key), cd);
    }
    writer.close();
  }
}