/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Utility to test transitions of {@link CrawlDatum} states during an update of
 * {@link CrawlDb} (command {@literal updatedb}): call the {@code reduce()}
 * method of a CrawlDb update reducer such as {@link CrawlDbReducer} via
 * MRUnit's {@code ReduceDriver}, passing the old CrawlDatum (db status) and
 * the new one (fetch status).
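 *
 * <p>
 * A minimal usage sketch, assuming {@code CrawlDbReducer} as the reducer under
 * test and a pre-existing Hadoop {@code Configuration} named {@code conf}; the
 * fetch interval of 3600 seconds and the input statuses are example choices,
 * not fixed by this utility:
 * </p>
 *
 * <pre>{@code
 * CrawlDbUpdateTestDriver<CrawlDbReducer> driver =
 *     new CrawlDbUpdateTestDriver<CrawlDbReducer>(new CrawlDbReducer(), conf);
 * CrawlDatum dbDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 3600);
 * CrawlDatum fetchDatum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 3600);
 * List<CrawlDatum> result = driver.update(dbDatum, fetchDatum);
 * // for this input one CrawlDatum with status db_fetched is expected
 * }</pre>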
 */
public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver;
  private T reducer;
  private Configuration configuration;

  public static final Text dummyURL = new Text("http://nutch.apache.org/");
  /**
   * @param updateReducer
   *          reducer instance under test
   * @param conf
   *          configuration passed to the MRUnit driver
   */
  protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) {
    reducer = updateReducer;
    configuration = conf;
  }
  /**
   * Run the reducer via MRUnit's {@code ReduceDriver} and return the
   * CrawlDatum(s) which would have been written into CrawlDb.
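   *
   * <p>
   * A minimal sketch, assuming a {@code driver} instance constructed as in the
   * class-level example; the statuses and the fetch interval are example
   * choices:
   * </p>
   *
   * <pre>{@code
   * List<CrawlDatum> input = new ArrayList<>();
   * input.add(new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 3600));
   * input.add(new CrawlDatum(CrawlDatum.STATUS_LINKED, 3600));
   * List<CrawlDatum> output = driver.update(input);
   * }</pre>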
   *
   * @param values
   *          list of input CrawlDatums
   * @return list of resulting CrawlDatum(s) in CrawlDb
   */
  public List<CrawlDatum> update(List<CrawlDatum> values) {
    List<CrawlDatum> result = new ArrayList<>();
    if (values == null || values.isEmpty()) {
      return result;
    }
    // shuffle a copy of the input: the order of values must have no influence
    // on the result, and the caller's list is left unmodified
    values = new ArrayList<>(values);
    Collections.shuffle(values);
    reduceDriver = ReduceDriver.newReduceDriver(reducer);
    reduceDriver.getConfiguration().addResource(configuration);
    reduceDriver.withInput(dummyURL, values);
    List<Pair<Text, CrawlDatum>> reduceResult;
    try {
      reduceResult = reduceDriver.run();
      for (Pair<Text, CrawlDatum> p : reduceResult) {
        if (p.getFirst().equals(dummyURL)) {
          result.add(p.getSecond());
        }
      }
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    return result;
  }
  /**
   * Run the reducer via MRUnit's {@code ReduceDriver} and return the
   * CrawlDatum(s) which would have been written into CrawlDb.
   *
   * @param dbDatum
   *          previous CrawlDatum in CrawlDb
   * @param fetchDatum
   *          CrawlDatum resulting from fetching
   * @return list of resulting CrawlDatum(s) in CrawlDb
   */
  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
    List<CrawlDatum> values = new ArrayList<>();
    if (dbDatum != null) {
      values.add(dbDatum);
    }
    if (fetchDatum != null) {
      values.add(fetchDatum);
    }
    return update(values);
  }
  /**
   * See {@link #update(List)}.
   *
   * @param values
   *          input CrawlDatums
   * @return list of resulting CrawlDatum(s) in CrawlDb
   */
  public List<CrawlDatum> update(CrawlDatum... values) {
    return update(Arrays.asList(values));
  }
}