src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.crawl;

 import java.lang.invoke.MethodHandles;
 import java.net.URI;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration.IntegerRanges;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.Counters;
 import org.apache.hadoop.mapred.Counters.Counter;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.JobID;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.OutputCommitter;
 import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.Partitioner;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.util.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Utility to test transitions of {@link CrawlDatum} states during an update of
  * {@link CrawlDb} (command {@literal updatedb}): call
  * {@link CrawlDbReducer#reduce(Text, Iterable, Reducer.Context)} with
  * the old CrawlDatum (db status) and the new one (fetch status).
  */
 public class CrawlDbUpdateUtil <T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {

   /** Logger named after the class returned by the method-handle lookup. */
   private static final Logger LOG =
       LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

   /** Reducer under test; all {@code update} calls are delegated to it. */
   private CrawlDbReducer reducer;

   /** Key passed to the reducer for every update. */
   public static Text dummyURL = new Text("http://nutch.apache.org/");

   /**
    * Keep a reference to the reducer and run its {@code setup} step.
    *
    * @param red
    *          reducer whose state transitions are to be tested
    * @param context
    *          context handed to the reducer's {@code setup}
    * @throws IOException
    *           if the reducer's {@code setup} fails
    */
   protected CrawlDbUpdateUtil(CrawlDbReducer red,
       Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context)
       throws IOException {
     this.reducer = red;
     red.setup(context);
   }

   /**
    * Reducer context which collects every value passed to
    * {@link #write(Text, CrawlDatum)} in a {@link List}. It is not backed by a
    * job: input iteration reports "no more input", counters are folded into a
    * single dummy counter, and the job/task accessors return {@code null},
    * {@code false} or {@code 0}. A few accessors throw
    * {@link UnsupportedOperationException} instead.
    */
   private class DummyContext extends Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context {

     /** Values written by the reducer, in the order they were written. */
     private final List<CrawlDatum> values = new ArrayList<CrawlDatum>();

     /** Holds the single counter returned by both {@code getCounter} methods. */
     private final Counters dummyCounters = new Counters();

     private DummyContext() {
       // Context is an inner class of Reducer: bind it to the reducer under test
       reducer.super();
     }

     /** Collect the value; the key is ignored. */
     @Override
     public void write(Text key, CrawlDatum value) throws IOException, InterruptedException {
       values.add(value);
     }

     /** @return all values collected by {@link #write(Text, CrawlDatum)} so far */
     @Override
     public List<CrawlDatum> getValues() {
       return values;
     }

     /** Not supported: this context has no current input value. */
     @Override
     public CrawlDatum getCurrentValue() throws UnsupportedOperationException {
       throw new UnsupportedOperationException("Dummy context");
     }

     /** Not supported: this context has no current input key. */
     @Override
     public Text getCurrentKey() throws UnsupportedOperationException {
       throw new UnsupportedOperationException("Dummy context with no keys");
     }

     /** Does nothing. */
     @Override
     public void progress() {
     }

     /** @return the shared dummy counter, whatever the argument */
     @Override
     public Counter getCounter(Enum<?> arg0) {
       return dummyCounters.getGroup("dummy").getCounterForName("dummy");
     }

     /** @return the shared dummy counter, whatever the arguments */
     @Override
     public Counter getCounter(String arg0, String arg1) {
       return dummyCounters.getGroup("dummy").getCounterForName("dummy");
     }

     /** Not supported: this context keeps no status. */
     @Override
     public void setStatus(String arg0) throws UnsupportedOperationException {
       throw new UnsupportedOperationException("Dummy context with no status");
     }

     /** Not supported: this context keeps no status. */
     @Override
     public String getStatus() throws UnsupportedOperationException {
       throw new UnsupportedOperationException("Dummy context with no status");
     }

     /** @return always {@code 1f} */
     @Override
     public float getProgress() {
       return 1f;
     }

     /** Not supported: this context has no output committer. */
     @Override
     public OutputCommitter getOutputCommitter() {
       throw new UnsupportedOperationException("Dummy context without committer");
     }

     /** @return always {@code false} */
     @Override
     public boolean nextKey(){
       return false;
     }

     /** @return always {@code false} */
     @Override
     public boolean nextKeyValue(){
       return false;
     }

     /** Not supported: this context has no task attempt. */
     @Override
     public TaskAttemptID getTaskAttemptID() throws UnsupportedOperationException {
       throw new UnsupportedOperationException("Dummy context without TaskAttemptID");
     }

     // Job and task accessors below are stubs returning null, false or 0.

     @Override
     public Path[] getArchiveClassPaths() {
       return null;
     }

     @Override
     public String[] getArchiveTimestamps() {
       return null;
     }

     @Override
     public URI[] getCacheArchives() throws IOException {
       return null;
     }

     @Override
     public URI[] getCacheFiles() throws IOException {
       return null;
     }

     @Override
     public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public RawComparator<?> getCombinerKeyGroupingComparator() {
       return null;
     }

     @Override
     public Configuration getConfiguration() {
       return null;
     }

     @Override
     public Credentials getCredentials() {
       return null;
     }

     @Override
     public Path[] getFileClassPaths() {
       return null;
     }

     @Override
     public String[] getFileTimestamps() {
       return null;
     }

     @Override
     public RawComparator<?> getGroupingComparator() {
       return null;
     }

     @Override
     public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public String getJar() {
       return null;
     }

     @Override
     public JobID getJobID() {
       return null;
     }

     @Override
     public String getJobName() {
       return null;
     }

     @Override
     public boolean getJobSetupCleanupNeeded() {
       return false;
     }

     @Override
     @Deprecated
     public Path[] getLocalCacheArchives() throws IOException {
       return null;
     }

     @Override
     @Deprecated
     public Path[] getLocalCacheFiles() throws IOException {
       return null;
     }

     @Override
     public Class<?> getMapOutputKeyClass() {
       return null;
     }

     @Override
     public Class<?> getMapOutputValueClass() {
       return null;
     }

     @Override
     public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public int getMaxMapAttempts() {
       return 0;
     }

     @Override
     public int getMaxReduceAttempts() {
       return 0;
     }

     @Override
     public int getNumReduceTasks() {
       return 0;
     }

     @Override
     public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public Class<?> getOutputKeyClass() {
       return null;
     }

     @Override
     public Class<?> getOutputValueClass() {
       return null;
     }

     @Override
     public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public boolean getProfileEnabled() {
       return false;
     }

     @Override
     public String getProfileParams() {
       return null;
     }

     @Override
     public IntegerRanges getProfileTaskRange(boolean arg0) {
       return null;
     }

     @Override
     public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException {
       return null;
     }

     @Override
     public RawComparator<?> getSortComparator() {
       return null;
     }

     @Override
     @Deprecated
     public boolean getSymlink() {
       return false;
     }

     @Override
     public boolean getTaskCleanupNeeded() {
       return false;
     }

     @Override
     public String getUser() {
       return null;
     }

     @Override
     public Path getWorkingDirectory() throws IOException {
       return null;
     }

   }

   /**
    * Run {@link CrawlDbReducer#reduce(Text, Iterable, Reducer.Context)} with
    * {@link #dummyURL} as key and return the CrawlDatum(s) which would have
    * been written into CrawlDb.
    * <p>
    * The input is copied and the copy is shuffled before it is passed to the
    * reducer, so the caller's list is neither reordered nor required to be
    * modifiable. An {@link IOException} or {@link InterruptedException} thrown
    * by the reducer is logged, and whatever was written before the failure is
    * returned.
    *
    * @param values
    *          list of input CrawlDatums, may be {@code null} or empty
    * @return list of resulting CrawlDatum(s) in CrawlDb, empty if there was no
    *         input
    */
   public List<CrawlDatum> update(List<CrawlDatum> values) {
     if (values == null || values.isEmpty()) {
       return new ArrayList<CrawlDatum>(0);
     }
     // Shuffle a private copy: the order of values should have no influence
     // on the result, and the caller's list (or the array behind an
     // Arrays.asList view) must not be reordered as a side effect.
     List<CrawlDatum> shuffled = new ArrayList<CrawlDatum>(values);
     Collections.shuffle(shuffled);
     DummyContext context = new DummyContext();
     try {
       reducer.reduce(dummyURL, shuffled, context);
     } catch (IOException e) {
       LOG.error(StringUtils.stringifyException(e));
     } catch (InterruptedException e) {
       // restore the interrupt flag so that callers can still observe it
       Thread.currentThread().interrupt();
       LOG.error(StringUtils.stringifyException(e));
     }
     return context.getValues();
   }

   /**
    * Convenience variant of {@link #update(List)} for the common case of one
    * CrawlDatum from CrawlDb and one from fetching. Arguments which are
    * {@code null} are left out of the input.
    *
    * @param dbDatum
    *          previous CrawlDatum in CrawlDb, may be {@code null}
    * @param fetchDatum
    *          CrawlDatum resulting from fetching, may be {@code null}
    * @return list of resulting CrawlDatum(s) in CrawlDb
    */
   public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
     List<CrawlDatum> input = new ArrayList<CrawlDatum>();
     for (CrawlDatum datum : new CrawlDatum[] { dbDatum, fetchDatum }) {
       if (datum != null) {
         input.add(datum);
       }
     }
     return update(input);
   }

   /**
    * Variable-arity variant, see {@link #update(List)}.
    * <p>
    * The values are copied into a new list before delegating, so an array
    * passed by the caller is never reordered.
    *
    * @param values
    *          input CrawlDatums
    * @return list of resulting CrawlDatum(s) in CrawlDb
    */
   public List<CrawlDatum> update(CrawlDatum... values) {
     // Arrays.asList is a write-through view of the array: copy it so that
     // reordering of the list cannot reach the caller's array.
     return update(new ArrayList<CrawlDatum>(Arrays.asList(values)));
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.crawl;

	import java.lang.invoke.MethodHandles;
	import java.net.URI;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Iterator;
	import java.util.List;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.conf.Configuration.IntegerRanges;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.RawComparator;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapred.Counters;
	import org.apache.hadoop.mapred.Counters.Counter;
	import org.apache.hadoop.mapreduce.InputFormat;
	import org.apache.hadoop.mapreduce.JobID;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.Reducer;
	import org.apache.hadoop.mapreduce.OutputCommitter;
	import org.apache.hadoop.mapreduce.OutputFormat;
	import org.apache.hadoop.mapreduce.Partitioner;
	import org.apache.hadoop.mapreduce.TaskAttemptID;
	import org.apache.hadoop.security.Credentials;
	import org.apache.hadoop.util.StringUtils;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* Utility to test transitions of {@link CrawlDatum} states during an update of
	* {@link CrawlDb} (command {@literal updatedb}): call
	* {@link CrawlDbReducer#reduce(Text, Iterable, Reducer.Context)} with
	* the old CrawlDatum (db status) and the new one (fetch status).
	*/
	public class CrawlDbUpdateUtil <T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {

	/** Logger named after the class returned by the method-handle lookup. */
	private static final Logger LOG =
	    LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

	/** Reducer under test; all {@code update} calls are delegated to it. */
	private CrawlDbReducer reducer;

	/** Key passed to the reducer for every update. */
	public static Text dummyURL = new Text("http://nutch.apache.org/");

	/**
	 * Keep a reference to the reducer and run its {@code setup} step.
	 *
	 * @param red
	 *          reducer whose state transitions are to be tested
	 * @param context
	 *          context handed to the reducer's {@code setup}
	 * @throws IOException
	 *           if the reducer's {@code setup} fails
	 */
	protected CrawlDbUpdateUtil(CrawlDbReducer red,
	    Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context)
	    throws IOException {
	  this.reducer = red;
	  red.setup(context);
	}

	/** {@link Context} to collect all values in a {@link List} */
	private class DummyContext extends Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context {
	  // Collects every value passed to write(Text, CrawlDatum) in a list.
	  // The context is not backed by a job: input iteration reports "no more
	  // input", counters are folded into a single dummy counter, and the
	  // job/task accessors return null, false or 0. A few accessors throw
	  // UnsupportedOperationException instead.

	  /** Values written by the reducer, in the order they were written. */
	  private final List<CrawlDatum> values = new ArrayList<CrawlDatum>();

	  /** Holds the single counter returned by both {@code getCounter} methods. */
	  private final Counters dummyCounters = new Counters();

	  private DummyContext() {
	    // Context is an inner class of Reducer: bind it to the reducer under test
	    reducer.super();
	  }

	  /** Collect the value; the key is ignored. */
	  @Override
	  public void write(Text key, CrawlDatum value) throws IOException, InterruptedException {
	    values.add(value);
	  }

	  /** @return all values collected by {@link #write(Text, CrawlDatum)} so far */
	  @Override
	  public List<CrawlDatum> getValues() {
	    return values;
	  }

	  /** Not supported: this context has no current input value. */
	  @Override
	  public CrawlDatum getCurrentValue() throws UnsupportedOperationException {
	    throw new UnsupportedOperationException("Dummy context");
	  }

	  /** Not supported: this context has no current input key. */
	  @Override
	  public Text getCurrentKey() throws UnsupportedOperationException {
	    throw new UnsupportedOperationException("Dummy context with no keys");
	  }

	  /** Does nothing. */
	  @Override
	  public void progress() {
	  }

	  /** @return the shared dummy counter, whatever the argument */
	  @Override
	  public Counter getCounter(Enum<?> arg0) {
	    return dummyCounters.getGroup("dummy").getCounterForName("dummy");
	  }

	  /** @return the shared dummy counter, whatever the arguments */
	  @Override
	  public Counter getCounter(String arg0, String arg1) {
	    return dummyCounters.getGroup("dummy").getCounterForName("dummy");
	  }

	  /** Not supported: this context keeps no status. */
	  @Override
	  public void setStatus(String arg0) throws UnsupportedOperationException {
	    throw new UnsupportedOperationException("Dummy context with no status");
	  }

	  /** Not supported: this context keeps no status. */
	  @Override
	  public String getStatus() throws UnsupportedOperationException {
	    throw new UnsupportedOperationException("Dummy context with no status");
	  }

	  /** @return always {@code 1f} */
	  @Override
	  public float getProgress() {
	    return 1f;
	  }

	  /** Not supported: this context has no output committer. */
	  @Override
	  public OutputCommitter getOutputCommitter() {
	    throw new UnsupportedOperationException("Dummy context without committer");
	  }

	  /** @return always {@code false} */
	  @Override
	  public boolean nextKey(){
	    return false;
	  }

	  /** @return always {@code false} */
	  @Override
	  public boolean nextKeyValue(){
	    return false;
	  }

	  /** Not supported: this context has no task attempt. */
	  @Override
	  public TaskAttemptID getTaskAttemptID() throws UnsupportedOperationException {
	    throw new UnsupportedOperationException("Dummy context without TaskAttemptID");
	  }

	  // Job and task accessors below are stubs returning null, false or 0.

	  @Override
	  public Path[] getArchiveClassPaths() {
	    return null;
	  }

	  @Override
	  public String[] getArchiveTimestamps() {
	    return null;
	  }

	  @Override
	  public URI[] getCacheArchives() throws IOException {
	    return null;
	  }

	  @Override
	  public URI[] getCacheFiles() throws IOException {
	    return null;
	  }

	  @Override
	  public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public RawComparator<?> getCombinerKeyGroupingComparator() {
	    return null;
	  }

	  @Override
	  public Configuration getConfiguration() {
	    return null;
	  }

	  @Override
	  public Credentials getCredentials() {
	    return null;
	  }

	  @Override
	  public Path[] getFileClassPaths() {
	    return null;
	  }

	  @Override
	  public String[] getFileTimestamps() {
	    return null;
	  }

	  @Override
	  public RawComparator<?> getGroupingComparator() {
	    return null;
	  }

	  @Override
	  public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public String getJar() {
	    return null;
	  }

	  @Override
	  public JobID getJobID() {
	    return null;
	  }

	  @Override
	  public String getJobName() {
	    return null;
	  }

	  @Override
	  public boolean getJobSetupCleanupNeeded() {
	    return false;
	  }

	  @Override
	  @Deprecated
	  public Path[] getLocalCacheArchives() throws IOException {
	    return null;
	  }

	  @Override
	  @Deprecated
	  public Path[] getLocalCacheFiles() throws IOException {
	    return null;
	  }

	  @Override
	  public Class<?> getMapOutputKeyClass() {
	    return null;
	  }

	  @Override
	  public Class<?> getMapOutputValueClass() {
	    return null;
	  }

	  @Override
	  public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public int getMaxMapAttempts() {
	    return 0;
	  }

	  @Override
	  public int getMaxReduceAttempts() {
	    return 0;
	  }

	  @Override
	  public int getNumReduceTasks() {
	    return 0;
	  }

	  @Override
	  public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public Class<?> getOutputKeyClass() {
	    return null;
	  }

	  @Override
	  public Class<?> getOutputValueClass() {
	    return null;
	  }

	  @Override
	  public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public boolean getProfileEnabled() {
	    return false;
	  }

	  @Override
	  public String getProfileParams() {
	    return null;
	  }

	  @Override
	  public IntegerRanges getProfileTaskRange(boolean arg0) {
	    return null;
	  }

	  @Override
	  public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException {
	    return null;
	  }

	  @Override
	  public RawComparator<?> getSortComparator() {
	    return null;
	  }

	  @Override
	  @Deprecated
	  public boolean getSymlink() {
	    return false;
	  }

	  @Override
	  public boolean getTaskCleanupNeeded() {
	    return false;
	  }

	  @Override
	  public String getUser() {
	    return null;
	  }

	  @Override
	  public Path getWorkingDirectory() throws IOException {
	    return null;
	  }

	}

	/**
	* run
	* {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
	* and return the CrawlDatum(s) which would have been written into CrawlDb
	*
	* @param values
	* list of input CrawlDatums
	* @return list of resulting CrawlDatum(s) in CrawlDb
	*/
	@SuppressWarnings("unchecked")
	public List<CrawlDatum> update(List<CrawlDatum> values) {
	if (values == null \|\| values.size() == 0) {
	return new ArrayList<CrawlDatum>(0);
	}
	Collections.shuffle(values); // sorting of values should have no influence
	DummyContext context = new DummyContext();
	try {
	Iterable<CrawlDatum> iterable_values = (Iterable)values;
	reducer.reduce(dummyURL, iterable_values, (Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context) context);
	} catch (IOException e) {
	LOG.error(StringUtils.stringifyException(e));
	} catch (InterruptedException e) {
	LOG.error(StringUtils.stringifyException(e));
	}
	return context.getValues();
	}

	/**
	 * Convenience variant of {@link #update(List)} for the common case of one
	 * CrawlDatum from CrawlDb and one from fetching. Arguments which are
	 * {@code null} are left out of the input.
	 *
	 * @param dbDatum
	 *          previous CrawlDatum in CrawlDb, may be {@code null}
	 * @param fetchDatum
	 *          CrawlDatum resulting from fetching, may be {@code null}
	 * @return list of resulting CrawlDatum(s) in CrawlDb
	 */
	public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
	  List<CrawlDatum> input = new ArrayList<CrawlDatum>();
	  for (CrawlDatum datum : new CrawlDatum[] { dbDatum, fetchDatum }) {
	    if (datum != null) {
	      input.add(datum);
	    }
	  }
	  return update(input);
	}

	/**
	 * Variable-arity variant, see {@link #update(List)}.
	 * <p>
	 * The values are copied into a new list before delegating, so an array
	 * passed by the caller is never reordered.
	 *
	 * @param values
	 *          input CrawlDatums
	 * @return list of resulting CrawlDatum(s) in CrawlDb
	 */
	public List<CrawlDatum> update(CrawlDatum... values) {
	  // Arrays.asList is a write-through view of the array: copy it so that
	  // reordering of the list cannot reach the caller's array.
	  return update(new ArrayList<CrawlDatum>(Arrays.asList(values)));
	}

	}