| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.crawl; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.IOException; |
| import java.net.UnknownHostException; |
| import java.net.URI; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.io.MapFile; |
| import org.apache.hadoop.io.SequenceFile; |
| import org.apache.hadoop.io.MapFile.Writer.Option; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.conf.Configuration.IntegerRanges; |
| import org.apache.hadoop.io.RawComparator; |
| import org.apache.hadoop.mapred.Counters; |
| import org.apache.hadoop.mapred.Counters.Counter; |
| import org.apache.hadoop.mapreduce.InputFormat; |
| import org.apache.hadoop.mapreduce.JobID; |
| import org.apache.hadoop.mapreduce.Mapper; |
| import org.apache.hadoop.mapreduce.Reducer; |
| import org.apache.hadoop.mapreduce.OutputCommitter; |
| import org.apache.hadoop.mapreduce.OutputFormat; |
| import org.apache.hadoop.mapreduce.Partitioner; |
| import org.apache.hadoop.mapreduce.TaskAttemptID; |
| import org.apache.hadoop.mapreduce.Reducer.Context; |
| import org.apache.hadoop.security.Credentials; |
| import org.mortbay.jetty.Server; |
| import org.mortbay.jetty.bio.SocketConnector; |
| import org.mortbay.jetty.handler.ContextHandler; |
| import org.mortbay.jetty.handler.ResourceHandler; |
| |
/** Utility methods for building synthetic crawldbs, seed lists, and a local test web server. */
public class CrawlDBTestUtil {
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private static CrawlDbReducer reducer = new CrawlDbReducer(); |
| /** |
 * Creates a synthetic crawldb.
| * |
 * @param conf
 *          configuration used to create the underlying {@link MapFile}
 * @param fs
 *          filesystem where the db will be created
 * @param crawldb
 *          path where the db will be created
 * @param init
 *          URLs to be inserted, objects are of type {@link URLCrawlDatum}
 * @throws Exception
 *           if the db cannot be written
| */ |
| public static void createCrawlDb(Configuration conf, FileSystem fs, |
| Path crawldb, List<URLCrawlDatum> init) throws Exception { |
    LOG.trace("* creating crawldb: {}", crawldb);
    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt =
        SequenceFile.Writer.valueClass(CrawlDatum.class);
    // MapFile requires keys to be appended in sorted order, so callers must
    // pass the URLs already sorted lexicographically
    try (MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
        "part-r-00000"), wKeyOpt, wValueOpt)) {
      for (URLCrawlDatum row : init) {
        LOG.info("adding: {}", row.url);
        writer.append(new Text(row.url), row.datum);
      }
    }
| } |
| |
  /** A {@link Context} stub that collects all values passed to {@code write()} in a {@link List} */
| private static class DummyContext extends Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context { |
| |
| private Configuration conf; |
| |
    private DummyContext() {
      // Context is an inner class of Reducer, so constructing it requires an
      // enclosing Reducer instance (hence the qualified super call)
      reducer.super();
      conf = new Configuration();
    }
| |
| private List<CrawlDatum> values = new ArrayList<CrawlDatum>(); |
| |
| @Override |
| public void write(Text key, CrawlDatum value) throws IOException, InterruptedException { |
| values.add(value); |
| } |
| |
    /** @return the values collected via {@link #write(Text, CrawlDatum)} */
| public List<CrawlDatum> getValues() { |
| return values; |
| } |
| |
    /** Unsupported: values are only collected via {@link #write(Text, CrawlDatum)} */
    @Override
    public CrawlDatum getCurrentValue() throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Dummy context");
    }
| |
    /** Unsupported: this dummy context does not track keys */
    @Override
    public Text getCurrentKey() throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Dummy context with no keys");
    }
| |
| private Counters dummyCounters = new Counters(); |
| |
    @Override
    public void progress() {
    }

    @Override
    public Counter getCounter(Enum<?> counterName) {
      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
    }

    @Override
    public Counter getCounter(String groupName, String counterName) {
      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
    }

    @Override
    public void setStatus(String status) throws UnsupportedOperationException {
      throw new UnsupportedOperationException("Dummy context with no status");
    }
| |
| @Override |
| public String getStatus() throws UnsupportedOperationException { |
| throw new UnsupportedOperationException("Dummy context with no status"); |
| } |
| |
    @Override
    public float getProgress() {
      return 1f;
    }

    @Override
    public OutputCommitter getOutputCommitter() {
      throw new UnsupportedOperationException("Dummy context without committer");
    }

    @Override
    public boolean nextKey() {
      return false;
    }
| |
| @Override |
| public boolean nextKeyValue(){ |
| return false; |
| } |
| |
| @Override |
| public TaskAttemptID getTaskAttemptID() throws UnsupportedOperationException { |
| throw new UnsupportedOperationException("Dummy context without TaskAttemptID"); |
| } |
| |
| @Override |
| public Path[] getArchiveClassPaths() { |
| return null; |
| } |
| |
| @Override |
| public String[] getArchiveTimestamps() { |
| return null; |
| } |
| |
| @Override |
| public URI[] getCacheArchives() throws IOException { |
| return null; |
| } |
| |
| @Override |
| public URI[] getCacheFiles() throws IOException { |
| return null; |
| } |
| |
| @Override |
| public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public RawComparator<?> getCombinerKeyGroupingComparator() { |
| return null; |
| } |
| |
| @Override |
| public Configuration getConfiguration() { |
| return conf; |
| } |
| |
| @Override |
| public Credentials getCredentials() { |
| return null; |
| } |
| |
| @Override |
| public Path[] getFileClassPaths() { |
| return null; |
| } |
| |
| @Override |
| public String[] getFileTimestamps() { |
| return null; |
| } |
| |
| @Override |
| public RawComparator<?> getGroupingComparator() { |
| return null; |
| } |
| |
| @Override |
| public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public String getJar() { |
| return null; |
| } |
| |
| @Override |
| public JobID getJobID() { |
| return null; |
| } |
| |
| @Override |
| public String getJobName() { |
| return null; |
| } |
| |
| @Override |
| public boolean getJobSetupCleanupNeeded() { |
| return false; |
| } |
| |
| @Override |
| @Deprecated |
| public Path[] getLocalCacheArchives() throws IOException { |
| return null; |
| } |
| |
| @Override |
| @Deprecated |
| public Path[] getLocalCacheFiles() throws IOException { |
| return null; |
| } |
| |
| @Override |
| public Class<?> getMapOutputKeyClass() { |
| return null; |
| } |
| |
| @Override |
| public Class<?> getMapOutputValueClass() { |
| return null; |
| } |
| |
| @Override |
| public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public int getMaxMapAttempts() { |
| return 0; |
| } |
| |
| @Override |
| public int getMaxReduceAttempts() { |
| return 0; |
| } |
| |
| @Override |
| public int getNumReduceTasks() { |
| return 0; |
| } |
| |
| @Override |
| public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public Class<?> getOutputKeyClass() { |
| return null; |
| } |
| |
| @Override |
| public Class<?> getOutputValueClass() { |
| return null; |
| } |
| |
| @Override |
| public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public boolean getProfileEnabled() { |
| return false; |
| } |
| |
| @Override |
| public String getProfileParams() { |
| return null; |
| } |
| |
| @Override |
| public IntegerRanges getProfileTaskRange(boolean arg0) { |
| return null; |
| } |
| |
| @Override |
| public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException { |
| return null; |
| } |
| |
| @Override |
| public RawComparator<?> getSortComparator() { |
| return null; |
| } |
| |
| @Override |
| @Deprecated |
| public boolean getSymlink() { |
| return false; |
| } |
| |
| @Override |
| public boolean getTaskCleanupNeeded() { |
| return false; |
| } |
| |
| @Override |
| public String getUser() { |
| return null; |
| } |
| |
| @Override |
| public Path getWorkingDirectory() throws IOException { |
| return null; |
| } |
| } |
| /** |
| * For now we need to manually construct our Configuration, because we need to |
| * override the default one and it is currently not possible to use |
| * dynamically set values. |
| * |
   * @return a dummy {@link Context} whose configuration loads
   *         nutch-default.xml and crawl-tests.xml
| */ |
| public static Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context createContext() { |
| DummyContext context = new DummyContext(); |
| Configuration conf = context.getConfiguration(); |
| conf.addResource("nutch-default.xml"); |
| conf.addResource("crawl-tests.xml"); |
    // DummyContext already is a Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context
    return context;
| } |
| |
  /** A (URL, {@link CrawlDatum}) pair used to populate the synthetic crawldb. */
  public static class URLCrawlDatum {
| |
| public Text url; |
| |
| public CrawlDatum datum; |
| |
| public URLCrawlDatum(Text url, CrawlDatum datum) { |
| this.url = url; |
| this.datum = datum; |
| } |
| } |
| |
| /** |
| * Generate seedlist |
| * |
| * @throws IOException |
| */ |
| public static void generateSeedList(FileSystem fs, Path urlPath, |
| List<String> urls) throws IOException { |
| generateSeedList(fs, urlPath, urls, new ArrayList<String>()); |
| } |
| |
| /** |
   * Generates a seed list file (urls.txt) under the given path. Each line
   * contains one URL, immediately followed by the metadata string at the same
   * index; metadata entries must carry their own separator (e.g. a leading
   * tab), because none is inserted here.
| * |
   * @param fs
   *          filesystem to write to
   * @param urlPath
   *          directory that will contain the generated urls.txt
   * @param urls
   *          URLs to write, one per line
   * @param metadata
   *          per-URL metadata strings, appended verbatim after each URL
   * @throws IOException
   *           if the seed file cannot be written
| */ |
| public static void generateSeedList(FileSystem fs, Path urlPath, |
| List<String> urls, List<String> metadata) throws IOException { |
    Path file = new Path(urlPath, "urls.txt");
    fs.mkdirs(urlPath);

    try (FSDataOutputStream out = fs.create(file)) {
      Iterator<String> urlIterator = urls.iterator();
      Iterator<String> metadataIterator = metadata.iterator();

      while (urlIterator.hasNext()) {
        out.writeBytes(urlIterator.next());

        // metadata is appended verbatim: entries must carry their own
        // separator (e.g. a leading tab), none is inserted here
        if (metadataIterator.hasNext()) {
          out.writeBytes(metadataIterator.next());
        }

        out.writeBytes("\n");
      }
      out.flush();
    }
| } |
| |
| /** |
   * Creates a new Jetty {@link Server} with one static root context.
| * |
   * @param port
   *          port to listen on
   * @param staticContent
   *          folder where the static content lives
   * @return the configured (but not yet started) server
   * @throws UnknownHostException
   *           if the local address cannot be resolved
| */ |
| public static Server getServer(int port, String staticContent) |
| throws UnknownHostException { |
    Server webServer = new Server();
| SocketConnector listener = new SocketConnector(); |
| listener.setPort(port); |
| listener.setHost("127.0.0.1"); |
| webServer.addConnector(listener); |
| ContextHandler staticContext = new ContextHandler(); |
| staticContext.setContextPath("/"); |
| staticContext.setResourceBase(staticContent); |
| staticContext.addHandler(new ResourceHandler()); |
| webServer.addHandler(staticContext); |
| return webServer; |
| } |
| } |