/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configuration.IntegerRanges;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.security.Credentials;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.ContextHandler;
import org.mortbay.jetty.handler.ResourceHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

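/**
* Utilities for CrawlDb unit tests: creates synthetic crawldbs and seed
* lists, provides a dummy reducer {@link Context} that collects written
* values, and configures a local Jetty server serving static test content.
*/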
public class CrawlDBTestUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static CrawlDbReducer reducer = new CrawlDbReducer();
/**
* Creates a synthetic crawldb
*
* @param conf
* configuration used to create the MapFile writer
* @param fs
* filesystem where the db will be created
* @param crawldb
* path where the db will be created
* @param init
* urls to be inserted, objects are of type URLCrawlDatum
* @throws Exception
*/
public static void createCrawlDb(Configuration conf, FileSystem fs,
Path crawldb, List<URLCrawlDatum> init) throws Exception {
LOG.trace("* creating crawldb: {}", crawldb);
Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
"part-r-00000"), wKeyOpt, wValueOpt);
for (URLCrawlDatum row : init) {
LOG.info("adding: {}", row.url);
writer.append(new Text(row.url), row.datum);
}
writer.close();
}
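// Illustrative usage (a minimal sketch, not part of the utility itself;
// the URL, the target path, and the status value are assumptions):
//
//   Configuration conf = new Configuration();
//   FileSystem fs = FileSystem.get(conf);
//   CrawlDatum datum = new CrawlDatum();
//   datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
//   List<URLCrawlDatum> init = new ArrayList<>();
//   init.add(new URLCrawlDatum(new Text("http://example.com/"), datum));
//   createCrawlDb(conf, fs, new Path("test/crawldb"), init);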
/** {@link Context} to collect all values in a {@link List} */
private static class DummyContext extends Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context {
private Configuration conf;
private DummyContext() {
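// Context is a non-static inner class of Reducer, so constructing it
// requires an enclosing Reducer instance; hence the qualified super call
// through the static CrawlDbReducer declared above.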
reducer.super();
conf = new Configuration();
}
private List<CrawlDatum> values = new ArrayList<CrawlDatum>();
@Override
public void write(Text key, CrawlDatum value) throws IOException, InterruptedException {
values.add(value);
}
/** collected values as List */
public List<CrawlDatum> getValues() {
return values;
}
/** Not supported; collected values are available via {@link #getValues()} */
@Override
public CrawlDatum getCurrentValue() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Dummy context");
}
/** Not supported; keys are not collected by this dummy context */
@Override
public Text getCurrentKey() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Dummy context with no keys");
}
private Counters dummyCounters = new Counters();
@Override
public void progress() {
}
@Override
public Counter getCounter(Enum<?> arg0) {
return dummyCounters.getGroup("dummy").getCounterForName("dummy");
}
@Override
public Counter getCounter(String arg0, String arg1) {
return dummyCounters.getGroup("dummy").getCounterForName("dummy");
}
@Override
public void setStatus(String arg0) throws UnsupportedOperationException {
throw new UnsupportedOperationException("Dummy context with no status");
}
@Override
public String getStatus() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Dummy context with no status");
}
@Override
public float getProgress() {
return 1f;
}
@Override
public OutputCommitter getOutputCommitter() {
throw new UnsupportedOperationException("Dummy context without committer");
}
@Override
public boolean nextKey() {
return false;
}
@Override
public boolean nextKeyValue(){
return false;
}
@Override
public TaskAttemptID getTaskAttemptID() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Dummy context without TaskAttemptID");
}
@Override
public Path[] getArchiveClassPaths() {
return null;
}
@Override
public String[] getArchiveTimestamps() {
return null;
}
@Override
public URI[] getCacheArchives() throws IOException {
return null;
}
@Override
public URI[] getCacheFiles() throws IOException {
return null;
}
@Override
public Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException {
return null;
}
@Override
public RawComparator<?> getCombinerKeyGroupingComparator() {
return null;
}
@Override
public Configuration getConfiguration() {
return conf;
}
@Override
public Credentials getCredentials() {
return null;
}
@Override
public Path[] getFileClassPaths() {
return null;
}
@Override
public String[] getFileTimestamps() {
return null;
}
@Override
public RawComparator<?> getGroupingComparator() {
return null;
}
@Override
public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException {
return null;
}
@Override
public String getJar() {
return null;
}
@Override
public JobID getJobID() {
return null;
}
@Override
public String getJobName() {
return null;
}
@Override
public boolean getJobSetupCleanupNeeded() {
return false;
}
@Override
@Deprecated
public Path[] getLocalCacheArchives() throws IOException {
return null;
}
@Override
@Deprecated
public Path[] getLocalCacheFiles() throws IOException {
return null;
}
@Override
public Class<?> getMapOutputKeyClass() {
return null;
}
@Override
public Class<?> getMapOutputValueClass() {
return null;
}
@Override
public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException {
return null;
}
@Override
public int getMaxMapAttempts() {
return 0;
}
@Override
public int getMaxReduceAttempts() {
return 0;
}
@Override
public int getNumReduceTasks() {
return 0;
}
@Override
public Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException {
return null;
}
@Override
public Class<?> getOutputKeyClass() {
return null;
}
@Override
public Class<?> getOutputValueClass() {
return null;
}
@Override
public Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException {
return null;
}
@Override
public boolean getProfileEnabled() {
return false;
}
@Override
public String getProfileParams() {
return null;
}
@Override
public IntegerRanges getProfileTaskRange(boolean arg0) {
return null;
}
@Override
public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException {
return null;
}
@Override
public RawComparator<?> getSortComparator() {
return null;
}
@Override
@Deprecated
public boolean getSymlink() {
return false;
}
@Override
public boolean getTaskCleanupNeeded() {
return false;
}
@Override
public String getUser() {
return null;
}
@Override
public Path getWorkingDirectory() throws IOException {
return null;
}
}
/**
* For now we need to construct the Configuration manually, because we need
* to override the default one and it is currently not possible to use
* dynamically set values.
*
* @return a dummy {@link Context} whose configuration resolves
* nutch-default.xml and crawl-tests.xml
*/
public static Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context createContext() {
DummyContext context = new DummyContext();
Configuration conf = context.getConfiguration();
conf.addResource("nutch-default.xml");
conf.addResource("crawl-tests.xml");
return context;
}
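// Illustrative usage (a sketch; invoking CrawlDbReducer.reduce() against
// the dummy context is an assumption about the intended test flow):
//
//   Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context =
//       CrawlDBTestUtil.createContext();
//   reducer.setup(context);
//   reducer.reduce(new Text("http://example.com/"), values, context);
//   // everything the reducer emits is captured by DummyContext.write()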
public static class URLCrawlDatum {
public Text url;
public CrawlDatum datum;
public URLCrawlDatum(Text url, CrawlDatum datum) {
this.url = url;
this.datum = datum;
}
}
/**
* Generates a plain-text seed list (urls.txt) under the given path
*
* @param fs
* filesystem where the seed list will be created
* @param urlPath
* directory the seed file is written to
* @param urls
* urls to write, one per line
* @throws IOException
*/
public static void generateSeedList(FileSystem fs, Path urlPath,
List<String> urls) throws IOException {
generateSeedList(fs, urlPath, urls, new ArrayList<String>());
}
/**
* Generates a plain-text seed list (urls.txt) with per-URL metadata
*
* @param fs
* filesystem where the seed list will be created
* @param urlPath
* directory the seed file is written to
* @param urls
* urls to write, one per line
* @param metadata
* metadata strings appended verbatim to the corresponding url line,
* so each must include its own separator (e.g. a leading tab)
* @throws IOException
*/
public static void generateSeedList(FileSystem fs, Path urlPath,
List<String> urls, List<String> metadata) throws IOException {
Path file = new Path(urlPath, "urls.txt");
fs.mkdirs(urlPath);
FSDataOutputStream out = fs.create(file);
Iterator<String> urls_i = urls.iterator();
Iterator<String> metadata_i = metadata.iterator();
while (urls_i.hasNext()) {
String url = urls_i.next();
out.writeBytes(url);
if (metadata_i.hasNext()) {
String md = metadata_i.next();
out.writeBytes(md);
}
out.writeBytes("\n");
}
out.flush();
out.close();
}
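// Illustrative usage (a sketch; the metadata value and the nutch.score
// key are assumptions about what a test might inject):
//
//   List<String> urls = Arrays.asList("http://example.com/");
//   List<String> meta = Arrays.asList("\tnutch.score=1.0");
//   generateSeedList(fs, new Path("test/urls"), urls, meta);
//   // writes the line: http://example.com/\tnutch.score=1.0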
/**
* Creates a new Jetty server with one static root context
*
* @param port
* port to listen on
* @param staticContent
* folder where the static content lives
* @return the configured (not yet started) server
* @throws UnknownHostException
*/
public static Server getServer(int port, String staticContent)
throws UnknownHostException {
Server webServer = new Server();
SocketConnector listener = new SocketConnector();
listener.setPort(port);
listener.setHost("127.0.0.1");
webServer.addConnector(listener);
ContextHandler staticContext = new ContextHandler();
staticContext.setContextPath("/");
staticContext.setResourceBase(staticContent);
staticContext.addHandler(new ResourceHandler());
webServer.addHandler(staticContext);
return webServer;
}
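// Illustrative usage (a sketch; the port and the content folder are
// assumptions):
//
//   Server server = CrawlDBTestUtil.getServer(55000, "build/test/data");
//   server.start();
//   // ... fetch http://127.0.0.1:55000/... from the unit test ...
//   server.stop();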
}