src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.tools;

 import java.io.BufferedOutputStream;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;

 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.GnuParser;
 import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.Inlink;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.crawl.LinkDbReader;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchTool;

 import org.apache.tika.Tika;

 import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
 import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.ibm.icu.text.DateFormat;
 import com.ibm.icu.text.SimpleDateFormat;

 /**
  * <p>
  * The Common Crawl Data Dumper tool enables one to reverse generate the raw
  * content from Nutch segment data directories into a common crawling data
  * format, consumed by many applications. The data is then serialized as <a
  * href="http://cbor.io">CBOR</a>
  * </p>
  * <p>
  * Text content will be stored in a structured document format. Below is a
  * schema for storage of data and metadata related to a crawling request, with
  * the response body truncated for readability. This document must be encoded
  * using CBOR and should be compressed with gzip after encoding. The timestamped
  * URL key for these records' keys follows the same layout as the media file
  * directory structure, with underscores in place of directory separators.
  * </p>
  * <p>
  * Thus, the timestamped url key for the record is provided below followed by an
  * example record:
  * </p>
  * <pre>
  * {@code
  * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
  *
  *     {
  *         "url": "http:\/\/somepage.com\/22\/14560817",
  *         "timestamp": "1411623696000",
  *         "request": {
  *             "method": "GET",
  *             "client": {
  *                 "hostname": "crawler01.local",
  *                 "address": "74.347.129.200",
  *                 "software": "Apache Nutch v1.10",
  *                 "robots": "classic",
  *                 "contact": {
  *                     "name": "Nutch Admin",
  *                     "email": "nutch.pro@nutchadmin.org"
  *                 }
  *             },
  *             "headers": {
  *                 "Accept": "text\/html,application\/xhtml+xml,application\/xml",
  *                 "Accept-Encoding": "gzip,deflate,sdch",
  *                 "Accept-Language": "en-US,en",
  *                 "User-Agent": "Mozilla\/5.0",
  *                 "...": "..."
  *             },
  *             "body": null
  *         },
  *         "response": {
  *             "status": "200",
  *             "server": {
  *                 "hostname": "somepage.com",
  *                 "address": "55.33.51.19",
  *             },
  *             "headers": {
  *                 "Content-Encoding": "gzip",
  *                 "Content-Type": "text\/html",
  *                 "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
  *                 "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
  *                 "Server": "nginx",
  *                 "...": "..."
  *             },
  *             "body": "\r\n  <!DOCTYPE html PUBLIC ... \r\n\r\n  \r\n    </body>\r\n    </html>\r\n  \r\n\r\n",
  *         },
  *         "key": "com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
  *         "imported": "1411623698000"
  *     }
  *     }
  * </pre>
  * <p>
  * Upon successful completion the tool displays a very convenient JSON snippet
  * detailing the mimetype classifications and the counts of documents which fall
  * into those classifications. An example is as follows:
  * </p>
  * <pre>
  * {@code
  * INFO: File Types:
  *   TOTAL Stats:    {
  *     {"mimeType":"application/xml","count":19"}
  *     {"mimeType":"image/png","count":47"}
  *     {"mimeType":"image/jpeg","count":141"}
  *     {"mimeType":"image/vnd.microsoft.icon","count":4"}
  *     {"mimeType":"text/plain","count":89"}
  *     {"mimeType":"video/quicktime","count":2"}
  *     {"mimeType":"image/gif","count":63"}
  *     {"mimeType":"application/xhtml+xml","count":1670"}
  *     {"mimeType":"application/octet-stream","count":40"}
  *     {"mimeType":"text/html","count":1863"}
  *   }
  * }
  * </pre>
  */
 public class CommonCrawlDataDumper extends NutchTool implements Tool {

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
   private static final int MAX_INLINKS = 5000;

   private CommonCrawlConfig config = null;

   // Gzip initialization
   private FileOutputStream fileOutput = null;
   private BufferedOutputStream bufOutput = null;
   private GzipCompressorOutputStream gzipOutput = null;
   private TarArchiveOutputStream tarOutput = null;
   private ArrayList<String> fileList = null;

   /**
    * Main method for invoking this tool
    *
    * @param args 1) output directory (which will be created if it does not
    *             already exist) to host the CBOR data and 2) a directory
    *             containing one or more segments from which we wish to generate
    *             CBOR data from. Optionally, 3) a list of mimetypes and the 4)
    *             the gzip option may be provided.
    * @throws Exception
    */
   public static void main(String[] args) throws Exception {
     Configuration conf = NutchConfiguration.create();
     int res = ToolRunner.run(conf, new CommonCrawlDataDumper(), args);
     System.exit(res);
   }

   /**
    * Constructor
    */
   public CommonCrawlDataDumper(CommonCrawlConfig config) {
     this.config = config;
   }

   public CommonCrawlDataDumper() {
   }

   /**
    * Dumps the reverse engineered CBOR content from the provided segment
    * directories if a parent directory contains more than one segment,
    * otherwise a single segment can be passed as an argument. If the boolean
    * argument is provided then the CBOR is also zipped.
    *
    * @param outputDir      the directory you wish to dump the raw content to. This
    *                       directory will be created.
    * @param segmentRootDir a directory containing one or more segments.
    * @param linkdb         Path to linkdb.
    * @param gzip           a boolean flag indicating whether the CBOR content should also
    *                       be gzipped.
    * @param epochFilename  if {@code true}, output files will be names using the epoch time (in milliseconds).
    * @param extension      a file extension to use with output documents.
    * @throws Exception if any exception occurs.
    */
   public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip,
       String[] mimeTypes, boolean epochFilename, String extension, boolean warc)
       throws Exception {
     if (gzip) {
       LOG.info("Gzipping CBOR data has been skipped");
     }
     // total file counts
     Map<String, Integer> typeCounts = new HashMap<>();
     // filtered file counters
     Map<String, Integer> filteredCounts = new HashMap<>();

     Configuration nutchConfig = NutchConfiguration.create();
     Path segmentRootPath = new Path(segmentRootDir.toString());
     FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

     //get all paths
     List<Path> parts = new ArrayList<>();
     RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
     String partPattern = ".*" + File.separator + Content.DIR_NAME
         + File.separator + "part-[0-9]{5}" + File.separator + "data";
     while (files.hasNext()) {
       LocatedFileStatus next = files.next();
       if (next.isFile()) {
         Path path = next.getPath();
         if (path.toString().matches(partPattern)){
           parts.add(path);
         }
       }
     }

     LinkDbReader linkDbReader = null;
     if (linkdb != null) {
       linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
     }
     if (parts == null || parts.size() == 0) {
       LOG.error( "No segment directories found in {} ",
           segmentRootDir.getAbsolutePath());
       System.exit(1);
     }
     LOG.info("Found {} segment parts", parts.size());
     if (gzip && !warc) {
       fileList = new ArrayList<>();
       constructNewStream(outputDir);
     }

     for (Path segmentPart : parts) {
       LOG.info("Processing segment Part : [ {} ]", segmentPart);
       try {
         SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
             SequenceFile.Reader.file(segmentPart));

         Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();

         Content content = null;
         while (reader.next(key)) {
           content = new Content();
           reader.getCurrentValue(content);
           Metadata metadata = content.getMetadata();
           String url = key.toString();

           String baseName = FilenameUtils.getBaseName(url);
           String extensionName = FilenameUtils.getExtension(url);

           if (!extension.isEmpty()) {
             extensionName = extension;
           } else if ((extensionName == null) || extensionName.isEmpty()) {
             extensionName = "html";
           }

           String outputFullPath = null;
           String outputRelativePath = null;
           String filename = null;
           String timestamp = null;
           String reverseKey = null;

           if (epochFilename || config.getReverseKey()) {
             try {
               long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                   .parse(getDate(metadata.get("Date"))).getTime();
               timestamp = String.valueOf(epoch);
             } catch (ParseException pe) {
               LOG.warn(pe.getMessage());
             }

             reverseKey = reverseUrl(url);
             config.setReverseKeyValue(
                 reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url)
                     + "_" + timestamp);
           }

           if (!warc) {
             if (epochFilename) {
               outputFullPath = DumpFileUtil
                   .createFileNameFromUrl(outputDir.getAbsolutePath(),
                       reverseKey, url, timestamp, extensionName, !gzip);
               outputRelativePath = outputFullPath
                   .substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
               filename = content.getMetadata().get(Metadata.DATE) + "."
                   + extensionName;
             } else {
               String md5Ofurl = DumpFileUtil.getUrlMD5(url);
               String fullDir = DumpFileUtil
                   .createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                       md5Ofurl, !gzip);
               filename = DumpFileUtil
                   .createFileName(md5Ofurl, baseName, extensionName);
               outputFullPath = String.format("%s/%s", fullDir, filename);

               String[] fullPathLevels = fullDir
                   .split(Pattern.quote(File.separator));
               String firstLevelDirName = fullPathLevels[fullPathLevels.length
                   - 2];
               String secondLevelDirName = fullPathLevels[fullPathLevels.length
                   - 1];
               outputRelativePath = firstLevelDirName + secondLevelDirName;
             }
           }
           // Encode all filetypes if no mimetypes have been given
           Boolean filter = (mimeTypes == null);

           String jsonData = "";
           try {
             String mimeType = new Tika().detect(content.getContent());
             // Maps file to JSON-based structure

             Set<String> inUrls = null; //there may be duplicates, so using set
             if (linkDbReader != null) {
               Inlinks inlinks = linkDbReader.getInlinks((Text) key);
               if (inlinks != null) {
                 Iterator<Inlink> iterator = inlinks.iterator();
                 inUrls = new LinkedHashSet<>();
                 while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()){
                   inUrls.add(iterator.next().getFromUrl());
                 }
               }
             }
             //TODO: Make this Jackson Format implementation reusable
             try (CommonCrawlFormat format = CommonCrawlFormatFactory
                 .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
               if (inUrls != null) {
                 format.setInLinks(new ArrayList<>(inUrls));
               }
               jsonData = format.getJsonData(url, content, metadata);
             }

             collectStats(typeCounts, mimeType);
             // collects statistics for the given mimetypes
             if ((mimeType != null) && (mimeTypes != null) && Arrays
                 .asList(mimeTypes).contains(mimeType)) {
               collectStats(filteredCounts, mimeType);
               filter = true;
             }
           } catch (IOException ioe) {
             LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
             return;
           }

           if (!warc) {
             if (filter) {
               byte[] byteData = serializeCBORData(jsonData);

               if (!gzip) {
                 File outputFile = new File(outputFullPath);
                 if (outputFile.exists()) {
                   LOG.info("Skipping writing: [" + outputFullPath
                       + "]: file already exists");
                 } else {
                   LOG.info("Writing: [" + outputFullPath + "]");
                   IOUtils.copy(new ByteArrayInputStream(byteData),
                       new FileOutputStream(outputFile));
                 }
               } else {
                 if (fileList.contains(outputFullPath)) {
                   LOG.info("Skipping compressing: [" + outputFullPath
                       + "]: file already exists");
                 } else {
                   fileList.add(outputFullPath);
                   LOG.info("Compressing: [" + outputFullPath + "]");
                   //TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                   TarArchiveEntry tarEntry = new TarArchiveEntry(
                       outputRelativePath + File.separator + filename);
                   tarEntry.setSize(byteData.length);
                   tarOutput.putArchiveEntry(tarEntry);
                   tarOutput.write(byteData);
                   tarOutput.closeArchiveEntry();
                 }
               }
             }
           }
         }
         reader.close();
       } catch (Exception e){
         LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
       } finally {
         fs.close();
       }
     }

     if (gzip && !warc) {
       closeStream();
     }

     if (!typeCounts.isEmpty()) {
       LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil
           .displayFileTypes(typeCounts, filteredCounts));
     }

   }

   private void closeStream() {
     try {
       tarOutput.finish();

       tarOutput.close();
       gzipOutput.close();
       bufOutput.close();
       fileOutput.close();
     } catch (IOException ioe) {
       LOG.warn("Error in closing stream: " + ioe.getMessage());
     }
   }

   private void constructNewStream(File outputDir) throws IOException {
     String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'")
         .format(new Date());
     LOG.info("Creating a new gzip archive: " + archiveName);
     fileOutput = new FileOutputStream(
         new File(outputDir + File.separator + archiveName));
     bufOutput = new BufferedOutputStream(fileOutput);
     gzipOutput = new GzipCompressorOutputStream(bufOutput);
     tarOutput = new TarArchiveOutputStream(gzipOutput);
     tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
   }

   /**
    * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as 3-byte
    * sequence of {@code 0xd9d9f7}) at the current position. This method must
    * be used to write the CBOR magic number at the beginning of the document.
    * Since version 2.5, <a
    * href="https://github.com/FasterXML/jackson-dataformat-cbor"
    * >jackson-dataformat-cbor</a> will support the {@code WRITE_TYPE_HEADER}
    * feature to write that type tag at the beginning of the document.
    *
    * @param generator {@link CBORGenerator} object used to create a CBOR-encoded document.
    * @throws IOException if any I/O error occurs.
    * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
    * 7049</a>
    */
   private void writeMagicHeader(CBORGenerator generator) throws IOException {
     // Writes self-describe CBOR
     // https://tools.ietf.org/html/rfc7049#section-2.4.5
     // It will be supported in jackson-cbor since 2.5
     byte[] header = new byte[3];
     header[0] = (byte) 0xd9;
     header[1] = (byte) 0xd9;
     header[2] = (byte) 0xf7;
     generator.writeBytes(header, 0, header.length);
   }

   private byte[] serializeCBORData(String jsonData) {
     CBORFactory factory = new CBORFactory();

     CBORGenerator generator = null;
     ByteArrayOutputStream stream = null;

     try {
       stream = new ByteArrayOutputStream();
       generator = factory.createGenerator(stream);
       // Writes CBOR tag
       writeMagicHeader(generator);
       generator.writeString(jsonData);
       generator.flush();
       stream.flush();

       return stream.toByteArray();

     } catch (Exception e) {
       LOG.warn("CBOR encoding failed: " + e.getMessage());
     } finally {
       try {
         generator.close();
         stream.close();
       } catch (IOException e) {
         // nothing to do
       }
     }

     return null;
   }

   private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
     typeCounts.put(mimeType,
         typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
   }

   /**
    * Gets the current date if the given timestamp is empty or null.
    *
    * @param timestamp the timestamp
    * @return the current timestamp if the given one is null.
    */
   private String getDate(String timestamp) {
     if (timestamp == null || timestamp.isEmpty()) {
       DateFormat dateFormat = new SimpleDateFormat(
           "EEE, d MMM yyyy HH:mm:ss z");
       timestamp = dateFormat.format(new Date());
     }
     return timestamp;

   }

   public static String reverseUrl(String urlString) {
     URL url;
     String reverseKey = null;
     try {
       url = new URL(urlString);

       String[] hostPart = url.getHost().replace('.', '/').split("/");

       StringBuilder sb = new StringBuilder();
       sb.append(hostPart[hostPart.length - 1]);
       for (int i = hostPart.length - 2; i >= 0; i--) {
         sb.append("/" + hostPart[i]);
       }

       reverseKey = sb.toString();

     } catch (MalformedURLException e) {
       LOG.error("Failed to parse URL: {}", urlString);
     }

     return reverseKey;
   }

   @Override
   public int run(String[] args) throws Exception {
     Option helpOpt = new Option("h", "help", false, "show this help message.");
     // argument options
     @SuppressWarnings("static-access")
     Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
         .withDescription(
             "output directory (which will be created) to host the CBOR data.")
         .create("outputDir");
     // WARC format
     Option warcOpt = new Option("warc", "export to a WARC file");

     @SuppressWarnings("static-access")
     Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
         .withDescription("the segment or directory containing segments to use").create("segment");
     // create mimetype and gzip options
     @SuppressWarnings("static-access")
     Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype")
         .hasArgs().withDescription(
             "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
         .create("mimetype");
     @SuppressWarnings("static-access")
     Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false)
         .withDescription(
             "an optional flag indicating whether to additionally gzip the data.")
         .create("gzip");
     @SuppressWarnings("static-access")
     Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true)
         .withDescription("an optional prefix for key in the output format.")
         .create("keyPrefix");
     @SuppressWarnings("static-access")
     Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat")
         .hasArg(false).withDescription(
             "an optional format for timestamp in GMT epoch milliseconds.")
         .create("SimpleDateFormat");
     @SuppressWarnings("static-access")
     Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename")
         .hasArg(false)
         .withDescription("an optional format for output filename.")
         .create("epochFilename");
     @SuppressWarnings("static-access")
     Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false)
         .withDescription("an optional format for JSON output.")
         .create("jsonArray");
     @SuppressWarnings("static-access")
     Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false)
         .withDescription("an optional format for key value in JSON output.")
         .create("reverseKey");
     @SuppressWarnings("static-access")
     Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true)
         .withDescription("an optional file extension for output documents.")
         .create("extension");
     @SuppressWarnings("static-access")
     Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true)
         .withType(Number.class)
         .withDescription("an optional file size in bytes for the WARC file(s)")
         .create("warcSize");
     @SuppressWarnings("static-access")
     Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
         .withDescription("an optional linkdb parameter to include inlinks in dump files")
         .isRequired(false)
         .create("linkdb");

     // create the options
     Options options = new Options();
     options.addOption(helpOpt);
     options.addOption(outputOpt);
     options.addOption(segOpt);
     // create mimetypes and gzip options
     options.addOption(warcOpt);
     options.addOption(mimeOpt);
     options.addOption(gzipOpt);
     // create keyPrefix option
     options.addOption(keyPrefixOpt);
     // create simpleDataFormat option
     options.addOption(simpleDateFormatOpt);
     options.addOption(epochFilenameOpt);
     options.addOption(jsonArrayOpt);
     options.addOption(reverseKeyOpt);
     options.addOption(extensionOpt);
     options.addOption(sizeOpt);
     options.addOption(linkDbOpt);

     CommandLineParser parser = new GnuParser();
     try {
       CommandLine line = parser.parse(options, args);
       if (line.hasOption("help") || !line.hasOption("outputDir") || (!line
           .hasOption("segment"))) {
         HelpFormatter formatter = new HelpFormatter();
         formatter
             .printHelp(CommonCrawlDataDumper.class.getName(), options, true);
         return 0;
       }

       File outputDir = new File(line.getOptionValue("outputDir"));
       File segmentRootDir = new File(line.getOptionValue("segment"));
       String[] mimeTypes = line.getOptionValues("mimetype");
       boolean gzip = line.hasOption("gzip");
       boolean epochFilename = line.hasOption("epochFilename");

       String keyPrefix = line.getOptionValue("keyPrefix", "");
       boolean simpleDateFormat = line.hasOption("SimpleDateFormat");
       boolean jsonArray = line.hasOption("jsonArray");
       boolean reverseKey = line.hasOption("reverseKey");
       String extension = line.getOptionValue("extension", "");
       boolean warc = line.hasOption("warc");
       long warcSize = 0;

       if (line.getParsedOptionValue("warcSize") != null) {
         warcSize = (Long) line.getParsedOptionValue("warcSize");
       }
       String linkdbPath = line.getOptionValue("linkdb");
       File linkdb = linkdbPath == null ? null : new File(linkdbPath);

       CommonCrawlConfig config = new CommonCrawlConfig();
       config.setKeyPrefix(keyPrefix);
       config.setSimpleDateFormat(simpleDateFormat);
       config.setJsonArray(jsonArray);
       config.setReverseKey(reverseKey);
       config.setCompressed(gzip);
       config.setWarcSize(warcSize);
       config.setOutputDir(line.getOptionValue("outputDir"));

       if (!outputDir.exists()) {
         LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
             + "]: does not exist, creating it.");
         if (!outputDir.mkdirs())
           throw new Exception(
               "Unable to create: [" + outputDir.getAbsolutePath() + "]");
       }

       CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);

       dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename,
           extension, warc);

     } catch (Exception e) {
       LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils
           .stringifyException(e));
       e.printStackTrace();
       return -1;
     }

     return 0;
   }

   /**
    * Used by the REST service
    */
   @Override
   public Map<String, Object> run(Map<String, Object> args, String crawlId)
       throws Exception {

     String keyPrefix = args.containsKey("keyPrefix")
         ? (String) args.get("keyPrefix")
         : "";

     File outputDir = new File((String) args.get("outputDir"));
     File segmentRootDir = new File((String) args.get(Nutch.ARG_SEGMENTDIR));
     ArrayList<String> mimeTypesList = args.containsKey("mimetypes")
         ? (ArrayList<String>) args.get("mimetypes")
         : null;
     String[] mimeTypes = null;
     if (mimeTypesList != null) {
       mimeTypes = new String[mimeTypesList.size()];
       int i = 0;
       for (String m : mimeTypesList)
         mimeTypes[i++] = m;
     }
     boolean gzip = args.containsKey("gzip") ? (boolean) args.get("gzip")
         : false;
     boolean epochFilename = args.containsKey("epochFilename")
         ? (boolean) args.get("epochFilename")
         : false;

     boolean simpleDateFormat = args.containsKey("simpleDateFormat")
         ? (boolean) args.get("simpleDateFormat")
         : false;
     boolean jsonArray = args.containsKey("jsonArray")
         ? (boolean) args.get("jsonArray")
         : false;
     boolean reverseKey = args.containsKey("reverseKey")
         ? (boolean) args.get("reverseKey")
         : false;
     String extension = args.containsKey("extension")
         ? (String) args.get("extension")
         : "";
     boolean warc = args.containsKey("warc") ? (boolean) args.get("warc")
         : false;
     long warcSize = args.containsKey("warcSize") ? (Long) args.get("warcSize")
         : 0;

     CommonCrawlConfig config = new CommonCrawlConfig();
     config.setKeyPrefix(keyPrefix);
     config.setSimpleDateFormat(simpleDateFormat);
     config.setJsonArray(jsonArray);
     config.setReverseKey(reverseKey);
     config.setCompressed(gzip);
     config.setWarcSize(warcSize);
     config.setOutputDir((String) args.get("outputDir"));

     if (!outputDir.exists()) {
       if (!outputDir.mkdirs())
         throw new Exception(
             "Unable to create: [" + outputDir.getAbsolutePath() + "]");
     }

     CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);

     dumper.dump(outputDir, segmentRootDir, null, gzip, mimeTypes, epochFilename,
         extension, warc);
     return null;
   }
 }