blob: 316b9776c06db342244eb79ff3ddf04a47b757cc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.ByteArrayInputStream;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import com.google.common.base.Strings;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;
import org.apache.tika.Tika;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.codehaus.jackson.map.ObjectMapper;
/**
* The file dumper tool enables one to reverse generate the raw content from
* Nutch segment data directories.
* <p>
* The tool has a number of immediate uses:
* <ol>
* <li>one can see what a page looked like at the time it was crawled</li>
* <li>one can see different media types acquired as part of the crawl</li>
* <li>it enables us to see webpages before we augment them with additional
* metadata, this can be handy for providing a provenance trail for your crawl
* data.</li>
* </ol>
* <p>
* Upon successful completion the tool displays a very convenient JSON snippet
* detailing the mimetype classifications and the counts of documents which fall
* into those classifications. An example is as follows:
*
* <pre>
* {@code
* INFO: File Types:
* TOTAL Stats:
* [
* {"mimeType":"application/xml","count":"19"}
* {"mimeType":"image/png","count":"47"}
* {"mimeType":"image/jpeg","count":"141"}
* {"mimeType":"image/vnd.microsoft.icon","count":"4"}
* {"mimeType":"text/plain","count":"89"}
* {"mimeType":"video/quicktime","count":"2"}
* {"mimeType":"image/gif","count":"63"}
* {"mimeType":"application/xhtml+xml","count":"1670"}
* {"mimeType":"application/octet-stream","count":"40"}
* {"mimeType":"text/html","count":"1863"}
* ]
*
* FILTER Stats:
* [
* {"mimeType":"image/png","count":"47"}
* {"mimeType":"image/jpeg","count":"141"}
* {"mimeType":"image/vnd.microsoft.icon","count":"4"}
* {"mimeType":"video/quicktime","count":"2"}
* {"mimeType":"image/gif","count":"63"}
* ]
* }
* </pre>
* <p>
* In the case above, the tool would have been run with the <b>-mimetype
* image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
* option, i.e. with the filter restricted to the listed mime types.
* </p>
*/
public class FileDumper {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Dumps the raw content stored in the segments found under
   * {@code segmentRootDir}.
   * <p>
   * Every readable sub-directory of {@code segmentRootDir} is treated as a
   * segment; a segment without a readable content directory is skipped with a
   * warning. For each record the mime type is detected from the content bytes
   * and counted. Records whose mime type passes the filter are additionally
   * counted as "filtered" and, unless {@code mimeTypeStats} is set, written
   * to {@code outputDir}. A file that already exists is never overwritten.
   * When files are dumped, one {@code <segment name>_filenameToUrl.json} file
   * per segment is written to {@code outputDir}, mapping every output path to
   * the URL it was derived from.
   *
   * @param outputDir
   *          the directory the raw content is dumped to. It must already
   *          exist unless {@code mimeTypeStats} is set.
   * @param segmentRootDir
   *          a directory containing one or more segments.
   * @param mimeTypes
   *          an array of mime types we have to dump, all others will be
   *          filtered out. {@code null} accepts every mime type.
   * @param flatDir
   *          a boolean flag specifying whether the output directory should
   *          contain only files instead of using nested directories to
   *          prevent naming conflicts.
   * @param mimeTypeStats
   *          a flag indicating whether mimetype stats should be displayed
   *          instead of dumping files.
   * @param reverseURLDump
   *          a flag indicating whether output files are placed in nested
   *          directories derived from the reversed URL and named after the
   *          upper-cased SHA-256 hex digest of the URL. Takes precedence over
   *          {@code flatDir}.
   * @throws Exception
   *           if a segment data file cannot be read, an output path cannot be
   *           derived from a URL, or the filename-to-URL mapping cannot be
   *           written.
   */
  public void dump(File outputDir, File segmentRootDir, String[] mimeTypes,
      boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump)
      throws Exception {
    if (mimeTypes == null) {
      LOG.info("Accepting all mimetypes.");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counts
    Map<String, Integer> filteredCounts = new HashMap<>();
    Configuration conf = NutchConfiguration.create();
    // a single detector is reused for every record instead of one per record
    Tika tika = new Tika();
    // number of files actually written by this call
    int fileCount = 0;

    File[] segmentDirs = segmentRootDir
        .listFiles(dir -> dir.canRead() && dir.isDirectory());
    if (segmentDirs == null) {
      LOG.error("No segment directories found in [{}]",
          segmentRootDir.getAbsolutePath());
      return;
    }

    for (File segment : segmentDirs) {
      LOG.info("Processing segment: [{}]", segment.getAbsolutePath());
      Map<String, String> filenameToUrl = new HashMap<>();
      File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
      File[] partDirs = segmentDir
          .listFiles(dir -> dir.canRead() && dir.isDirectory());
      if (partDirs == null) {
        LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
        continue;
      }

      for (File partDir : partDirs) {
        String segmentPath = partDir + "/data";
        Path dataFile = new Path(segmentPath);
        if (!new File(dataFile.toString()).exists()) {
          LOG.warn("Skipping segment: [{}]: no data directory present",
              segmentPath);
          continue;
        }
        // try-with-resources: the reader is released even if a record fails
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            SequenceFile.Reader.file(dataFile))) {
          Writable key = (Writable) reader.getKeyClass().getConstructor()
              .newInstance();
          while (reader.next(key)) {
            Content content = new Content();
            reader.getCurrentValue(content);
            String url = key.toString();

            String mimeType = null;
            try {
              mimeType = tika.detect(content.getContent());
              collectStats(typeCounts, mimeType);
            } catch (Exception e) {
              LOG.warn("Tika is unable to detect type for: [{}]", url, e);
            }
            if (mimeType == null || !isAccepted(mimeTypes, mimeType)) {
              continue;
            }
            collectStats(filteredCounts, mimeType);
            if (mimeTypeStats) {
              // stats only: nothing is written to disk
              continue;
            }

            String outputFullPath = resolveOutputPath(outputDir, url, flatDir,
                reverseURLDump);
            if (outputFullPath == null) {
              continue;
            }
            filenameToUrl.put(outputFullPath, url);
            if (new File(outputFullPath).exists()) {
              LOG.info("Skipping writing: [{}]: file already exists",
                  outputFullPath);
            } else {
              LOG.info("Writing: [{}]", outputFullPath);
              if (writeContent(outputFullPath, content.getContent())) {
                fileCount++;
              }
            }
          }
        }
      }

      if (!mimeTypeStats) {
        // save filenameToUrl in a json file: one mapping file per segment.
        // Skipped in stats-only mode, where nothing is dumped and the output
        // directory is not guaranteed to exist.
        String filenameToUrlFilePath = String.format(
            "%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(),
            segment.getName());
        new ObjectMapper().writeValue(new File(filenameToUrlFilePath),
            filenameToUrl);
      }
    }

    if (!mimeTypeStats) {
      LOG.info("Dumped [{}] new file(s)", fileCount);
    }
    String stats = "Dumper File Stats: "
        + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts);
    LOG.info(stats);
    if (mimeTypeStats) {
      System.out.println(stats);
    }
  }

  /**
   * Tells whether a detected mime type passes the user supplied filter.
   *
   * @param mimeTypes
   *          the accepted mime types, or {@code null} to accept everything
   * @param mimeType
   *          the detected mime type
   * @return {@code true} if the record should be counted as filtered/dumped
   */
  private static boolean isAccepted(String[] mimeTypes, String mimeType) {
    return mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType);
  }

  /**
   * Computes the path of the file a record is dumped to and makes sure the
   * nested directories required by the chosen layout exist.
   *
   * @param outputDir
   *          the root output directory
   * @param url
   *          the URL (record key) the content was fetched from
   * @param flatDir
   *          whether all files go directly into {@code outputDir}
   * @param reverseURLDump
   *          whether the reversed-URL directory layout is used
   * @return the output path, or {@code null} if the two-level directory
   *         helper returned no usable directory
   * @throws Exception
   *           propagated unchanged from the URL and path helpers (for
   *           instance when the URL cannot be reversed)
   */
  private static String resolveOutputPath(File outputDir, String url,
      boolean flatDir, boolean reverseURLDump) throws Exception {
    String md5Ofurl = DumpFileUtil.getUrlMD5(url);
    String fullDir = outputDir.getAbsolutePath();
    if (!flatDir && !reverseURLDump) {
      fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
    }
    if (Strings.isNullOrEmpty(fullDir)) {
      return null;
    }

    if (reverseURLDump) {
      // first ':'-separated component of the reversed URL, dots become
      // directory separators
      String[] reversedURL = TableUtil.reverseUrl(url).split(":");
      reversedURL[0] = reversedURL[0].replace('.', '/');
      String reversedURLPath = reversedURL[0] + "/"
          + DigestUtils.sha256Hex(url).toUpperCase();
      String outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
      // Create the nested structure above the file if it is not there yet.
      File parentDir = new File(outputFullPath).getParentFile();
      if (parentDir != null && !parentDir.isDirectory()
          && !parentDir.mkdirs()) {
        LOG.warn("Unable to create directory: [{}]", parentDir);
      }
      return outputFullPath;
    }

    String baseName = FilenameUtils.getBaseName(url);
    String extension = FilenameUtils.getExtension(url);
    if (extension == null || extension.isEmpty()) {
      extension = "html";
    }
    return String.format("%s/%s", fullDir,
        DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
  }

  /**
   * Writes the raw bytes of a record to the given path. Failures are logged
   * and reported through the return value so that one bad file does not
   * abort the whole dump.
   *
   * @param outputFullPath
   *          the path of the file to create
   * @param bytes
   *          the raw content to write
   * @return {@code true} if the file was written, {@code false} otherwise
   */
  private static boolean writeContent(String outputFullPath, byte[] bytes) {
    try (FileOutputStream output = new FileOutputStream(
        new File(outputFullPath))) {
      IOUtils.write(bytes, output);
      return true;
    } catch (Exception e) {
      LOG.warn("Write Error: [{}]", outputFullPath, e);
      return false;
    }
  }

  /**
   * Main method for invoking this tool
   *
   * @param args
   *          command line options; {@code -outputDir} (the directory to host
   *          the raw data, created if missing unless {@code -mimeStats} is
   *          given) and {@code -segment} (a directory containing one or more
   *          segments) are mandatory, {@code -mimetype}, {@code -mimeStats},
   *          {@code -flatdir} and {@code -reverseUrlDirs} are optional.
   * @throws Exception
   *           declared for compatibility; failures of parsing or dumping are
   *           caught and logged
   */
  public static void main(String[] args) throws Exception {
    // boolean options
    Option helpOpt = new Option("h", "help", false, "show this help message");
    // argument options
    @SuppressWarnings("static-access")
    Option outputOpt = OptionBuilder
        .withArgName("outputDir")
        .hasArg()
        .withDescription(
            "output directory (which will be created) to host the raw data")
        .create("outputDir");
    @SuppressWarnings("static-access")
    Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment(s) to use").create("segment");
    @SuppressWarnings("static-access")
    Option mimeOpt = OptionBuilder
        .withArgName("mimetype")
        .hasArgs()
        .withDescription(
            "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
        .create("mimetype");
    @SuppressWarnings("static-access")
    Option mimeStat = OptionBuilder
        .withArgName("mimeStats")
        .withDescription(
            "only display mimetype stats for the segment(s) instead of dumping file.")
        .create("mimeStats");
    @SuppressWarnings("static-access")
    Option dirStructureOpt = OptionBuilder
        .withArgName("flatdir")
        .withDescription(
            "optionally specify that the output directory should only contain files.")
        .create("flatdir");
    @SuppressWarnings("static-access")
    Option reverseURLOutput = OptionBuilder
        .withArgName("reverseUrlDirs")
        .withDescription(
            "optionally specify to use reverse URL folders for output structure.")
        .create("reverseUrlDirs");

    // create the options
    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(outputOpt);
    options.addOption(segOpt);
    options.addOption(mimeOpt);
    options.addOption(mimeStat);
    options.addOption(dirStructureOpt);
    options.addOption(reverseURLOutput);

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("outputDir")
          || !line.hasOption("segment")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("FileDumper", options, true);
        return;
      }

      File outputDir = new File(line.getOptionValue("outputDir"));
      File segmentRootDir = new File(line.getOptionValue("segment"));
      String[] mimeTypes = line.getOptionValues("mimetype");
      boolean flatDir = line.hasOption("flatdir");
      boolean shouldDisplayStats = line.hasOption("mimeStats");
      boolean reverseURLDump = line.hasOption("reverseUrlDirs");

      // In stats-only mode nothing is written, so the directory is neither
      // needed nor created.
      if (!shouldDisplayStats && !outputDir.exists()) {
        LOG.warn("Output directory: [{}]: does not exist, creating it.",
            outputDir.getAbsolutePath());
        if (!outputDir.mkdirs()) {
          throw new Exception("Unable to create: ["
              + outputDir.getAbsolutePath() + "]");
        }
      }

      FileDumper dumper = new FileDumper();
      dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir,
          shouldDisplayStats, reverseURLDump);
    } catch (Exception e) {
      // stringifyException already carries the full stack trace
      LOG.error("FileDumper: " + StringUtils.stringifyException(e));
    }
  }

  /**
   * Increments the counter kept for {@code mimeType}, starting at 1 for a
   * mime type that has not been seen yet.
   *
   * @param typeCounts
   *          the counters to update, keyed by mime type
   * @param mimeType
   *          the mime type to count
   */
  private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
    typeCounts.merge(mimeType, 1, Integer::sum);
  }
}