blob: a9ad19545dcd7dc2143f174642234706c9fe8c46 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.MD5Hash;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
/**
 * Static helpers for building directory and file names for dumped content and
 * for rendering per-MIME-type count summaries.
 */
public class DumpFileUtil {
  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private static final String DIR_PATTERN = "%s/%s/%s";
  private static final String FILENAME_PATTERN = "%s_%s.%s";
  private static final int MAX_LENGTH_OF_FILENAME = 32;
  private static final int MAX_LENGTH_OF_EXTENSION = 5;

  /**
   * Computes the MD5 digest of a URL string via {@link MD5Hash#digest(String)}
   * and renders it as lowercase hexadecimal, two characters per digest byte.
   *
   * @param url the string to hash
   * @return the hexadecimal representation of the digest
   */
  public static String getUrlMD5(String url) {
    byte[] digest = MD5Hash.digest(url).getDigest();
    StringBuilder hex = new StringBuilder(digest.length * 2);
    for (byte b : digest) {
      // High nibble first, then low nibble: same output as "%02x".
      hex.append(Character.forDigit((b >> 4) & 0xf, 16));
      hex.append(Character.forDigit(b & 0xf, 16));
    }
    return hex.toString();
  }

  /**
   * Builds the path {@code basePath/XX/YY}, where {@code XX} is made of the
   * characters at index 0 and 8 of {@code md5} and {@code YY} of the characters
   * at index 16 and 24, and optionally creates that directory.
   *
   * @param basePath the directory under which the two levels are placed
   * @param md5 the hash string; must be at least 25 characters long
   * @param makeDir whether to create the directory on disk
   * @return the directory path, or {@code null} if {@code makeDir} is
   *         {@code true} and the directory could not be created
   * @throws IndexOutOfBoundsException if {@code md5} is shorter than 25
   *         characters
   */
  public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) {
    String firstLevelDirName = String.valueOf(new char[] { md5.charAt(0), md5.charAt(8) });
    String secondLevelDirName = String.valueOf(new char[] { md5.charAt(16), md5.charAt(24) });
    String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
    if (makeDir && !makeDirectory(fullDirPath)) {
      return null;
    }
    return fullDirPath;
  }

  /**
   * Same as {@link #createTwoLevelsDirectory(String, String, boolean)} with
   * {@code makeDir} set to {@code true}.
   *
   * @param basePath the directory under which the two levels are placed
   * @param md5 the hash string; must be at least 25 characters long
   * @return the directory path, or {@code null} if it could not be created
   */
  public static String createTwoLevelsDirectory(String basePath, String md5) {
    return createTwoLevelsDirectory(basePath, md5, true);
  }

  /**
   * Builds a file name of the form {@code md5_baseName.extension}. The base
   * name is truncated to {@value #MAX_LENGTH_OF_FILENAME} characters and the
   * extension to {@value #MAX_LENGTH_OF_EXTENSION} characters, after which
   * every {@code '?'} is removed from both.
   *
   * @param md5 the prefix of the file name
   * @param fileBaseName the base name; must not be {@code null}
   * @param fileExtension the extension, without the dot; must not be
   *        {@code null}
   * @return the assembled file name
   */
  public static String createFileName(String md5, String fileBaseName, String fileExtension) {
    if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {
      LOG.info("File name is too long. Truncated to {} characters.", MAX_LENGTH_OF_FILENAME);
      fileBaseName = StringUtils.substring(fileBaseName, 0, MAX_LENGTH_OF_FILENAME);
    }
    fileExtension = truncateExtension(fileExtension);

    // Added to prevent FileNotFoundException (Invalid Argument) - in *nix environment
    fileBaseName = fileBaseName.replace("?", "");
    fileExtension = fileExtension.replace("?", "");

    return String.format(FILENAME_PATTERN, md5, fileBaseName, fileExtension);
  }

  /**
   * Builds the path
   * {@code basePath/reverseKey/sha1(urlString)/epochScrapeTime.extension}
   * (joined with {@link File#separator}) and optionally creates the
   * directory part. The extension is truncated to
   * {@value #MAX_LENGTH_OF_EXTENSION} characters.
   *
   * <p>If the directory cannot be created the failure is logged and the
   * intended path is still returned, so the returned value never contains a
   * literal {@code "null"} directory component; the directory it points into
   * may then not exist.
   *
   * @param basePath the root output directory
   * @param reverseKey the first directory level below {@code basePath}
   * @param urlString the URL whose SHA-1 hex digest names the second level
   * @param epochScrapeTime the file base name
   * @param fileExtension the extension, without the dot; must not be
   *        {@code null}
   * @param makeDir whether to create the directory on disk
   * @return the full output file path
   */
  public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString, String epochScrapeTime, String fileExtension, boolean makeDir) {
    String fullDirPath = basePath + File.separator + reverseKey + File.separator + DigestUtils.sha1Hex(urlString);
    if (makeDir) {
      // Best effort: a failure is logged inside makeDirectory; the path is kept intact.
      makeDirectory(fullDirPath);
    }
    fileExtension = truncateExtension(fileExtension);
    return fullDirPath + File.separator + epochScrapeTime + "." + fileExtension;
  }

  /**
   * Renders the per-MIME-type counts as text: a "TOTAL Stats" section for
   * {@code typeCounts} and, when {@code filteredCounts} is non-empty, a
   * "FILTERED Stats" section. Each section lists one entry per map key
   * followed by the sum of the section's counts.
   *
   * @param typeCounts counts per MIME type; must not be {@code null}
   * @param filteredCounts counts per filtered MIME type; {@code null} is
   *        treated like an empty map
   * @return the formatted report
   */
  public static String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
    StringBuilder builder = new StringBuilder();

    // print total stats
    builder.append("\nTOTAL Stats:\n");
    int totalCount = appendCounts(builder, typeCounts);
    builder.append("Total count: ").append(totalCount).append("\n");

    // filtered types stats
    if (filteredCounts != null && !filteredCounts.isEmpty()) {
      builder.append("\nFILTERED Stats:\n");
      int filteredCount = appendCounts(builder, filteredCounts);
      builder.append("Total filtered count: ").append(filteredCount).append("\n");
    }
    return builder.toString();
  }

  /** Creates the directory; logs the failure and returns {@code false} if that is not possible. */
  private static boolean makeDirectory(String dirPath) {
    try {
      FileUtils.forceMkdir(new File(dirPath));
      return true;
    } catch (IOException e) {
      // The trailing throwable keeps the cause and stack trace in the log.
      LOG.error("Failed to create dir: {}", dirPath, e);
      return false;
    }
  }

  /** Cuts the extension down to {@code MAX_LENGTH_OF_EXTENSION} characters, logging when it does so. */
  private static String truncateExtension(String fileExtension) {
    if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) {
      LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION);
      return StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION);
    }
    return fileExtension;
  }

  /** Appends one bracketed list of mimeType/count entries and returns the sum of the counts. */
  private static int appendCounts(StringBuilder builder, Map<String, Integer> counts) {
    int total = 0;
    builder.append("[\n");
    for (Map.Entry<String, Integer> entry : counts.entrySet()) {
      builder.append(" {\"mimeType\":\"");
      builder.append(entry.getKey());
      builder.append("\",\"count\":\"");
      builder.append(entry.getValue());
      builder.append("\"}\n");
      total += entry.getValue();
    }
    builder.append("]\n");
    return total;
  }
}