| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.util; |
| |
| import org.apache.commons.codec.digest.DigestUtils; |
| import org.apache.commons.io.FileUtils; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.hadoop.io.MD5Hash; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.lang.invoke.MethodHandles; |
| import java.util.Map; |
| |
| public class DumpFileUtil { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private final static String DIR_PATTERN = "%s/%s/%s"; |
| private final static String FILENAME_PATTERN = "%s_%s.%s"; |
| private final static Integer MAX_LENGTH_OF_FILENAME = 32; |
| private final static Integer MAX_LENGTH_OF_EXTENSION = 5; |
| |
| public static String getUrlMD5(String url) { |
| byte[] digest = MD5Hash.digest(url).getDigest(); |
| |
| StringBuffer sb = new StringBuffer(); |
| for (byte b : digest) { |
| sb.append(String.format("%02x", b & 0xff)); |
| } |
| |
| return sb.toString(); |
| } |
| |
| public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) { |
| String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString(); |
| String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString(); |
| |
| String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName); |
| |
| if (makeDir) { |
| try { |
| FileUtils.forceMkdir(new File(fullDirPath)); |
| } catch (IOException e) { |
| LOG.error("Failed to create dir: {}", fullDirPath); |
| fullDirPath = null; |
| } |
| } |
| |
| return fullDirPath; |
| } |
| |
| public static String createTwoLevelsDirectory(String basePath, String md5) { |
| return createTwoLevelsDirectory(basePath, md5, true); |
| } |
| |
| public static String createFileName(String md5, String fileBaseName, String fileExtension) { |
| if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) { |
| LOG.info("File name is too long. Truncated to {} characters.", MAX_LENGTH_OF_FILENAME); |
| fileBaseName = StringUtils.substring(fileBaseName, 0, MAX_LENGTH_OF_FILENAME); |
| } |
| |
| if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) { |
| LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION); |
| fileExtension = StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION); |
| } |
| |
| // Added to prevent FileNotFoundException (Invalid Argument) - in *nix environment |
| fileBaseName = fileBaseName.replaceAll("\\?", ""); |
| fileExtension = fileExtension.replaceAll("\\?", ""); |
| |
| return String.format(FILENAME_PATTERN, md5, fileBaseName, fileExtension); |
| } |
| |
| public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString, String epochScrapeTime, String fileExtension, boolean makeDir) { |
| String fullDirPath = basePath + File.separator + reverseKey + File.separator + DigestUtils.sha1Hex(urlString); |
| |
| if (makeDir) { |
| try { |
| FileUtils.forceMkdir(new File(fullDirPath)); |
| } catch (IOException e) { |
| LOG.error("Failed to create dir: {}", fullDirPath); |
| fullDirPath = null; |
| } |
| } |
| |
| if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) { |
| LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION); |
| fileExtension = StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION); |
| } |
| |
| String outputFullPath = fullDirPath + File.separator + epochScrapeTime + "." + fileExtension; |
| |
| return outputFullPath; |
| } |
| |
| public static String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) { |
| StringBuilder builder = new StringBuilder(); |
| // print total stats |
| builder.append("\nTOTAL Stats:\n"); |
| builder.append("[\n"); |
| int mimetypeCount = 0; |
| for (String mimeType : typeCounts.keySet()) { |
| builder.append(" {\"mimeType\":\""); |
| builder.append(mimeType); |
| builder.append("\",\"count\":\""); |
| builder.append(typeCounts.get(mimeType)); |
| builder.append("\"}\n"); |
| mimetypeCount += typeCounts.get(mimeType); |
| } |
| builder.append("]\n"); |
| builder.append("Total count: " + mimetypeCount + "\n"); |
| // filtered types stats |
| mimetypeCount = 0; |
| if (!filteredCounts.isEmpty()) { |
| builder.append("\nFILTERED Stats:\n"); |
| builder.append("[\n"); |
| for (String mimeType : filteredCounts.keySet()) { |
| builder.append(" {\"mimeType\":\""); |
| builder.append(mimeType); |
| builder.append("\",\"count\":\""); |
| builder.append(filteredCounts.get(mimeType)); |
| builder.append("\"}\n"); |
| mimetypeCount += filteredCounts.get(mimeType); |
| } |
| builder.append("]\n"); |
| builder.append("Total filtered count: " + mimetypeCount + "\n"); |
| } |
| return builder.toString(); |
| } |
| } |