blob: 2433b28a859c9d7c87f72f7f47d8092933f4994d [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.tools.offlineImageViewer;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.io.IOUtils;
import com.google.common.base.Preconditions;
import com.google.common.io.LimitInputStream;
/**
* This is the tool for analyzing file sizes in the namespace image. In order to
* run the tool one should define a range of integers <tt>[0, maxSize]</tt> by
* specifying <tt>maxSize</tt> and a <tt>step</tt>. The range of integers is
* divided into segments of size <tt>step</tt>:
* <tt>[0, s<sub>1</sub>, ..., s<sub>n-1</sub>, maxSize]</tt>, and the visitor
* calculates how many files in the system fall into each segment
* <tt>[s<sub>i-1</sub>, s<sub>i</sub>)</tt>. Note that files larger than
* <tt>maxSize</tt> always fall into the very last segment.
*
* <h3>Input.</h3>
* <ul>
* <li><tt>filename</tt> specifies the location of the image file;</li>
* <li><tt>maxSize</tt> determines the range <tt>[0, maxSize]</tt> of files
* sizes considered by the visitor;</li>
* <li><tt>step</tt> the range is divided into segments of size step.</li>
* </ul>
*
* <h3>Output.</h3> The output file is formatted as a tab separated two column
* table: Size and NumFiles. Where Size represents the start of the segment, and
* numFiles is the number of files form the image which size falls in this
* segment.
*
*/
final class FileDistributionCalculator {
private final static long MAX_SIZE_DEFAULT = 0x2000000000L; // 1/8 TB = 2^37
private final static int INTERVAL_DEFAULT = 0x200000; // 2 MB = 2^21
private final Configuration conf;
private final long maxSize;
private final int steps;
private final PrintWriter out;
private int[] distribution;
private int totalFiles;
private int totalDirectories;
private int totalBlocks;
private long totalSpace;
private long maxFileSize;
FileDistributionCalculator(Configuration conf, long maxSize, int steps,
PrintWriter out) {
this.conf = conf;
this.maxSize = maxSize == 0 ? MAX_SIZE_DEFAULT : maxSize;
this.steps = steps == 0 ? INTERVAL_DEFAULT : steps;
this.out = out;
long numIntervals = this.maxSize / this.steps;
this.distribution = new int[1 + (int) (numIntervals)];
Preconditions.checkState(numIntervals < Integer.MAX_VALUE,
"Too many distribution intervals");
}
void visit(RandomAccessFile file) throws IOException {
if (!FSImageUtil.checkFileFormat(file)) {
throw new IOException("Unrecognized FSImage");
}
FileSummary summary = FSImageUtil.loadSummary(file);
FileInputStream in = null;
try {
in = new FileInputStream(file.getFD());
for (FileSummary.Section s : summary.getSectionsList()) {
if (SectionName.fromString(s.getName()) != SectionName.INODE) {
continue;
}
in.getChannel().position(s.getOffset());
InputStream is = FSImageUtil.wrapInputStreamForCompression(conf,
summary.getCodec(), new BufferedInputStream(new LimitInputStream(
in, s.getLength())));
run(is);
output();
}
} finally {
IOUtils.cleanup(null, in);
}
}
private void run(InputStream in) throws IOException {
INodeSection s = INodeSection.parseDelimitedFrom(in);
for (int i = 0; i < s.getNumInodes(); ++i) {
INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
if (p.getType() == INodeSection.INode.Type.FILE) {
++totalFiles;
INodeSection.INodeFile f = p.getFile();
totalBlocks += f.getBlocksCount();
long fileSize = 0;
for (BlockProto b : f.getBlocksList()) {
fileSize += b.getNumBytes() * f.getReplication();
}
maxFileSize = Math.max(fileSize, maxFileSize);
totalSpace += fileSize;
int bucket = fileSize > maxSize ? distribution.length - 1 : (int) Math
.ceil((double)fileSize / steps);
++distribution[bucket];
} else if (p.getType() == INodeSection.INode.Type.DIRECTORY) {
++totalDirectories;
}
if (i % (1 << 20) == 0) {
out.println("Processed " + i + " inodes.");
}
}
}
private void output() {
// write the distribution into the output file
out.print("Size\tNumFiles\n");
for (int i = 0; i < distribution.length; i++) {
if (distribution[i] != 0) {
out.print(((long) i * steps) + "\t" + distribution[i]);
out.print('\n');
}
}
out.print("totalFiles = " + totalFiles + "\n");
out.print("totalDirectories = " + totalDirectories + "\n");
out.print("totalBlocks = " + totalBlocks + "\n");
out.print("totalSpace = " + totalSpace + "\n");
out.print("maxFileSize = " + maxFileSize + "\n");
}
}