blob: 2347ac7449e473d8507fd070f0159eeebacf40bb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.tools;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Given a set of paths, finds all of the "*.orc" files under them and
 * prints the size of each column, both as a percentage of the total bytes
 * and as the number of bytes per row.
 */
public class ColumnSizes {
  final Configuration conf;
  // Schema of the file passed to the constructor; later files are only
  // counted when their schema equals this one.
  final TypeDescription schema;
  // Accumulated on-disk bytes, indexed by column id.
  final long[] columnSizes;
  // Number of files whose schema matched and whose sizes were accumulated.
  int goodFiles = 0;
  long rows = 0;
  long padding = 0;
  long totalSize = 0;
  long stripeFooterSize = 0;
  long fileFooterSize = 0;
  long stripeIndex = 0;
  // data bytes that aren't assigned to a specific column
  long stripeData = 0;

  /**
   * Creates the accumulator from the first file. The schema of this file
   * becomes the reference schema and the file's sizes are added immediately.
   *
   * @param conf the configuration used to open ORC readers
   * @param file the first ORC file to account for
   * @throws IOException if the file cannot be opened or read
   */
  public ColumnSizes(Configuration conf,
                     LocatedFileStatus file) throws IOException {
    this.conf = conf;
    try (Reader reader = OrcFile.createReader(file.getPath(),
        OrcFile.readerOptions(conf))) {
      this.schema = reader.getSchema();
      // One slot per column id, from 0 through the maximum id.
      columnSizes = new long[schema.getMaximumId() + 1];
      addReader(file, reader);
    }
  }

  /**
   * Walks the stripes of a file and accumulates the padding, index, data,
   * stripe footer, and file footer byte counts.
   *
   * @param file the status of the file, used for its total length
   * @param reader an open reader on the same file
   */
  private void checkStripes(LocatedFileStatus file,
                            Reader reader) {
    // Count the magic as file overhead
    long offset = OrcFile.MAGIC.length();
    fileFooterSize += offset;
    for (StripeInformation stripe: reader.getStripes()) {
      // Any gap between the end of the previous section and the start of
      // this stripe is counted as padding.
      padding += stripe.getOffset() - offset;
      stripeIndex += stripe.getIndexLength();
      stripeData += stripe.getDataLength();
      stripeFooterSize += stripe.getFooterLength();
      offset = stripe.getOffset() + stripe.getLength();
    }
    // Add everything else as the file footer
    fileFooterSize += file.getLen() - offset;
  }

  /**
   * Adds the sizes of one opened file to the running totals, provided that
   * its schema equals the reference schema. Mismatching files are reported
   * on standard error and leave the totals untouched.
   *
   * @param file the status of the file
   * @param reader an open reader on the same file
   * @return true if the file was counted, false on a schema mismatch
   */
  private boolean addReader(LocatedFileStatus file,
                            Reader reader) {
    // Validate that the schemas are the same
    TypeDescription newSchema = reader.getSchema();
    if (schema.equals(newSchema)) {
      goodFiles += 1;
      rows += reader.getNumberOfRows();
      totalSize += file.getLen();
      checkStripes(file, reader);
      ColumnStatistics[] colStats = reader.getStatistics();
      for (int c = 0; c < colStats.length && c < columnSizes.length; c++) {
        columnSizes[c] += colStats[c].getBytesOnDisk();
        // Don't double count. Either count the bytes as stripe data or as
        // part of a column.
        stripeData -= colStats[c].getBytesOnDisk();
      }
    } else {
      System.err.println("Ignoring " + file.getPath()
          + " because of schema mismatch: " + newSchema);
      return false;
    }
    return true;
  }

  /**
   * Opens the given file and adds its sizes to the running totals.
   *
   * @param file the ORC file to add
   * @return true if the file was counted, false if its schema did not match
   * @throws IOException if the file cannot be opened or read
   */
  public boolean addFile(LocatedFileStatus file) throws IOException {
    try (Reader reader = OrcFile.createReader(file.getPath(),
        OrcFile.readerOptions(conf))) {
      return addReader(file, reader);
    }
  }

  /**
   * A named byte count, used as one row of the printed report.
   */
  private static class StringLongPair {
    final String name;
    final long size;

    StringLongPair(String name, long size) {
      this.name = name;
      this.size = size;
    }
  }

  /**
   * Prints one line per non-empty category (columns plus the synthetic
   * "_padding", "_stripe_footer", "_file_footer", "_index", and "_data"
   * entries), ordered by descending size and then ascending name.
   *
   * @param out the stream to print to
   * @param summary whether to print the total size and row count first
   */
  private void printResults(PrintStream out, boolean summary) {
    List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
    for (int column = 0; column < columnSizes.length; ++column) {
      if (columnSizes[column] > 0) {
        sizes.add(new StringLongPair(
            schema.findSubtype(column).getFullFieldName(),
            columnSizes[column]));
      }
    }
    if (padding > 0) {
      sizes.add(new StringLongPair("_padding", padding));
    }
    if (stripeFooterSize > 0) {
      sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize));
    }
    if (fileFooterSize > 0) {
      sizes.add(new StringLongPair("_file_footer", fileFooterSize));
    }
    if (stripeIndex > 0) {
      sizes.add(new StringLongPair("_index", stripeIndex));
    }
    if (stripeData > 0) {
      sizes.add(new StringLongPair("_data", stripeData));
    }
    // sort by descending size, ascending name
    sizes.sort((x, y) -> x.size != y.size ?
        Long.compare(y.size, x.size) : x.name.compareTo(y.name));
    if (summary) {
      out.printf("Total Sizes: %d%n", totalSize);
      out.printf("Total Rows: %d%n", rows);
    }
    out.println("Percent Bytes/Row Name");
    for (StringLongPair item: sizes) {
      // Floating-point division: a zero row count or total size prints as
      // Infinity/NaN rather than throwing.
      out.printf(" %-5.2f %-9.2f %s%n",
          100.0 * item.size / totalSize, (double) item.size / rows, item.name);
    }
  }

  /**
   * Runs the tool: recursively lists the files under each path argument,
   * accumulates the sizes of the ORC files found, and prints the report to
   * standard output. Files that cannot be read or whose schema differs from
   * the first readable file are counted as bad; if any bad file is found the
   * JVM exits with status 1 after printing the report.
   *
   * @param conf the configuration used to resolve file systems and readers
   * @param args the command line arguments (options followed by paths)
   * @throws Exception if the arguments cannot be parsed or listing fails
   */
  public static void main(Configuration conf, String[] args) throws Exception {
    Options opts = createOptions();
    CommandLine cli = new DefaultParser().parse(opts, args);
    if (cli.hasOption('h')) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("sizes", opts);
      return;
    }
    boolean ignoreExtension = cli.hasOption("ignoreExtension");
    boolean summary = cli.hasOption("summary");
    String[] files = cli.getArgs();

    ColumnSizes result = null;
    int totalFiles = 0;
    int badFiles = 0;
    for (String root: files) {
      Path rootPath = new Path(root);
      FileSystem fs = rootPath.getFileSystem(conf);
      RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true);
      while (itr.hasNext()) {
        LocatedFileStatus status = itr.next();
        boolean wanted = ignoreExtension
            || status.getPath().getName().endsWith(".orc");
        if (status.isFile() && wanted) {
          totalFiles += 1;
          try {
            if (result == null) {
              // The first readable file defines the reference schema.
              result = new ColumnSizes(conf, status);
            } else if (!result.addFile(status)) {
              badFiles += 1;
            }
          } catch (IOException err) {
            badFiles += 1;
            // Include the cause so the user can tell why the read failed.
            System.err.println("Failed to read " + status.getPath()
                + ": " + err);
          }
        }
      }
    }
    if (result == null) {
      System.err.println("No files found");
    } else {
      if (summary) {
        System.out.printf("Total Files: %d%n", totalFiles);
      }
      result.printResults(System.out, summary);
    }
    if (badFiles > 0) {
      System.err.println(badFiles + " bad ORC files found.");
      System.exit(1);
    }
  }

  /**
   * Command line entry point; runs the tool with a default configuration.
   *
   * @param args the command line arguments (options followed by paths)
   * @throws Exception if the arguments cannot be parsed or listing fails
   */
  public static void main(String[] args) throws Exception {
    main(new Configuration(), args);
  }

  /**
   * Builds the command line options accepted by the tool.
   *
   * @return the options: ignoreExtension (-i), summary (-s), and help (-h)
   */
  private static Options createOptions() {
    Options result = new Options();
    result.addOption(Option.builder("i")
        .longOpt("ignoreExtension")
        .desc("Ignore ORC file extension")
        .build());
    result.addOption(Option.builder("s")
        .longOpt("summary")
        .desc("Summarize the number of files, file sizes, and file rows")
        .build());
    result.addOption(Option.builder("h")
        .longOpt("help")
        .desc("Print help message")
        .build());
    return result;
  }
}