/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.tools;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.impl.OrcAcidUtils;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.RecordReaderImpl;
import org.apache.orc.util.BloomFilter;
import org.apache.orc.util.BloomFilterIO;
/**
* A tool for printing out the file structure of ORC files.
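 * <p>Illustrative command lines (the file paths are examples; the flags are the
 * ones defined in {@link #createOptions()}):
 * <pre>
 *   orcfiledump /tmp/data.orc                          # print metadata
 *   orcfiledump -j -p /tmp/data.orc                    # pretty-printed JSON metadata
 *   orcfiledump -r 1,2 /tmp/data.orc                   # include row indices for columns 1 and 2
 *   orcfiledump --recover --skip-dump /tmp/streaming/  # recover corrupt streaming files
 * </pre>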
*/
public final class FileDump {
public static final String UNKNOWN = "UNKNOWN";
public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n";
public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
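  // accepts everything except hidden files (names starting with "_" or ".") and
  // ACID side files (names ending with the _flush_length suffix)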
public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
public boolean accept(Path p) {
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
}
};
  // utility class; not meant to be instantiated
private FileDump() {
}
public static void main(Configuration conf, String[] args) throws Exception {
List<Integer> rowIndexCols = new ArrayList<Integer>(0);
Options opts = createOptions();
CommandLine cli = new GnuParser().parse(opts, args);
if (cli.hasOption('h')) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("orcfiledump", opts);
return;
}
boolean dumpData = cli.hasOption('d');
boolean recover = cli.hasOption("recover");
boolean skipDump = cli.hasOption("skip-dump");
String backupPath = DEFAULT_BACKUP_PATH;
if (cli.hasOption("backup-path")) {
backupPath = cli.getOptionValue("backup-path");
}
if (cli.hasOption("r")) {
String val = cli.getOptionValue("r");
if (val != null && val.trim().equals("*")) {
rowIndexCols = null; // All the columns
} else {
String[] colStrs = cli.getOptionValue("r").split(",");
rowIndexCols = new ArrayList<Integer>(colStrs.length);
for (String colStr : colStrs) {
rowIndexCols.add(Integer.parseInt(colStr));
}
}
}
boolean printTimeZone = cli.hasOption('t');
boolean jsonFormat = cli.hasOption('j');
String[] files = cli.getArgs();
if (files.length == 0) {
System.err.println("Error : ORC files are not specified");
return;
}
    // if the specified path is a directory, iterate through all its files and print the file dump for each
List<String> filesInPath = new ArrayList<>();
for (String filename : files) {
Path path = new Path(filename);
filesInPath.addAll(getAllFilesInPath(path, conf));
}
if (dumpData) {
PrintData.main(conf, filesInPath.toArray(new String[filesInPath.size()]));
} else if (recover && skipDump) {
recoverFiles(filesInPath, conf, backupPath);
} else {
if (jsonFormat) {
boolean prettyPrint = cli.hasOption('p');
JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
} else {
printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
main(conf, args);
}
/**
   * This method returns an ORC reader object if the specified file is readable. If the specified
   * file has a side file (_flush_length), then the max footer offset will be read from the side
   * file and the ORC reader will be created from that offset. Since both the data file and the
   * side file use hflush() for flushing data, there could be inconsistencies and the two files
   * could be out of sync. Null will be returned in the following cases:
   *
   * 1) The file specified by path, or its side file, is still open for writes
   * 2) The *_flush_length file does not contain any footer offset
   * 3) The *_flush_length file contains a valid footer offset but the data file is not readable
   *    at that position (incomplete data file)
   * 4) The *_flush_length file length is not a multiple of 8; the reader is then created from
   *    the previous valid footer, and if there is no such footer (file length > 0 and < 8),
   *    null will be returned
   *
   * Also, if this method detects any file corruption (mismatch between the data file and the
   * side file), it adds the corresponding file to the specified list of corrupted files.
   *
   * In all other cases, where the file is readable, this method returns a reader object.
*
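   * <p>An illustrative call (the path and list names here are examples, not part of the API):
   * <pre>
   *   List&lt;String&gt; corruptFiles = new ArrayList&lt;&gt;();
   *   Reader reader = FileDump.getReader(new Path("/tmp/data.orc"), conf, corruptFiles);
   *   if (reader == null) {
   *     // the file is open, out of sync with its side file, or corrupt;
   *     // corruptFiles may now contain the path for recovery
   *   }
   * </pre>
   *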
* @param path - file to get reader for
* @param conf - configuration object
* @param corruptFiles - fills this list with all possible corrupted files
* @return - reader for the specified file or null
* @throws IOException
*/
static Reader getReader(final Path path, final Configuration conf,
final List<String> corruptFiles) throws IOException {
FileSystem fs = path.getFileSystem(conf);
long dataFileLen = fs.getFileStatus(path).getLen();
System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
Path sideFile = OrcAcidUtils.getSideFile(path);
final boolean sideFileExists = fs.exists(sideFile);
boolean openDataFile = false;
boolean openSideFile = false;
if (fs instanceof DistributedFileSystem) {
DistributedFileSystem dfs = (DistributedFileSystem) fs;
openDataFile = !dfs.isFileClosed(path);
openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
}
if (openDataFile || openSideFile) {
if (openDataFile && openSideFile) {
System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
" are still open for writes.");
} else if (openSideFile) {
System.err.println("Unable to perform file dump as " + sideFile +
" is still open for writes.");
} else {
System.err.println("Unable to perform file dump as " + path +
" is still open for writes.");
}
return null;
}
Reader reader = null;
if (sideFileExists) {
final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
final long sideFileLen = fs.getFileStatus(sideFile).getLen();
System.err.println("Found flush length file " + sideFile
+ " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
      // no valid footer offsets were read from the side file; since dataFileLen is
      // always greater than maxLen (-1) here, the data file always goes to the recovery list
      if (maxLen == -1) {
        System.err.println("Data file has more data than max footer offset:" + maxLen +
            ". Adding data file to recovery list.");
        if (corruptFiles != null) {
          corruptFiles.add(path.toUri().toString());
        }
        return null;
      }
try {
reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
// if data file is larger than last flush length, then additional data could be recovered
if (dataFileLen > maxLen) {
System.err.println("Data file has more data than max footer offset:" + maxLen +
". Adding data file to recovery list.");
if (corruptFiles != null) {
corruptFiles.add(path.toUri().toString());
}
}
} catch (Exception e) {
if (corruptFiles != null) {
corruptFiles.add(path.toUri().toString());
}
System.err.println("Unable to read data from max footer offset." +
" Adding data file to recovery list.");
return null;
}
} else {
reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
}
return reader;
}
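  /**
   * Recursively collects all regular files under the given path, skipping hidden
   * files and ACID side files via HIDDEN_AND_SIDE_FILE_FILTER; a path that points
   * at a single file is returned as-is.
   */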
public static Collection<String> getAllFilesInPath(final Path path,
final Configuration conf) throws IOException {
List<String> filesInPath = new ArrayList<>();
FileSystem fs = path.getFileSystem(conf);
FileStatus fileStatus = fs.getFileStatus(path);
    if (fileStatus.isDirectory()) {
      FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
      for (FileStatus fileInPath : fileStatuses) {
        if (fileInPath.isDirectory()) {
filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
} else {
filesInPath.add(fileInPath.getPath().toString());
}
}
} else {
filesInPath.add(path.toString());
}
return filesInPath;
}
private static void printMetaData(List<String> files, Configuration conf,
List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
final String backupPath)
throws IOException {
List<String> corruptFiles = new ArrayList<>();
for (String filename : files) {
printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
System.out.println(SEPARATOR);
}
if (!corruptFiles.isEmpty()) {
if (recover) {
recoverFiles(corruptFiles, conf, backupPath);
} else {
System.err.println(corruptFiles.size() + " file(s) are corrupted." +
" Run the following command to recover corrupted files.\n");
StringBuilder buffer = new StringBuilder();
buffer.append("hive --orcfiledump --recover --skip-dump");
for(String file: corruptFiles) {
buffer.append(' ');
buffer.append(file);
}
System.err.println(buffer.toString());
System.out.println(SEPARATOR);
}
}
}
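  /**
   * Recursively prints any attributes attached to the type and its children,
   * labeling struct children by field name, map children as ._key/._value,
   * list children as ._elem, and union children by ordinal.
   */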
static void printTypeAnnotations(TypeDescription type, String prefix) {
List<String> attributes = type.getAttributeNames();
if (attributes.size() > 0) {
System.out.println("Attributes on " + prefix);
for(String attr: attributes) {
System.out.println(" " + attr + ": " + type.getAttributeValue(attr));
}
}
List<TypeDescription> children = type.getChildren();
if (children != null) {
switch (type.getCategory()) {
case STRUCT:
List<String> fields = type.getFieldNames();
for(int c = 0; c < children.size(); ++c) {
printTypeAnnotations(children.get(c), prefix + "." + fields.get(c));
}
break;
case MAP:
printTypeAnnotations(children.get(0), prefix + "._key");
printTypeAnnotations(children.get(1), prefix + "._value");
break;
case LIST:
printTypeAnnotations(children.get(0), prefix + "._elem");
break;
case UNION:
for(int c = 0; c < children.size(); ++c) {
printTypeAnnotations(children.get(c), prefix + "._" + c);
}
break;
}
}
}
private static void printMetaDataImpl(final String filename,
final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
final List<String> corruptFiles) throws IOException {
Path file = new Path(filename);
Reader reader = getReader(file, conf, corruptFiles);
    // if we can create a reader then the footer is not corrupt and the file is readable
if (reader == null) {
return;
}
TypeDescription schema = reader.getSchema();
System.out.println("Structure for " + filename);
System.out.println("File Version: " + reader.getFileVersion().getName() +
" with " + reader.getWriterVersion());
RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
System.out.println("Rows: " + reader.getNumberOfRows());
System.out.println("Compression: " + reader.getCompressionKind());
if (reader.getCompressionKind() != CompressionKind.NONE) {
System.out.println("Compression size: " + reader.getCompressionSize());
}
System.out.println("Calendar: " + (reader.writerUsedProlepticGregorian()
? "Proleptic Gregorian"
: "Julian/Gregorian"));
System.out.println("Type: " + reader.getSchema().toString());
printTypeAnnotations(reader.getSchema(), "root");
System.out.println("\nStripe Statistics:");
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
for (int n = 0; n < stripeStats.size(); n++) {
System.out.println(" Stripe " + (n + 1) + ":");
StripeStatistics ss = stripeStats.get(n);
for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
System.out.println(" Column " + i + ": " +
ss.getColumnStatistics()[i].toString());
}
}
ColumnStatistics[] stats = reader.getStatistics();
int colCount = stats.length;
if (rowIndexCols == null) {
rowIndexCols = new ArrayList<>(colCount);
for (int i = 0; i < colCount; ++i) {
rowIndexCols.add(i);
}
}
System.out.println("\nFile Statistics:");
for (int i = 0; i < stats.length; ++i) {
System.out.println(" Column " + i + ": " + stats[i].toString());
}
System.out.println("\nStripes:");
int stripeIx = -1;
for (StripeInformation stripe : reader.getStripes()) {
++stripeIx;
long stripeStart = stripe.getOffset();
OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
if (printTimeZone) {
String tz = footer.getWriterTimezone();
if (tz == null || tz.isEmpty()) {
tz = UNKNOWN;
}
System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
} else {
System.out.println(" Stripe: " + stripe.toString());
}
long sectionStart = stripeStart;
for (OrcProto.Stream section : footer.getStreamsList()) {
String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
System.out.println(" Stream: column " + section.getColumn() +
" section " + kind + " start: " + sectionStart +
" length " + section.getLength());
sectionStart += section.getLength();
}
for (int i = 0; i < footer.getColumnsCount(); ++i) {
OrcProto.ColumnEncoding encoding = footer.getColumns(i);
StringBuilder buf = new StringBuilder();
buf.append(" Encoding column ");
buf.append(i);
buf.append(": ");
buf.append(encoding.getKind());
if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
buf.append("[");
buf.append(encoding.getDictionarySize());
buf.append("]");
}
System.out.println(buf);
}
if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
        // mark the columns that were specified; bloom filters are read only for
        // the columns included here
boolean[] sargColumns = new boolean[colCount];
for (int colIdx : rowIndexCols) {
sargColumns[colIdx] = true;
}
OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
for (int col : rowIndexCols) {
StringBuilder buf = new StringBuilder();
String rowIdxString = getFormattedRowIndices(col,
indices.getRowGroupIndex(), schema, (ReaderImpl) reader);
buf.append(rowIdxString);
String bloomFilString = getFormattedBloomFilters(col, indices,
reader.getWriterVersion(),
reader.getSchema().findSubtype(col).getCategory(),
footer.getColumns(col));
buf.append(bloomFilString);
System.out.println(buf);
}
}
}
FileSystem fs = file.getFileSystem(conf);
long fileLen = fs.getFileStatus(file).getLen();
long paddedBytes = getTotalPaddingSize(reader);
    // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
DecimalFormat format = new DecimalFormat("##.##");
System.out.println("\nFile length: " + fileLen + " bytes");
System.out.println("Padding length: " + paddedBytes + " bytes");
System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
//print out any user metadata properties
List<String> keys = reader.getMetadataKeys();
for(int i = 0; i < keys.size(); i++) {
if(i == 0) {
System.out.println("\nUser Metadata:");
}
ByteBuffer byteBuffer = reader.getMetadataValue(keys.get(i));
System.out.println(" " + keys.get(i) + "="
+ StandardCharsets.UTF_8.decode(byteBuffer));
}
rows.close();
}
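  /**
   * Scans each corrupt file in DEFAULT_BLOCK_SIZE chunks for the ORC MAGIC bytes,
   * probes whether the file is readable at each candidate footer offset, and hands
   * the readable offsets to recoverFile(). Failures are logged and recovery moves
   * on to the next file.
   */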
private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
final String backup)
throws IOException {
for (String corruptFile : corruptFiles) {
System.err.println("Recovering file " + corruptFile);
Path corruptPath = new Path(corruptFile);
FileSystem fs = corruptPath.getFileSystem(conf);
FSDataInputStream fdis = fs.open(corruptPath);
try {
long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
long remaining = corruptFileLen;
List<Long> footerOffsets = new ArrayList<>();
        // start reading the data file from top to bottom and record the valid footers
while (remaining > 0) {
int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
byte[] data = new byte[toRead];
long startPos = corruptFileLen - remaining;
fdis.readFully(startPos, data, 0, toRead);
          // find every occurrence of the MAGIC string and check whether the file is readable from there
int index = 0;
long nextFooterOffset;
byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
while (index != -1) {
index = indexOf(data, magicBytes, index + 1);
if (index != -1) {
nextFooterOffset = startPos + index + magicBytes.length + 1;
if (isReadable(corruptPath, conf, nextFooterOffset)) {
footerOffsets.add(nextFooterOffset);
}
}
}
System.err.println("Scanning for valid footers - startPos: " + startPos +
" toRead: " + toRead + " remaining: " + remaining);
remaining = remaining - toRead;
}
System.err.println("Readable footerOffsets: " + footerOffsets);
recoverFile(corruptPath, fs, conf, footerOffsets, backup);
} catch (Exception e) {
Path recoveryFile = getRecoveryFile(corruptPath);
if (fs.exists(recoveryFile)) {
fs.delete(recoveryFile, false);
}
System.err.println("Unable to recover file " + corruptFile);
e.printStackTrace();
System.err.println(SEPARATOR);
continue;
} finally {
fdis.close();
}
System.err.println(corruptFile + " recovered successfully!");
System.err.println(SEPARATOR);
}
}
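  /**
   * Rewrites the corrupt file into a ".recovered" sibling truncated at the last
   * valid footer offset (or as an empty ORC file when no valid footer exists),
   * then, after validating it, backs up the original data and side files and
   * swaps the recovered copy into place.
   */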
private static void recoverFile(final Path corruptPath, final FileSystem fs,
final Configuration conf, final List<Long> footerOffsets, final String backup)
throws IOException {
    // first recover the file to a .recovered file, then, once successful, rename it to the actual file
Path recoveredPath = getRecoveryFile(corruptPath);
    // make sure the recovery file does not already exist
if (fs.exists(recoveredPath)) {
fs.delete(recoveredPath, false);
}
    // if there are no valid footers, the recovered file should still be readable, so create an empty ORC file
if (footerOffsets == null || footerOffsets.isEmpty()) {
System.err.println("No readable footers found. Creating empty orc file.");
TypeDescription schema = TypeDescription.createStruct();
Writer writer = OrcFile.createWriter(recoveredPath,
OrcFile.writerOptions(conf).setSchema(schema));
writer.close();
} else {
FSDataInputStream fdis = fs.open(corruptPath);
FileStatus fileStatus = fs.getFileStatus(corruptPath);
      // read the corrupt file and copy it to the recovered file up to the last valid footer
FSDataOutputStream fdos = fs.create(recoveredPath, true,
conf.getInt("io.file.buffer.size", 4096),
fileStatus.getReplication(),
fileStatus.getBlockSize());
try {
long fileLen = footerOffsets.get(footerOffsets.size() - 1);
long remaining = fileLen;
while (remaining > 0) {
int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
byte[] data = new byte[toRead];
long startPos = fileLen - remaining;
fdis.readFully(startPos, data, 0, toRead);
fdos.write(data);
System.err.println("Copying data to recovery file - startPos: " + startPos +
" toRead: " + toRead + " remaining: " + remaining);
remaining = remaining - toRead;
}
} catch (Exception e) {
fs.delete(recoveredPath, false);
throw new IOException(e);
} finally {
fdis.close();
fdos.close();
}
}
    // validate the recovered file once again, then move the corrupt files to the backup folder
if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
Path backupDataPath;
String scheme = corruptPath.toUri().getScheme();
String authority = corruptPath.toUri().getAuthority();
String filePath = corruptPath.toUri().getPath();
// use the same filesystem as corrupt file if backup-path is not explicitly specified
if (backup.equals(DEFAULT_BACKUP_PATH)) {
backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
} else {
backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
}
// Move data file to backup path
moveFiles(fs, corruptPath, backupDataPath);
// Move side file to backup path
Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
moveFiles(fs, sideFilePath, backupSideFilePath);
// finally move recovered file to actual file
moveFiles(fs, recoveredPath, corruptPath);
// we are done recovering, backing up and validating
System.err.println("Validation of recovered file successful!");
}
}
private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
throws IOException {
try {
      // create the destination directory if it does not exist
if (!fs.exists(dest.getParent())) {
fs.mkdirs(dest.getParent());
}
      // if the destination file exists for some reason, delete it
fs.delete(dest, false);
if (fs.rename(src, dest)) {
System.err.println("Moved " + src + " to " + dest);
} else {
throw new IOException("Unable to move " + src + " to " + dest);
}
} catch (Exception e) {
throw new IOException("Unable to move " + src + " to " + dest, e);
}
}
private static Path getRecoveryFile(final Path corruptPath) {
return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
}
private static boolean isReadable(final Path corruptPath, final Configuration conf,
final long maxLen) {
try {
OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
return true;
} catch (Exception e) {
      // ignore the exception; the file is simply not readable at maxLen
return false;
}
}
  // search for a byte pattern in another byte array, starting at the given index
  private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
    if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
        index > data.length || index < 0) {
      return -1;
    }
    // test the full pattern at each candidate start position so that overlapping
    // partial matches (e.g. "OORC" when looking for "ORC") are not skipped over
    for (int i = index; i <= data.length - pattern.length; i++) {
      boolean matched = true;
      for (int j = 0; j < pattern.length; j++) {
        if (data[i + j] != pattern[j]) {
          matched = false;
          break;
        }
      }
      if (matched) {
        return i;
      }
    }
    return -1;
  }
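  /**
   * Formats every row-group bloom filter for the column, plus a stripe-level
   * view produced by merging them all into the first filter via BloomFilter.merge.
   */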
private static String getFormattedBloomFilters(int col, OrcIndex index,
OrcFile.WriterVersion version,
TypeDescription.Category type,
OrcProto.ColumnEncoding encoding) {
OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex();
StringBuilder buf = new StringBuilder();
BloomFilter stripeLevelBF = null;
if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
int idx = 0;
buf.append("\n Bloom filters for column ").append(col).append(":");
for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
BloomFilter toMerge = BloomFilterIO.deserialize(
index.getBloomFilterKinds()[col], encoding, version, type, bf);
buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
if (stripeLevelBF == null) {
stripeLevelBF = toMerge;
} else {
stripeLevelBF.merge(toMerge);
}
}
String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
buf.append("\n Stripe level merge:").append(bloomFilterStats);
}
return buf.toString();
}
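  /**
   * Formats basic statistics for a bloom filter: with k hash functions and a
   * load factor f (the fraction of set bits), the expected false positive
   * probability is estimated as f^k, matching the computation below.
   */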
private static String getBloomFilterStats(BloomFilter bf) {
StringBuilder sb = new StringBuilder();
int bitCount = bf.getBitSize();
int popCount = 0;
for (long l : bf.getBitSet()) {
popCount += Long.bitCount(l);
}
int k = bf.getNumHashFunctions();
float loadFactor = (float) popCount / (float) bitCount;
float expectedFpp = (float) Math.pow(loadFactor, k);
DecimalFormat df = new DecimalFormat("###.####");
sb.append(" numHashFunctions: ").append(k);
sb.append(" bitCount: ").append(bitCount);
sb.append(" popCount: ").append(popCount);
sb.append(" loadFactor: ").append(df.format(loadFactor));
sb.append(" expectedFpp: ").append(expectedFpp);
return sb.toString();
}
private static String getFormattedRowIndices(int col,
OrcProto.RowIndex[] rowGroupIndex,
TypeDescription schema,
ReaderImpl reader) {
StringBuilder buf = new StringBuilder();
OrcProto.RowIndex index;
buf.append(" Row group indices for column ").append(col).append(":");
if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
((index = rowGroupIndex[col]) == null)) {
buf.append(" not found\n");
return buf.toString();
}
TypeDescription colSchema = schema.findSubtype(col);
for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
buf.append("\n Entry ").append(entryIx).append(": ");
OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
if (entry == null) {
buf.append("unknown\n");
continue;
}
OrcProto.ColumnStatistics colStats = entry.getStatistics();
if (colStats == null) {
buf.append("no stats at ");
} else {
ColumnStatistics cs =
ColumnStatisticsImpl.deserialize(colSchema, colStats, reader);
buf.append(cs.toString());
}
buf.append(" positions: ");
for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
if (posIx != 0) {
buf.append(",");
}
buf.append(entry.getPositions(posIx));
}
}
return buf.toString();
}
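  /**
   * Sums the gaps between consecutive stripes. For example (illustrative numbers),
   * if one stripe occupies [3, 3 + 200) and the next stripe starts at offset 210,
   * that gap contributes 210 - (3 + 200) = 7 bytes of padding.
   */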
public static long getTotalPaddingSize(Reader reader) throws IOException {
long paddedBytes = 0;
List<StripeInformation> stripes = reader.getStripes();
for (int i = 1; i < stripes.size(); i++) {
long prevStripeOffset = stripes.get(i - 1).getOffset();
long prevStripeLen = stripes.get(i - 1).getLength();
paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
}
return paddedBytes;
}
@SuppressWarnings("static-access")
static Options createOptions() {
Options result = new Options();
// add -d and --data to print the rows
result.addOption(OptionBuilder
.withLongOpt("data")
.withDescription("Should the data be printed")
.create('d'));
    // printing of the writer's time zone is optional so that file dump unit tests
    // do not break when they are run in different time zones
result.addOption(OptionBuilder
.withLongOpt("timezone")
.withDescription("Print writer's time zone")
.create('t'));
result.addOption(OptionBuilder
.withLongOpt("help")
.withDescription("print help message")
.create('h'));
result.addOption(OptionBuilder
.withLongOpt("rowindex")
.withArgName("comma separated list of column ids for which row index should be printed")
.withDescription("Dump stats for column number(s)")
.hasArg()
.create('r'));
result.addOption(OptionBuilder
.withLongOpt("json")
.withDescription("Print metadata in JSON format")
.create('j'));
result.addOption(OptionBuilder
.withLongOpt("pretty")
.withDescription("Pretty print json metadata output")
.create('p'));
result.addOption(OptionBuilder
.withLongOpt("recover")
.withDescription("recover corrupted orc files generated by streaming")
.create());
result.addOption(OptionBuilder
.withLongOpt("skip-dump")
.withDescription("used along with --recover to directly recover files without dumping")
.create());
result.addOption(OptionBuilder
.withLongOpt("backup-path")
.withDescription("specify a backup path to store the corrupted files (default: /tmp)")
.hasArg()
.create());
return result;
}
}