| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * <p/> |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * <p/> |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.orc.tools; |
| |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.StandardCharsets; |
| import java.text.DecimalFormat; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.List; |
| |
| import org.apache.commons.cli.CommandLine; |
| import org.apache.commons.cli.GnuParser; |
| import org.apache.commons.cli.HelpFormatter; |
| import org.apache.commons.cli.OptionBuilder; |
| import org.apache.commons.cli.Options; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataInputStream; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.fs.PathFilter; |
| import org.apache.hadoop.hdfs.DistributedFileSystem; |
| import org.apache.orc.impl.ReaderImpl; |
| import org.apache.orc.util.BloomFilter; |
| import org.apache.orc.util.BloomFilterIO; |
| import org.apache.orc.ColumnStatistics; |
| import org.apache.orc.CompressionKind; |
| import org.apache.orc.OrcFile; |
| import org.apache.orc.Reader; |
| import org.apache.orc.TypeDescription; |
| import org.apache.orc.Writer; |
| import org.apache.orc.impl.ColumnStatisticsImpl; |
| import org.apache.orc.impl.OrcAcidUtils; |
| import org.apache.orc.impl.OrcIndex; |
| import org.apache.orc.OrcProto; |
| import org.apache.orc.StripeInformation; |
| import org.apache.orc.StripeStatistics; |
| import org.apache.orc.impl.RecordReaderImpl; |
| |
| /** |
| * A tool for printing out the file structure of ORC files. |
| */ |
| public final class FileDump { |
| public static final String UNKNOWN = "UNKNOWN"; |
| public static final String SEPARATOR = StringUtils.repeat("_", 120) + "\n"; |
| public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024; |
| public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir"); |
| public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() { |
| public boolean accept(Path p) { |
| String name = p.getName(); |
| return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith( |
| OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX); |
| } |
| }; |
| |
| // not used |
| private FileDump() { |
| } |
| |
| public static void main(Configuration conf, String[] args) throws Exception { |
| List<Integer> rowIndexCols = new ArrayList<Integer>(0); |
| Options opts = createOptions(); |
| CommandLine cli = new GnuParser().parse(opts, args); |
| |
| if (cli.hasOption('h')) { |
| HelpFormatter formatter = new HelpFormatter(); |
| formatter.printHelp("orcfiledump", opts); |
| return; |
| } |
| |
| boolean dumpData = cli.hasOption('d'); |
| boolean recover = cli.hasOption("recover"); |
| boolean skipDump = cli.hasOption("skip-dump"); |
| String backupPath = DEFAULT_BACKUP_PATH; |
| if (cli.hasOption("backup-path")) { |
| backupPath = cli.getOptionValue("backup-path"); |
| } |
| |
| if (cli.hasOption("r")) { |
| String val = cli.getOptionValue("r"); |
| if (val != null && val.trim().equals("*")) { |
| rowIndexCols = null; // All the columns |
| } else { |
| String[] colStrs = cli.getOptionValue("r").split(","); |
| rowIndexCols = new ArrayList<Integer>(colStrs.length); |
| for (String colStr : colStrs) { |
| rowIndexCols.add(Integer.parseInt(colStr)); |
| } |
| } |
| } |
| |
| boolean printTimeZone = cli.hasOption('t'); |
| boolean jsonFormat = cli.hasOption('j'); |
| String[] files = cli.getArgs(); |
| if (files.length == 0) { |
| System.err.println("Error : ORC files are not specified"); |
| return; |
| } |
| |
| // if the specified path is directory, iterate through all files and print the file dump |
| List<String> filesInPath = new ArrayList<>(); |
| for (String filename : files) { |
| Path path = new Path(filename); |
| filesInPath.addAll(getAllFilesInPath(path, conf)); |
| } |
| |
| if (dumpData) { |
| PrintData.main(conf, filesInPath.toArray(new String[filesInPath.size()])); |
| } else if (recover && skipDump) { |
| recoverFiles(filesInPath, conf, backupPath); |
| } else { |
| if (jsonFormat) { |
| boolean prettyPrint = cli.hasOption('p'); |
| JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone); |
| } else { |
| printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath); |
| } |
| } |
| } |
| |
| public static void main(String[] args) throws Exception { |
| Configuration conf = new Configuration(); |
| main(conf, args); |
| } |
| |
| /** |
| * This method returns an ORC reader object if the specified file is readable. If the specified |
| * file has side file (_flush_length) file, then max footer offset will be read from the side |
| * file and orc reader will be created from that offset. Since both data file and side file |
| * use hflush() for flushing the data, there could be some inconsistencies and both files could be |
| * out-of-sync. Following are the cases under which null will be returned |
| * |
| * 1) If the file specified by path or its side file is still open for writes |
| * 2) If *_flush_length file does not return any footer offset |
| * 3) If *_flush_length returns a valid footer offset but the data file is not readable at that |
| * position (incomplete data file) |
| * 4) If *_flush_length file length is not a multiple of 8, then reader will be created from |
| * previous valid footer. If there is no such footer (file length > 0 and < 8), then null will |
| * be returned |
| * |
| * Also, if this method detects any file corruption (mismatch between data file and side file) |
| * then it will add the corresponding file to the specified input list for corrupted files. |
| * |
| * In all other cases, where the file is readable this method will return a reader object. |
| * |
| * @param path - file to get reader for |
| * @param conf - configuration object |
| * @param corruptFiles - fills this list with all possible corrupted files |
| * @return - reader for the specified file or null |
| * @throws IOException |
| */ |
| static Reader getReader(final Path path, final Configuration conf, |
| final List<String> corruptFiles) throws IOException { |
| FileSystem fs = path.getFileSystem(conf); |
| long dataFileLen = fs.getFileStatus(path).getLen(); |
| System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]"); |
| Path sideFile = OrcAcidUtils.getSideFile(path); |
| final boolean sideFileExists = fs.exists(sideFile); |
| boolean openDataFile = false; |
| boolean openSideFile = false; |
| if (fs instanceof DistributedFileSystem) { |
| DistributedFileSystem dfs = (DistributedFileSystem) fs; |
| openDataFile = !dfs.isFileClosed(path); |
| openSideFile = sideFileExists && !dfs.isFileClosed(sideFile); |
| } |
| |
| if (openDataFile || openSideFile) { |
| if (openDataFile && openSideFile) { |
| System.err.println("Unable to perform file dump as " + path + " and " + sideFile + |
| " are still open for writes."); |
| } else if (openSideFile) { |
| System.err.println("Unable to perform file dump as " + sideFile + |
| " is still open for writes."); |
| } else { |
| System.err.println("Unable to perform file dump as " + path + |
| " is still open for writes."); |
| } |
| |
| return null; |
| } |
| |
| Reader reader = null; |
| if (sideFileExists) { |
| final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path); |
| final long sideFileLen = fs.getFileStatus(sideFile).getLen(); |
| System.err.println("Found flush length file " + sideFile |
| + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]"); |
| // no offsets read from side file |
| if (maxLen == -1) { |
| |
| // if data file is larger than last flush length, then additional data could be recovered |
| if (dataFileLen > maxLen) { |
| System.err.println("Data file has more data than max footer offset:" + maxLen + |
| ". Adding data file to recovery list."); |
| if (corruptFiles != null) { |
| corruptFiles.add(path.toUri().toString()); |
| } |
| } |
| return null; |
| } |
| |
| try { |
| reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen)); |
| |
| // if data file is larger than last flush length, then additional data could be recovered |
| if (dataFileLen > maxLen) { |
| System.err.println("Data file has more data than max footer offset:" + maxLen + |
| ". Adding data file to recovery list."); |
| if (corruptFiles != null) { |
| corruptFiles.add(path.toUri().toString()); |
| } |
| } |
| } catch (Exception e) { |
| if (corruptFiles != null) { |
| corruptFiles.add(path.toUri().toString()); |
| } |
| System.err.println("Unable to read data from max footer offset." + |
| " Adding data file to recovery list."); |
| return null; |
| } |
| } else { |
| reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); |
| } |
| |
| return reader; |
| } |
| |
| public static Collection<String> getAllFilesInPath(final Path path, |
| final Configuration conf) throws IOException { |
| List<String> filesInPath = new ArrayList<>(); |
| FileSystem fs = path.getFileSystem(conf); |
| FileStatus fileStatus = fs.getFileStatus(path); |
| if (fileStatus.isDir()) { |
| FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER); |
| for (FileStatus fileInPath : fileStatuses) { |
| if (fileInPath.isDir()) { |
| filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf)); |
| } else { |
| filesInPath.add(fileInPath.getPath().toString()); |
| } |
| } |
| } else { |
| filesInPath.add(path.toString()); |
| } |
| |
| return filesInPath; |
| } |
| |
| private static void printMetaData(List<String> files, Configuration conf, |
| List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover, |
| final String backupPath) |
| throws IOException { |
| List<String> corruptFiles = new ArrayList<>(); |
| for (String filename : files) { |
| printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles); |
| System.out.println(SEPARATOR); |
| } |
| |
| if (!corruptFiles.isEmpty()) { |
| if (recover) { |
| recoverFiles(corruptFiles, conf, backupPath); |
| } else { |
| System.err.println(corruptFiles.size() + " file(s) are corrupted." + |
| " Run the following command to recover corrupted files.\n"); |
| StringBuilder buffer = new StringBuilder(); |
| buffer.append("hive --orcfiledump --recover --skip-dump"); |
| for(String file: corruptFiles) { |
| buffer.append(' '); |
| buffer.append(file); |
| } |
| System.err.println(buffer.toString()); |
| System.out.println(SEPARATOR); |
| } |
| } |
| } |
| |
| static void printTypeAnnotations(TypeDescription type, String prefix) { |
| List<String> attributes = type.getAttributeNames(); |
| if (attributes.size() > 0) { |
| System.out.println("Attributes on " + prefix); |
| for(String attr: attributes) { |
| System.out.println(" " + attr + ": " + type.getAttributeValue(attr)); |
| } |
| } |
| List<TypeDescription> children = type.getChildren(); |
| if (children != null) { |
| switch (type.getCategory()) { |
| case STRUCT: |
| List<String> fields = type.getFieldNames(); |
| for(int c = 0; c < children.size(); ++c) { |
| printTypeAnnotations(children.get(c), prefix + "." + fields.get(c)); |
| } |
| break; |
| case MAP: |
| printTypeAnnotations(children.get(0), prefix + "._key"); |
| printTypeAnnotations(children.get(1), prefix + "._value"); |
| break; |
| case LIST: |
| printTypeAnnotations(children.get(0), prefix + "._elem"); |
| break; |
| case UNION: |
| for(int c = 0; c < children.size(); ++c) { |
| printTypeAnnotations(children.get(c), prefix + "._" + c); |
| } |
| break; |
| } |
| } |
| } |
| |
| private static void printMetaDataImpl(final String filename, |
| final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone, |
| final List<String> corruptFiles) throws IOException { |
| Path file = new Path(filename); |
| Reader reader = getReader(file, conf, corruptFiles); |
| // if we can create reader then footer is not corrupt and file will readable |
| if (reader == null) { |
| return; |
| } |
| TypeDescription schema = reader.getSchema(); |
| System.out.println("Structure for " + filename); |
| System.out.println("File Version: " + reader.getFileVersion().getName() + |
| " with " + reader.getWriterVersion()); |
| RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); |
| System.out.println("Rows: " + reader.getNumberOfRows()); |
| System.out.println("Compression: " + reader.getCompressionKind()); |
| if (reader.getCompressionKind() != CompressionKind.NONE) { |
| System.out.println("Compression size: " + reader.getCompressionSize()); |
| } |
| System.out.println("Calendar: " + (reader.writerUsedProlepticGregorian() |
| ? "Proleptic Gregorian" |
| : "Julian/Gregorian")); |
| System.out.println("Type: " + reader.getSchema().toString()); |
| printTypeAnnotations(reader.getSchema(), "root"); |
| System.out.println("\nStripe Statistics:"); |
| List<StripeStatistics> stripeStats = reader.getStripeStatistics(); |
| for (int n = 0; n < stripeStats.size(); n++) { |
| System.out.println(" Stripe " + (n + 1) + ":"); |
| StripeStatistics ss = stripeStats.get(n); |
| for (int i = 0; i < ss.getColumnStatistics().length; ++i) { |
| System.out.println(" Column " + i + ": " + |
| ss.getColumnStatistics()[i].toString()); |
| } |
| } |
| ColumnStatistics[] stats = reader.getStatistics(); |
| int colCount = stats.length; |
| if (rowIndexCols == null) { |
| rowIndexCols = new ArrayList<>(colCount); |
| for (int i = 0; i < colCount; ++i) { |
| rowIndexCols.add(i); |
| } |
| } |
| System.out.println("\nFile Statistics:"); |
| for (int i = 0; i < stats.length; ++i) { |
| System.out.println(" Column " + i + ": " + stats[i].toString()); |
| } |
| System.out.println("\nStripes:"); |
| int stripeIx = -1; |
| for (StripeInformation stripe : reader.getStripes()) { |
| ++stripeIx; |
| long stripeStart = stripe.getOffset(); |
| OrcProto.StripeFooter footer = rows.readStripeFooter(stripe); |
| if (printTimeZone) { |
| String tz = footer.getWriterTimezone(); |
| if (tz == null || tz.isEmpty()) { |
| tz = UNKNOWN; |
| } |
| System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz); |
| } else { |
| System.out.println(" Stripe: " + stripe.toString()); |
| } |
| long sectionStart = stripeStart; |
| for (OrcProto.Stream section : footer.getStreamsList()) { |
| String kind = section.hasKind() ? section.getKind().name() : UNKNOWN; |
| System.out.println(" Stream: column " + section.getColumn() + |
| " section " + kind + " start: " + sectionStart + |
| " length " + section.getLength()); |
| sectionStart += section.getLength(); |
| } |
| for (int i = 0; i < footer.getColumnsCount(); ++i) { |
| OrcProto.ColumnEncoding encoding = footer.getColumns(i); |
| StringBuilder buf = new StringBuilder(); |
| buf.append(" Encoding column "); |
| buf.append(i); |
| buf.append(": "); |
| buf.append(encoding.getKind()); |
| if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY || |
| encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) { |
| buf.append("["); |
| buf.append(encoding.getDictionarySize()); |
| buf.append("]"); |
| } |
| System.out.println(buf); |
| } |
| if (rowIndexCols != null && !rowIndexCols.isEmpty()) { |
| // include the columns that are specified, only if the columns are included, bloom filter |
| // will be read |
| boolean[] sargColumns = new boolean[colCount]; |
| for (int colIdx : rowIndexCols) { |
| sargColumns[colIdx] = true; |
| } |
| OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns); |
| for (int col : rowIndexCols) { |
| StringBuilder buf = new StringBuilder(); |
| String rowIdxString = getFormattedRowIndices(col, |
| indices.getRowGroupIndex(), schema, (ReaderImpl) reader); |
| buf.append(rowIdxString); |
| String bloomFilString = getFormattedBloomFilters(col, indices, |
| reader.getWriterVersion(), |
| reader.getSchema().findSubtype(col).getCategory(), |
| footer.getColumns(col)); |
| buf.append(bloomFilString); |
| System.out.println(buf); |
| } |
| } |
| } |
| |
| FileSystem fs = file.getFileSystem(conf); |
| long fileLen = fs.getFileStatus(file).getLen(); |
| long paddedBytes = getTotalPaddingSize(reader); |
| // empty ORC file is ~45 bytes. Assumption here is file length always >0 |
| double percentPadding = ((double) paddedBytes / (double) fileLen) * 100; |
| DecimalFormat format = new DecimalFormat("##.##"); |
| System.out.println("\nFile length: " + fileLen + " bytes"); |
| System.out.println("Padding length: " + paddedBytes + " bytes"); |
| System.out.println("Padding ratio: " + format.format(percentPadding) + "%"); |
| //print out any user metadata properties |
| List<String> keys = reader.getMetadataKeys(); |
| for(int i = 0; i < keys.size(); i++) { |
| if(i == 0) { |
| System.out.println("\nUser Metadata:"); |
| } |
| ByteBuffer byteBuffer = reader.getMetadataValue(keys.get(i)); |
| System.out.println(" " + keys.get(i) + "=" |
| + StandardCharsets.UTF_8.decode(byteBuffer)); |
| } |
| rows.close(); |
| } |
| |
| private static void recoverFiles(final List<String> corruptFiles, final Configuration conf, |
| final String backup) |
| throws IOException { |
| for (String corruptFile : corruptFiles) { |
| System.err.println("Recovering file " + corruptFile); |
| Path corruptPath = new Path(corruptFile); |
| FileSystem fs = corruptPath.getFileSystem(conf); |
| FSDataInputStream fdis = fs.open(corruptPath); |
| try { |
| long corruptFileLen = fs.getFileStatus(corruptPath).getLen(); |
| long remaining = corruptFileLen; |
| List<Long> footerOffsets = new ArrayList<>(); |
| |
| // start reading the data file form top to bottom and record the valid footers |
| while (remaining > 0) { |
| int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining); |
| byte[] data = new byte[toRead]; |
| long startPos = corruptFileLen - remaining; |
| fdis.readFully(startPos, data, 0, toRead); |
| |
| // find all MAGIC string and see if the file is readable from there |
| int index = 0; |
| long nextFooterOffset; |
| byte[] magicBytes = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8); |
| while (index != -1) { |
| index = indexOf(data, magicBytes, index + 1); |
| if (index != -1) { |
| nextFooterOffset = startPos + index + magicBytes.length + 1; |
| if (isReadable(corruptPath, conf, nextFooterOffset)) { |
| footerOffsets.add(nextFooterOffset); |
| } |
| } |
| } |
| |
| System.err.println("Scanning for valid footers - startPos: " + startPos + |
| " toRead: " + toRead + " remaining: " + remaining); |
| remaining = remaining - toRead; |
| } |
| |
| System.err.println("Readable footerOffsets: " + footerOffsets); |
| recoverFile(corruptPath, fs, conf, footerOffsets, backup); |
| } catch (Exception e) { |
| Path recoveryFile = getRecoveryFile(corruptPath); |
| if (fs.exists(recoveryFile)) { |
| fs.delete(recoveryFile, false); |
| } |
| System.err.println("Unable to recover file " + corruptFile); |
| e.printStackTrace(); |
| System.err.println(SEPARATOR); |
| continue; |
| } finally { |
| fdis.close(); |
| } |
| System.err.println(corruptFile + " recovered successfully!"); |
| System.err.println(SEPARATOR); |
| } |
| } |
| |
| private static void recoverFile(final Path corruptPath, final FileSystem fs, |
| final Configuration conf, final List<Long> footerOffsets, final String backup) |
| throws IOException { |
| |
| // first recover the file to .recovered file and then once successful rename it to actual file |
| Path recoveredPath = getRecoveryFile(corruptPath); |
| |
| // make sure that file does not exist |
| if (fs.exists(recoveredPath)) { |
| fs.delete(recoveredPath, false); |
| } |
| |
| // if there are no valid footers, the file should still be readable so create an empty orc file |
| if (footerOffsets == null || footerOffsets.isEmpty()) { |
| System.err.println("No readable footers found. Creating empty orc file."); |
| TypeDescription schema = TypeDescription.createStruct(); |
| Writer writer = OrcFile.createWriter(recoveredPath, |
| OrcFile.writerOptions(conf).setSchema(schema)); |
| writer.close(); |
| } else { |
| FSDataInputStream fdis = fs.open(corruptPath); |
| FileStatus fileStatus = fs.getFileStatus(corruptPath); |
| // read corrupt file and copy it to recovered file until last valid footer |
| FSDataOutputStream fdos = fs.create(recoveredPath, true, |
| conf.getInt("io.file.buffer.size", 4096), |
| fileStatus.getReplication(), |
| fileStatus.getBlockSize()); |
| try { |
| long fileLen = footerOffsets.get(footerOffsets.size() - 1); |
| long remaining = fileLen; |
| |
| while (remaining > 0) { |
| int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining); |
| byte[] data = new byte[toRead]; |
| long startPos = fileLen - remaining; |
| fdis.readFully(startPos, data, 0, toRead); |
| fdos.write(data); |
| System.err.println("Copying data to recovery file - startPos: " + startPos + |
| " toRead: " + toRead + " remaining: " + remaining); |
| remaining = remaining - toRead; |
| } |
| } catch (Exception e) { |
| fs.delete(recoveredPath, false); |
| throw new IOException(e); |
| } finally { |
| fdis.close(); |
| fdos.close(); |
| } |
| } |
| |
| // validate the recovered file once again and start moving corrupt files to backup folder |
| if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) { |
| Path backupDataPath; |
| String scheme = corruptPath.toUri().getScheme(); |
| String authority = corruptPath.toUri().getAuthority(); |
| String filePath = corruptPath.toUri().getPath(); |
| |
| // use the same filesystem as corrupt file if backup-path is not explicitly specified |
| if (backup.equals(DEFAULT_BACKUP_PATH)) { |
| backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath); |
| } else { |
| backupDataPath = Path.mergePaths(new Path(backup), corruptPath); |
| } |
| |
| // Move data file to backup path |
| moveFiles(fs, corruptPath, backupDataPath); |
| |
| // Move side file to backup path |
| Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath); |
| Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName()); |
| moveFiles(fs, sideFilePath, backupSideFilePath); |
| |
| // finally move recovered file to actual file |
| moveFiles(fs, recoveredPath, corruptPath); |
| |
| // we are done recovering, backing up and validating |
| System.err.println("Validation of recovered file successful!"); |
| } |
| } |
| |
| private static void moveFiles(final FileSystem fs, final Path src, final Path dest) |
| throws IOException { |
| try { |
| // create the dest directory if not exist |
| if (!fs.exists(dest.getParent())) { |
| fs.mkdirs(dest.getParent()); |
| } |
| |
| // if the destination file exists for some reason delete it |
| fs.delete(dest, false); |
| |
| if (fs.rename(src, dest)) { |
| System.err.println("Moved " + src + " to " + dest); |
| } else { |
| throw new IOException("Unable to move " + src + " to " + dest); |
| } |
| |
| } catch (Exception e) { |
| throw new IOException("Unable to move " + src + " to " + dest, e); |
| } |
| } |
| |
| private static Path getRecoveryFile(final Path corruptPath) { |
| return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered"); |
| } |
| |
| private static boolean isReadable(final Path corruptPath, final Configuration conf, |
| final long maxLen) { |
| try { |
| OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen)); |
| return true; |
| } catch (Exception e) { |
| // ignore this exception as maxLen is unreadable |
| return false; |
| } |
| } |
| |
| // search for byte pattern in another byte array |
| private static int indexOf(final byte[] data, final byte[] pattern, final int index) { |
| if (data == null || data.length == 0 || pattern == null || pattern.length == 0 || |
| index > data.length || index < 0) { |
| return -1; |
| } |
| |
| int j = 0; |
| for (int i = index; i < data.length; i++) { |
| if (pattern[j] == data[i]) { |
| j++; |
| } else { |
| j = 0; |
| } |
| |
| if (j == pattern.length) { |
| return i - pattern.length + 1; |
| } |
| } |
| |
| return -1; |
| } |
| |
| private static String getFormattedBloomFilters(int col, OrcIndex index, |
| OrcFile.WriterVersion version, |
| TypeDescription.Category type, |
| OrcProto.ColumnEncoding encoding) { |
| OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex(); |
| StringBuilder buf = new StringBuilder(); |
| BloomFilter stripeLevelBF = null; |
| if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { |
| int idx = 0; |
| buf.append("\n Bloom filters for column ").append(col).append(":"); |
| for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { |
| BloomFilter toMerge = BloomFilterIO.deserialize( |
| index.getBloomFilterKinds()[col], encoding, version, type, bf); |
| buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge)); |
| if (stripeLevelBF == null) { |
| stripeLevelBF = toMerge; |
| } else { |
| stripeLevelBF.merge(toMerge); |
| } |
| } |
| String bloomFilterStats = getBloomFilterStats(stripeLevelBF); |
| buf.append("\n Stripe level merge:").append(bloomFilterStats); |
| } |
| return buf.toString(); |
| } |
| |
| private static String getBloomFilterStats(BloomFilter bf) { |
| StringBuilder sb = new StringBuilder(); |
| int bitCount = bf.getBitSize(); |
| int popCount = 0; |
| for (long l : bf.getBitSet()) { |
| popCount += Long.bitCount(l); |
| } |
| int k = bf.getNumHashFunctions(); |
| float loadFactor = (float) popCount / (float) bitCount; |
| float expectedFpp = (float) Math.pow(loadFactor, k); |
| DecimalFormat df = new DecimalFormat("###.####"); |
| sb.append(" numHashFunctions: ").append(k); |
| sb.append(" bitCount: ").append(bitCount); |
| sb.append(" popCount: ").append(popCount); |
| sb.append(" loadFactor: ").append(df.format(loadFactor)); |
| sb.append(" expectedFpp: ").append(expectedFpp); |
| return sb.toString(); |
| } |
| |
| private static String getFormattedRowIndices(int col, |
| OrcProto.RowIndex[] rowGroupIndex, |
| TypeDescription schema, |
| ReaderImpl reader) { |
| StringBuilder buf = new StringBuilder(); |
| OrcProto.RowIndex index; |
| buf.append(" Row group indices for column ").append(col).append(":"); |
| if (rowGroupIndex == null || (col >= rowGroupIndex.length) || |
| ((index = rowGroupIndex[col]) == null)) { |
| buf.append(" not found\n"); |
| return buf.toString(); |
| } |
| |
| TypeDescription colSchema = schema.findSubtype(col); |
| for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) { |
| buf.append("\n Entry ").append(entryIx).append(": "); |
| OrcProto.RowIndexEntry entry = index.getEntry(entryIx); |
| if (entry == null) { |
| buf.append("unknown\n"); |
| continue; |
| } |
| OrcProto.ColumnStatistics colStats = entry.getStatistics(); |
| if (colStats == null) { |
| buf.append("no stats at "); |
| } else { |
| ColumnStatistics cs = |
| ColumnStatisticsImpl.deserialize(colSchema, colStats, reader); |
| buf.append(cs.toString()); |
| } |
| buf.append(" positions: "); |
| for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) { |
| if (posIx != 0) { |
| buf.append(","); |
| } |
| buf.append(entry.getPositions(posIx)); |
| } |
| } |
| return buf.toString(); |
| } |
| |
| public static long getTotalPaddingSize(Reader reader) throws IOException { |
| long paddedBytes = 0; |
| List<StripeInformation> stripes = reader.getStripes(); |
| for (int i = 1; i < stripes.size(); i++) { |
| long prevStripeOffset = stripes.get(i - 1).getOffset(); |
| long prevStripeLen = stripes.get(i - 1).getLength(); |
| paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen); |
| } |
| return paddedBytes; |
| } |
| |
| @SuppressWarnings("static-access") |
| static Options createOptions() { |
| Options result = new Options(); |
| |
| // add -d and --data to print the rows |
| result.addOption(OptionBuilder |
| .withLongOpt("data") |
| .withDescription("Should the data be printed") |
| .create('d')); |
| |
| // to avoid breaking unit tests (when run in different time zones) for file dump, printing |
| // of timezone is made optional |
| result.addOption(OptionBuilder |
| .withLongOpt("timezone") |
| .withDescription("Print writer's time zone") |
| .create('t')); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("help") |
| .withDescription("print help message") |
| .create('h')); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("rowindex") |
| .withArgName("comma separated list of column ids for which row index should be printed") |
| .withDescription("Dump stats for column number(s)") |
| .hasArg() |
| .create('r')); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("json") |
| .withDescription("Print metadata in JSON format") |
| .create('j')); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("pretty") |
| .withDescription("Pretty print json metadata output") |
| .create('p')); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("recover") |
| .withDescription("recover corrupted orc files generated by streaming") |
| .create()); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("skip-dump") |
| .withDescription("used along with --recover to directly recover files without dumping") |
| .create()); |
| |
| result.addOption(OptionBuilder |
| .withLongOpt("backup-path") |
| .withDescription("specify a backup path to store the corrupted files (default: /tmp)") |
| .hasArg() |
| .create()); |
| return result; |
| } |
| |
| } |