/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.tools.command;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.CRC32;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPage.Visitor;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveStringifier;
import org.apache.parquet.tools.util.PrettyPrintWriter;
import org.apache.parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;

import com.google.common.base.Joiner;

import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

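/**
 * parquet-tools "dump" command: prints the row group and page metadata and/or
 * the column values of a Parquet file to stdout.
 *
 * Example invocation (the launcher name assumes the standard parquet-tools
 * wrapper script; adjust for your environment):
 *
 *   parquet-tools dump --disable-data -c col_a input.parquet
 */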
public class DumpCommand extends ArgsOnlyCommand {
    // A single shared decoder is reused below; CharsetDecoder is stateful, so
    // binaryToString is not thread-safe. Acceptable for this single-threaded tool.
    private static final CharsetDecoder UTF8_DECODER = StandardCharsets.UTF_8.newDecoder();

    public static final String TABS = "\t";
    public static final int BLOCK_BUFFER_SIZE = 64 * 1024;
    public static final String[] USAGE = new String[] {
        "<input>",
        "where <input> is the parquet file to print to stdout"
    };

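    // Reused for page checksum verification. The shared mutable state makes
    // verifyCrc non-thread-safe, which is fine for this single-threaded CLI.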
    private static final CRC32 crc = new CRC32();

    public static final Options OPTIONS;
    static {
        OPTIONS = new Options();
        Option md = OptionBuilder.withLongOpt("disable-meta")
                                 .withDescription("Do not dump row group and page metadata")
                                 .create('m');

        Option dt = OptionBuilder.withLongOpt("disable-data")
                                 .withDescription("Do not dump column data")
                                 .create('d');

        Option nocrop = OptionBuilder.withLongOpt("disable-crop")
                                     .withDescription("Do not crop the output based on console width")
                                     .create('n');

        Option cl = OptionBuilder.withLongOpt("column")
                                 .withDescription("Dump only the given column; may be specified more than once")
                                 .hasArg()
                                 .create('c');

        OPTIONS.addOption(md);
        OPTIONS.addOption(dt);
        OPTIONS.addOption(nocrop);
        OPTIONS.addOption(cl);
    }

    public DumpCommand() {
        super(1, 1);
    }

    @Override
    public Options getOptions() {
        return OPTIONS;
    }

    @Override
    public String[] getUsageDescription() {
        return USAGE;
    }

    @Override
    public String getCommandDescription() {
        return "Prints the content and metadata of a Parquet file";
    }

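    /**
     * Reads the footer of the given file, applies the -m/-d/-n/-c flags, and
     * delegates to {@link #dump(PrettyPrintWriter, ParquetMetadata, MessageType, Path, boolean, boolean, Set)}.
     */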
    @Override
    public void execute(CommandLine options) throws Exception {
        super.execute(options);

        String[] args = options.getArgs();
        String input = args[0];

        Configuration conf = new Configuration();
        Path inpath = new Path(input);

        ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
        MessageType schema = metaData.getFileMetaData().getSchema();

        boolean showmd = !options.hasOption('m');
        boolean showdt = !options.hasOption('d');
        boolean cropoutput = !options.hasOption('n');

        Set<String> showColumns = null;
        if (options.hasOption('c')) {
            String[] cols = options.getOptionValues('c');
            showColumns = new HashSet<String>(Arrays.asList(cols));
        }

        PrettyPrintWriter out = prettyPrintWriter(cropoutput);
        dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
    }

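    /**
     * Dumps the file: first the per-row-group metadata (when showmd is set),
     * then the values of each selected column (when showdt is set). When
     * showColumns is non-null, only columns whose dotted path appears in the
     * set are dumped.
     */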
    public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema,
            Path inpath, boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
        Configuration conf = new Configuration();

        List<BlockMetaData> blocks = meta.getBlocks();
        List<ColumnDescriptor> columns = schema.getColumns();
        if (showColumns != null) {
            columns = new ArrayList<ColumnDescriptor>();
            for (ColumnDescriptor column : schema.getColumns()) {
                String path = Joiner.on('.').skipNulls().join(column.getPath());
                if (showColumns.contains(path)) {
                    columns.add(column);
                }
            }
        }

        ParquetFileReader freader = null;
        if (showmd) {
            try {
                long group = 0;
                for (BlockMetaData block : blocks) {
                    if (group != 0) out.println();
                    out.format("row group %d%n", group++);
                    out.rule('-');

                    List<ColumnChunkMetaData> ccmds = block.getColumns();
                    if (showColumns != null) {
                        ccmds = new ArrayList<ColumnChunkMetaData>();
                        for (ColumnChunkMetaData ccmd : block.getColumns()) {
                            String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
                            if (showColumns.contains(path)) {
                                ccmds.add(ccmd);
                            }
                        }
                    }

                    MetadataUtils.showDetails(out, ccmds);

                    List<BlockMetaData> rblocks = Collections.singletonList(block);
                    freader = new ParquetFileReader(
                            conf, meta.getFileMetaData(), inpath, rblocks, columns);
                    PageReadStore store = freader.readNextRowGroup();
                    while (store != null) {
                        out.incrementTabLevel();
                        for (ColumnDescriptor column : columns) {
                            out.println();
                            dump(out, store, column);
                        }
                        out.decrementTabLevel();

                        store = freader.readNextRowGroup();
                    }
                    out.flushColumns();

                    // Close the per-block reader before opening the next one;
                    // deferring this to the finally block would leak one open
                    // file handle per row group.
                    freader.close();
                    freader = null;
                }
            } finally {
                if (freader != null) {
                    freader.close();
                }
            }
        }

        if (showdt) {
            boolean first = true;
            for (ColumnDescriptor column : columns) {
                if (!first || showmd) out.println();
                first = false;

                out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
                out.rule('-');
                try {
                    // Counts row groups, not pages; values are numbered from 1.
                    long group = 1;
                    long total = blocks.size();
                    long offset = 1;
                    freader = new ParquetFileReader(
                            conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
                    PageReadStore store = freader.readNextRowGroup();
                    while (store != null) {
                        ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(
                                store, new DumpGroupConverter(), schema,
                                meta.getFileMetaData().getCreatedBy());
                        dump(out, crstore, column, group++, total, offset);

                        offset += store.getRowCount();
                        store = freader.readNextRowGroup();
                    }
                } finally {
                    // Flush once, in the finally block, so buffered output is
                    // emitted even when reading a column fails part way through.
                    out.flushColumns();
                    if (freader != null) {
                        freader.close();
                    }
                }
            }
        }
    }

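    /**
     * Returns true when the CRC32 of the given page bytes matches the checksum
     * stored in the page header. The stored value is a signed int, so it is
     * widened to an unsigned long before comparison.
     */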
    private static boolean verifyCrc(int referenceCrc, byte[] bytes) {
        crc.reset();
        crc.update(bytes);
        return crc.getValue() == ((long) referenceCrc & 0xffffffffL);
    }

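    /**
     * Dumps page-level metadata for one column chunk: total values (TV), max
     * repetition/definition levels (RL/DL), dictionary size and encoding
     * (DS/DE), and, per page, the level and value encodings, statistics, CRC
     * status, uncompressed size (SZ), and value count (VC).
     */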
    public static void dump(final PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
        PageReader reader = store.getPageReader(column);

        long vc = reader.getTotalValueCount();
        int rmax = column.getMaxRepetitionLevel();
        int dmax = column.getMaxDefinitionLevel();
        out.format("%s TV=%d RL=%d DL=%d", Joiner.on('.').skipNulls().join(column.getPath()), vc, rmax, dmax);

        DictionaryPage dict = reader.readDictionaryPage();
        if (dict != null) {
            out.format(" DS:%d", dict.getDictionarySize());
            out.format(" DE:%s", dict.getEncoding());
        }

        out.println();
        out.rule('-');

        DataPage page = reader.readPage();
        for (long count = 0; page != null; count++) {
            out.format("page %d:", count);
            page.accept(new Visitor<Void>() {
                @Override
                public Void visit(DataPageV1 pageV1) {
                    out.format(" DLE:%s", pageV1.getDlEncoding());
                    out.format(" RLE:%s", pageV1.getRlEncoding());
                    out.format(" VLE:%s", pageV1.getValueEncoding());
                    Statistics<?> statistics = pageV1.getStatistics();
                    if (statistics != null) {
                        out.format(" ST:[%s]", statistics);
                    } else {
                        out.format(" ST:[none]");
                    }
                    if (pageV1.getCrc().isPresent()) {
                        try {
                            out.format(" CRC:%s",
                                    verifyCrc(pageV1.getCrc().getAsInt(), pageV1.getBytes().toByteArray())
                                            ? "[verified]" : "[PAGE CORRUPT]");
                        } catch (IOException e) {
                            out.format(" CRC:[error getting page bytes]");
                        }
                    } else {
                        out.format(" CRC:[none]");
                    }
                    return null;
                }

                @Override
                public Void visit(DataPageV2 pageV2) {
                    out.format(" DLE:RLE");
                    out.format(" RLE:RLE");
                    out.format(" VLE:%s", pageV2.getDataEncoding());
                    Statistics<?> statistics = pageV2.getStatistics();
                    if (statistics != null) {
                        out.format(" ST:[%s]", statistics);
                    } else {
                        out.format(" ST:[none]");
                    }
                    return null;
                }
            });
            out.format(" SZ:%d", page.getUncompressedSize());
            out.format(" VC:%d", page.getValueCount());
            out.println();
            page = reader.readPage();
        }
    }

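    /**
     * Dumps the values of one column for one row group. A value is present
     * only when its definition level equals the column's max definition level;
     * otherwise it is printed as &lt;null&gt;.
     */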
    public static void dump(PrettyPrintWriter out, ColumnReadStoreImpl crstore, ColumnDescriptor column,
            long group, long total, long offset) throws IOException {
        int dmax = column.getMaxDefinitionLevel();
        ColumnReader creader = crstore.getColumnReader(column);
        out.format("*** row group %d of %d, values %d to %d ***%n",
                group, total, offset, offset + creader.getTotalValueCount() - 1);

        for (long i = 0, e = creader.getTotalValueCount(); i < e; ++i) {
            int rlvl = creader.getCurrentRepetitionLevel();
            int dlvl = creader.getCurrentDefinitionLevel();

            out.format("value %d: R:%d D:%d V:", offset + i, rlvl, dlvl);
            if (dlvl == dmax) {
                // The value is defined at this level; stringify it by physical type.
                PrimitiveStringifier stringifier = column.getPrimitiveType().stringifier();
                switch (column.getType()) {
                    case FIXED_LEN_BYTE_ARRAY:
                    case INT96:
                    case BINARY:
                        out.print(stringifier.stringify(creader.getBinary()));
                        break;
                    case BOOLEAN:
                        out.print(stringifier.stringify(creader.getBoolean()));
                        break;
                    case DOUBLE:
                        out.print(stringifier.stringify(creader.getDouble()));
                        break;
                    case FLOAT:
                        out.print(stringifier.stringify(creader.getFloat()));
                        break;
                    case INT32:
                        out.print(stringifier.stringify(creader.getInteger()));
                        break;
                    case INT64:
                        out.print(stringifier.stringify(creader.getLong()));
                        break;
                }
            } else {
                out.format("<null>");
            }

            out.println();
            creader.consume();
        }
    }

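    /**
     * Decodes a Binary as UTF-8, or returns an opaque placeholder when the
     * bytes are not valid UTF-8. Not thread-safe (shared decoder).
     */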
    public static String binaryToString(Binary value) {
        byte[] data = value.getBytesUnsafe();
        if (data == null) return null;

        try {
            CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
            return buffer.toString();
        } catch (CharacterCodingException ex) {
            // Not valid UTF-8; fall through to the opaque placeholder below.
        }

        return "<bytes...>";
    }

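    /**
     * Interprets the bytes of a Binary as a big-endian two's-complement
     * integer, the layout used by DECIMAL values backed by binary types.
     */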
    public static BigInteger binaryToBigInteger(Binary value) {
        byte[] data = value.getBytesUnsafe();
        if (data == null) return null;

        return new BigInteger(data);
    }

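    /**
     * Builds the stdout writer: auto-sized columns, single-space padding,
     * newline elimination, and optional cropping to the console width.
     */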
    private static PrettyPrintWriter prettyPrintWriter(boolean cropOutput) {
        PrettyPrintWriter.Builder builder = PrettyPrintWriter.stdoutPrettyPrinter()
                .withAutoColumn()
                .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                .withColumnPadding(1)
                .withMaxBufferedLines(1000000)
                .withFlushOnTab();

        if (cropOutput) {
            builder.withAutoCrop();
        }

        return builder.build();
    }

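    // No-op converters: ColumnReadStoreImpl requires a converter, but dumping
    // reads values directly from the ColumnReader, so nothing is materialized.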
    private static final class DumpGroupConverter extends GroupConverter {
        @Override public void start() { }
        @Override public void end() { }
        @Override public Converter getConverter(int fieldIndex) { return new DumpConverter(); }
    }

    private static final class DumpConverter extends PrimitiveConverter {
        @Override public GroupConverter asGroupConverter() { return new DumpGroupConverter(); }
    }
}