Check pages tool
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/CheckPages.java b/parquet-hadoop/src/main/java/parquet/hadoop/CheckPages.java new file mode 100644 index 0000000..fa8aa29 --- /dev/null +++ b/parquet-hadoop/src/main/java/parquet/hadoop/CheckPages.java
@@ -0,0 +1,210 @@ +/** + * Copyright 2014 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package parquet.hadoop; + +import java.net.URI; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import parquet.column.ColumnDescriptor; +import parquet.column.Dictionary; +import parquet.column.ValuesType; +import parquet.column.page.DictionaryPage; +import parquet.column.page.Page; +import parquet.column.page.PageReadStore; +import parquet.column.page.PageReader; +import parquet.column.values.ValuesReader; +import parquet.hadoop.metadata.BlockMetaData; +import parquet.hadoop.metadata.ParquetMetadata; +import parquet.schema.PrimitiveType.PrimitiveTypeName; +import parquet.schema.PrimitiveType.PrimitiveTypeNameConverter; + +/** + * Utility to check pages metadata + * @author Julien Le Dem + * + */ +public class CheckPages { + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + System.err.println("usage CheckPages <path> [<col>]*"); + return; + } + Path path = new Path(new URI(args[0])); + final Configuration configuration = new Configuration(); + + final FileSystem fs = path.getFileSystem(configuration); + FileStatus fileStatus = fs.getFileStatus(path); + ParquetMetadata footer = ParquetFileReader.readFooter(configuration, fileStatus); + List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns(); + if (args.length > 1) { + Set<String> requested = new HashSet<String>(); + for (int i = 1; i < args.length; i++) { + requested.add(args[i]); + } + Iterator<ColumnDescriptor> iterator = columns.iterator(); + while (iterator.hasNext()) { + if (!requested.contains(dotted(iterator.next().getPath()))) { + iterator.remove(); + } + } + } + Map<ColumnDescriptor, Counts> totalCounts = new HashMap<ColumnDescriptor, Counts>(); + for (ColumnDescriptor col : columns) { + totalCounts.put(col, new Counts()); + } + + System.out.println(footer.getBlocks().size() + " blocks"); + int i = 0; + for (BlockMetaData blockMetaData : footer.getBlocks()) { + System.out.println("block " + i); + ++ i; + List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(1); + blocks.add(blockMetaData); + ParquetFileReader reader = new ParquetFileReader(configuration, path, blocks, columns); + PageReadStore pages = reader.readNextRowGroup(); + for (ColumnDescriptor col : columns) { + PageReader pageReader = pages.getPageReader(col); + System.out.println("col " + col + ": values=" + pageReader.getTotalValueCount() + " rows=" + pages.getRowCount()); + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + Dictionary dictionary = null; + if (dictionaryPage != null) { + System.out.println("Dic: " + dictionaryPage); + dictionary = dictionaryPage.getEncoding().initDictionary(col, dictionaryPage); + } + Counts counts = totalCounts.get(col); + Page page; + long totalValues = 0; + long recordCount = 0; + int nulls = 0; + int p = 0; + while ((page = pageReader.readPage()) != null) { + System.out.println("page " + p + ": size=" + page.getBytes().size() + " valueCount=" + page.getValueCount() + " uncompressedSize=" + page.getUncompressedSize()); + ++ p; + int pageValueCount = page.getValueCount(); + totalValues += pageValueCount; + ValuesReader repetitionLevelColumn = page.getRlEncoding().getValuesReader(col, ValuesType.REPETITION_LEVEL); + ValuesReader definitionLevelColumn = page.getDlEncoding().getValuesReader(col, ValuesType.DEFINITION_LEVEL); + final ValuesReader dataColumn; + if (page.getValueEncoding().usesDictionary()) { + dataColumn = page.getValueEncoding().getDictionaryBasedValuesReader(col, ValuesType.VALUES, dictionary); + } else { + dataColumn = page.getValueEncoding().getValuesReader(col, ValuesType.VALUES); + } + byte[] bytes = page.getBytes().toByteArray(); + repetitionLevelColumn.initFromPage(pageValueCount, bytes, 0); + int next = repetitionLevelColumn.getNextOffset(); + definitionLevelColumn.initFromPage(pageValueCount, bytes, next); + next = definitionLevelColumn.getNextOffset(); + dataColumn.initFromPage(pageValueCount, bytes, next); + PrimitiveTypeNameConverter<Object, RuntimeException> converter = new PrimitiveTypeNameConverter<Object, RuntimeException>() { + @Override public Object convertFLOAT(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readFloat(); + } + @Override public Object convertDOUBLE(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readDouble(); + } + @Override public Object convertINT32(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readInteger(); + } + @Override public Object convertINT64(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readLong(); + } + @Override public Object convertINT96(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readBytes(); + } + @Override public Object convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readBytes(); + } + @Override public Object convertBOOLEAN(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readBoolean(); + } + @Override public Object convertBINARY(PrimitiveTypeName primitiveTypeName) throws RuntimeException { + return dataColumn.readBytes(); + } + }; + for (int j = 0; j < pageValueCount; j++) { + int r = repetitionLevelColumn.readInteger(); + if (r == 0) { + ++ recordCount; + } + int d = definitionLevelColumn.readInteger(); +// System.out.println("r = " + r + " d = " + d); + if (d == col.getMaxDefinitionLevel()) { + Object result = col.getType().convert(converter); +// System.out.println(result); + } else { + ++ nulls; + } + } + } + counts.totalRowsInRowGroupMetadata += pages.getRowCount(); + counts.totalRowsInData += recordCount; + counts.totalValuesInRowGroupMetadata += pageReader.getTotalValueCount(); + counts.totalValuesInPageMetadata += totalValues; + System.out.println("nulls: " + nulls); + if (pages.getRowCount() != recordCount) { + System.out.println(col + "!!!!!! " + recordCount + " != " + pages.getRowCount() + " missing " + (pages.getRowCount() - recordCount)); + } + if (totalValues != pageReader.getTotalValueCount()) { + System.out.println(col + "!!!!!! " + totalValues + " != " + pageReader.getTotalValueCount()); + } + } + } + System.out.println("totals:"); + for (ColumnDescriptor col : columns) { + System.out.println(col + " " + totalCounts.get(col)); + } + + } + + private static class Counts { + public long totalValuesInPageMetadata; + public long totalValuesInRowGroupMetadata; + public long totalRowsInData; + public long totalRowsInRowGroupMetadata; + @Override + public String toString() { + String rows = (totalRowsInData == totalRowsInRowGroupMetadata) ? String.valueOf(totalRowsInData) : + "data = " + totalRowsInData + " row groups = " + totalRowsInRowGroupMetadata + " diff = " + (totalRowsInData - totalRowsInRowGroupMetadata); + String values = (totalValuesInPageMetadata == totalValuesInRowGroupMetadata) ? String.valueOf(totalValuesInPageMetadata) : + "pages = " + totalValuesInPageMetadata + " row groups = " + totalValuesInRowGroupMetadata + " diff = " + (totalValuesInPageMetadata - totalValuesInRowGroupMetadata); + return "values: " + values + ", rows: " + rows; + } + } + + private static String dotted(String[] path) { + String result = ""; + for (int i = 0; i < path.length; i++) { + if (i != 0) { + result += "."; + } + result += path[i]; + } + return result; + } +} \ No newline at end of file