| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * <p/> |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * <p/> |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.orc.tools; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.orc.CollectionColumnStatistics; |
| import org.apache.orc.CompressionKind; |
| import org.apache.orc.OrcFile; |
| import org.apache.orc.Reader; |
| import org.apache.orc.TypeDescription; |
| import org.apache.orc.impl.AcidStats; |
| import org.apache.orc.impl.OrcAcidUtils; |
| import org.apache.orc.impl.ReaderImpl; |
| import org.apache.orc.impl.RecordReaderImpl; |
| import org.apache.orc.util.BloomFilter; |
| import org.codehaus.jettison.json.JSONArray; |
| import org.apache.orc.util.BloomFilterIO; |
| import org.apache.orc.BinaryColumnStatistics; |
| import org.apache.orc.BooleanColumnStatistics; |
| import org.apache.orc.ColumnStatistics; |
| import org.apache.orc.impl.ColumnStatisticsImpl; |
| import org.apache.orc.DateColumnStatistics; |
| import org.apache.orc.DecimalColumnStatistics; |
| import org.apache.orc.DoubleColumnStatistics; |
| import org.apache.orc.IntegerColumnStatistics; |
| import org.apache.orc.impl.OrcIndex; |
| import org.apache.orc.OrcProto; |
| import org.apache.orc.StringColumnStatistics; |
| import org.apache.orc.StripeInformation; |
| import org.apache.orc.StripeStatistics; |
| import org.apache.orc.TimestampColumnStatistics; |
| import org.codehaus.jettison.json.JSONException; |
| import org.codehaus.jettison.json.JSONObject; |
| import org.codehaus.jettison.json.JSONStringer; |
| import org.codehaus.jettison.json.JSONWriter; |
| |
| /** |
| * File dump tool with json formatted output. |
| */ |
| public class JsonFileDump { |
| |
| public static void printJsonMetaData(List<String> files, |
| Configuration conf, |
| List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone) |
| throws JSONException, IOException { |
| if (files.isEmpty()) { |
| return; |
| } |
| JSONStringer writer = new JSONStringer(); |
| boolean multiFile = files.size() > 1; |
| if (multiFile) { |
| writer.array(); |
| } else { |
| writer.object(); |
| } |
| for (String filename : files) { |
| try { |
| if (multiFile) { |
| writer.object(); |
| } |
| writer.key("fileName").value(filename); |
| Path path = new Path(filename); |
| Reader reader = FileDump.getReader(path, conf, null); |
| if (reader == null) { |
| writer.key("status").value("FAILED"); |
| continue; |
| } |
| writer.key("fileVersion").value(reader.getFileVersion().getName()); |
| writer.key("writerVersion").value(reader.getWriterVersion()); |
| RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); |
| writer.key("numberOfRows").value(reader.getNumberOfRows()); |
| writer.key("compression").value(reader.getCompressionKind()); |
| if (reader.getCompressionKind() != CompressionKind.NONE) { |
| writer.key("compressionBufferSize").value(reader.getCompressionSize()); |
| } |
| writer.key("schemaString").value(reader.getSchema().toString()); |
| writer.key("schema"); |
| writeSchema(writer, reader.getSchema()); |
| writer.key("calendar").value(reader.writerUsedProlepticGregorian() |
| ? "proleptic Gregorian" |
| : "Julian/Gregorian"); |
| writer.key("stripeStatistics").array(); |
| List<StripeStatistics> stripeStatistics = reader.getStripeStatistics(); |
| for (int n = 0; n < stripeStatistics.size(); n++) { |
| writer.object(); |
| writer.key("stripeNumber").value(n + 1); |
| StripeStatistics ss = stripeStatistics.get(n); |
| writer.key("columnStatistics").array(); |
| for (int i = 0; i < ss.getColumnStatistics().length; i++) { |
| writer.object(); |
| writer.key("columnId").value(i); |
| writeColumnStatistics(writer, ss.getColumnStatistics()[i]); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| |
| ColumnStatistics[] stats = reader.getStatistics(); |
| int colCount = stats.length; |
| if (rowIndexCols == null) { |
| rowIndexCols = new ArrayList<>(colCount); |
| for (int i = 0; i < colCount; ++i) { |
| rowIndexCols.add(i); |
| } |
| } |
| writer.key("fileStatistics").array(); |
| for (int i = 0; i < stats.length; ++i) { |
| writer.object(); |
| writer.key("columnId").value(i); |
| writeColumnStatistics(writer, stats[i]); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| |
| writer.key("stripes").array(); |
| int stripeIx = -1; |
| for (StripeInformation stripe : reader.getStripes()) { |
| ++stripeIx; |
| long stripeStart = stripe.getOffset(); |
| OrcProto.StripeFooter footer = rows.readStripeFooter(stripe); |
| writer.object(); // start of stripe information |
| writer.key("stripeNumber").value(stripeIx + 1); |
| writer.key("stripeInformation"); |
| writeStripeInformation(writer, stripe); |
| if (printTimeZone) { |
| writer.key("writerTimezone").value( |
| footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN); |
| } |
| long sectionStart = stripeStart; |
| |
| writer.key("streams").array(); |
| for (OrcProto.Stream section : footer.getStreamsList()) { |
| writer.object(); |
| String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN; |
| writer.key("columnId").value(section.getColumn()); |
| writer.key("section").value(kind); |
| writer.key("startOffset").value(sectionStart); |
| writer.key("length").value(section.getLength()); |
| sectionStart += section.getLength(); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| |
| writer.key("encodings").array(); |
| for (int i = 0; i < footer.getColumnsCount(); ++i) { |
| writer.object(); |
| OrcProto.ColumnEncoding encoding = footer.getColumns(i); |
| writer.key("columnId").value(i); |
| writer.key("kind").value(encoding.getKind()); |
| if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY || |
| encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) { |
| writer.key("dictionarySize").value(encoding.getDictionarySize()); |
| } |
| writer.endObject(); |
| } |
| writer.endArray(); |
| if (!rowIndexCols.isEmpty()) { |
| // include the columns that are specified, only if the columns are included, bloom filter |
| // will be read |
| boolean[] sargColumns = new boolean[colCount]; |
| for (int colIdx : rowIndexCols) { |
| sargColumns[colIdx] = true; |
| } |
| OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns); |
| writer.key("indexes").array(); |
| for (int col : rowIndexCols) { |
| writer.object(); |
| writer.key("columnId").value(col); |
| writeRowGroupIndexes(writer, col, indices.getRowGroupIndex(), |
| reader.getSchema(), (ReaderImpl) reader); |
| writeBloomFilterIndexes(writer, col, indices, |
| reader.getWriterVersion(), |
| reader.getSchema().findSubtype(col).getCategory(), |
| footer.getColumns(col)); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| } |
| writer.endObject(); // end of stripe information |
| } |
| writer.endArray(); |
| |
| FileSystem fs = path.getFileSystem(conf); |
| long fileLen = fs.getContentSummary(path).getLength(); |
| long paddedBytes = FileDump.getTotalPaddingSize(reader); |
| // empty ORC file is ~45 bytes. Assumption here is file length always >0 |
| double percentPadding = ((double) paddedBytes / (double) fileLen) * 100; |
| writer.key("fileLength").value(fileLen); |
| writer.key("paddingLength").value(paddedBytes); |
| writer.key("paddingRatio").value(percentPadding); |
| AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader); |
| if (acidStats != null) { |
| writer.key("numInserts").value(acidStats.inserts); |
| writer.key("numDeletes").value(acidStats.deletes); |
| writer.key("numUpdates").value(acidStats.updates); |
| } |
| writer.key("status").value("OK"); |
| rows.close(); |
| |
| writer.endObject(); |
| } catch (Throwable e) { |
| writer.key("status").value("FAILED"); |
| throw e; |
| } |
| } |
| if (multiFile) { |
| writer.endArray(); |
| } |
| |
| if (prettyPrint) { |
| final String prettyJson; |
| if (multiFile) { |
| JSONArray jsonArray = new JSONArray(writer.toString()); |
| prettyJson = jsonArray.toString(2); |
| } else { |
| JSONObject jsonObject = new JSONObject(writer.toString()); |
| prettyJson = jsonObject.toString(2); |
| } |
| System.out.println(prettyJson); |
| } else { |
| System.out.println(writer.toString()); |
| } |
| } |
| |
| private static void writeSchema(JSONStringer writer, TypeDescription type) |
| throws JSONException { |
| writer.object(); |
| writer.key("columnId").value(type.getId()); |
| writer.key("columnType").value(type.getCategory()); |
| List<String> attributes = type.getAttributeNames(); |
| if (attributes.size() > 0) { |
| writer.key("attributes").object(); |
| for (String name : attributes) { |
| writer.key(name).value(type.getAttributeValue(name)); |
| } |
| writer.endObject(); |
| } |
| switch (type.getCategory()) { |
| case DECIMAL: |
| writer.key("precision").value(type.getPrecision()); |
| writer.key("scale").value(type.getScale()); |
| break; |
| case VARCHAR: |
| case CHAR: |
| writer.key("maxLength").value(type.getMaxLength()); |
| break; |
| default: |
| break; |
| } |
| List<TypeDescription> children = type.getChildren(); |
| if (children != null) { |
| writer.key("children"); |
| switch (type.getCategory()) { |
| case STRUCT: |
| writer.object(); |
| List<String> fields = type.getFieldNames(); |
| for (int c = 0; c < fields.size(); ++c) { |
| writer.key(fields.get(c)); |
| writeSchema(writer, children.get(c)); |
| } |
| writer.endObject(); |
| break; |
| case LIST: |
| writer.array(); |
| writeSchema(writer, children.get(0)); |
| writer.endArray(); |
| break; |
| case MAP: |
| writer.array(); |
| writeSchema(writer, children.get(0)); |
| writeSchema(writer, children.get(1)); |
| writer.endArray(); |
| break; |
| case UNION: |
| writer.array(); |
| for (TypeDescription child : children) { |
| writeSchema(writer, child); |
| } |
| writer.endArray(); |
| break; |
| default: |
| break; |
| } |
| } |
| writer.endObject(); |
| } |
| |
| private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe) |
| throws JSONException { |
| writer.object(); |
| writer.key("offset").value(stripe.getOffset()); |
| writer.key("indexLength").value(stripe.getIndexLength()); |
| writer.key("dataLength").value(stripe.getDataLength()); |
| writer.key("footerLength").value(stripe.getFooterLength()); |
| writer.key("rowCount").value(stripe.getNumberOfRows()); |
| writer.endObject(); |
| } |
| |
| private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs) |
| throws JSONException { |
| if (cs != null) { |
| writer.key("count").value(cs.getNumberOfValues()); |
| writer.key("hasNull").value(cs.hasNull()); |
| if (cs.getBytesOnDisk() != 0) { |
| writer.key("bytesOnDisk").value(cs.getBytesOnDisk()); |
| } |
| if (cs instanceof BinaryColumnStatistics) { |
| writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum()); |
| writer.key("type").value(OrcProto.Type.Kind.BINARY); |
| } else if (cs instanceof BooleanColumnStatistics) { |
| writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount()); |
| writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount()); |
| writer.key("type").value(OrcProto.Type.Kind.BOOLEAN); |
| } else if (cs instanceof IntegerColumnStatistics) { |
| writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum()); |
| writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum()); |
| if (((IntegerColumnStatistics) cs).isSumDefined()) { |
| writer.key("sum").value(((IntegerColumnStatistics) cs).getSum()); |
| } |
| writer.key("type").value(OrcProto.Type.Kind.LONG); |
| } else if (cs instanceof DoubleColumnStatistics) { |
| writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum()); |
| writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum()); |
| writer.key("sum").value(((DoubleColumnStatistics) cs).getSum()); |
| writer.key("type").value(OrcProto.Type.Kind.DOUBLE); |
| } else if (cs instanceof StringColumnStatistics) { |
| String lower = ((StringColumnStatistics) cs).getLowerBound(); |
| if (((StringColumnStatistics) cs).getMinimum() != null) { |
| writer.key("min").value(lower); |
| } else if (lower != null) { |
| writer.key("lowerBound").value(lower); |
| } |
| String upper = ((StringColumnStatistics) cs).getUpperBound(); |
| if (((StringColumnStatistics) cs).getMaximum() != null) { |
| writer.key("max").value(upper); |
| } else if (upper != null) { |
| writer.key("upperBound").value(upper); |
| } |
| writer.key("totalLength").value(((StringColumnStatistics) cs).getSum()); |
| writer.key("type").value(OrcProto.Type.Kind.STRING); |
| } else if (cs instanceof DateColumnStatistics) { |
| if (((DateColumnStatistics) cs).getMaximum() != null) { |
| writer.key("min").value(((DateColumnStatistics) cs).getMinimum()); |
| writer.key("max").value(((DateColumnStatistics) cs).getMaximum()); |
| } |
| writer.key("type").value(OrcProto.Type.Kind.DATE); |
| } else if (cs instanceof TimestampColumnStatistics) { |
| if (((TimestampColumnStatistics) cs).getMaximum() != null) { |
| writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum()); |
| writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum()); |
| } |
| writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP); |
| } else if (cs instanceof DecimalColumnStatistics) { |
| if (((DecimalColumnStatistics) cs).getMaximum() != null) { |
| writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum()); |
| writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum()); |
| writer.key("sum").value(((DecimalColumnStatistics) cs).getSum()); |
| } |
| writer.key("type").value(OrcProto.Type.Kind.DECIMAL); |
| } else if (cs instanceof CollectionColumnStatistics) { |
| writer.key("minChildren").value(((CollectionColumnStatistics) cs).getMinimumChildren()); |
| writer.key("maxChildren").value(((CollectionColumnStatistics) cs).getMaximumChildren()); |
| writer.key("totalChildren").value(((CollectionColumnStatistics) cs).getTotalChildren()); |
| } |
| } |
| } |
| |
| private static void writeBloomFilterIndexes(JSONWriter writer, int col, |
| OrcIndex index, |
| OrcFile.WriterVersion version, |
| TypeDescription.Category type, |
| OrcProto.ColumnEncoding encoding |
| ) throws JSONException { |
| |
| BloomFilter stripeLevelBF = null; |
| OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex(); |
| if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { |
| int entryIx = 0; |
| writer.key("bloomFilterIndexes").array(); |
| for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { |
| writer.object(); |
| writer.key("entryId").value(entryIx++); |
| BloomFilter toMerge = BloomFilterIO.deserialize( |
| index.getBloomFilterKinds()[col], encoding, version, type, bf); |
| writeBloomFilterStats(writer, toMerge); |
| if (stripeLevelBF == null) { |
| stripeLevelBF = toMerge; |
| } else { |
| stripeLevelBF.merge(toMerge); |
| } |
| writer.endObject(); |
| } |
| writer.endArray(); |
| } |
| if (stripeLevelBF != null) { |
| writer.key("stripeLevelBloomFilter"); |
| writer.object(); |
| writeBloomFilterStats(writer, stripeLevelBF); |
| writer.endObject(); |
| } |
| } |
| |
| private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf) |
| throws JSONException { |
| int bitCount = bf.getBitSize(); |
| int popCount = 0; |
| for (long l : bf.getBitSet()) { |
| popCount += Long.bitCount(l); |
| } |
| int k = bf.getNumHashFunctions(); |
| float loadFactor = (float) popCount / (float) bitCount; |
| float expectedFpp = (float) Math.pow(loadFactor, k); |
| writer.key("numHashFunctions").value(k); |
| writer.key("bitCount").value(bitCount); |
| writer.key("popCount").value(popCount); |
| writer.key("loadFactor").value(loadFactor); |
| writer.key("expectedFpp").value(expectedFpp); |
| } |
| |
| private static void writeRowGroupIndexes(JSONWriter writer, int col, |
| OrcProto.RowIndex[] rowGroupIndex, |
| TypeDescription schema, |
| ReaderImpl reader) throws JSONException { |
| OrcProto.RowIndex index; |
| if (rowGroupIndex == null || (col >= rowGroupIndex.length) || |
| ((index = rowGroupIndex[col]) == null)) { |
| return; |
| } |
| |
| writer.key("rowGroupIndexes").array(); |
| for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) { |
| writer.object(); |
| writer.key("entryId").value(entryIx); |
| OrcProto.RowIndexEntry entry = index.getEntry(entryIx); |
| if (entry == null) { |
| continue; |
| } |
| OrcProto.ColumnStatistics colStats = entry.getStatistics(); |
| writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize( |
| schema.findSubtype(col), colStats, reader.writerUsedProlepticGregorian(), |
| reader.getConvertToProlepticGregorian())); |
| writer.key("positions").array(); |
| for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) { |
| writer.value(entry.getPositions(posIx)); |
| } |
| writer.endArray(); |
| writer.endObject(); |
| } |
| writer.endArray(); |
| } |
| |
| } |