| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.carbondata.tool; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.nio.charset.Charset; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.carbondata.common.Strings; |
| import org.apache.carbondata.core.constants.CarbonCommonConstants; |
| import org.apache.carbondata.core.datastore.filesystem.CarbonFile; |
| import org.apache.carbondata.core.datastore.impl.FileFactory; |
| import org.apache.carbondata.core.memory.MemoryException; |
| import org.apache.carbondata.core.metadata.datatype.DataTypes; |
| import org.apache.carbondata.core.metadata.encoder.Encoding; |
| import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema; |
| import org.apache.carbondata.core.reader.CarbonHeaderReader; |
| import org.apache.carbondata.core.statusmanager.LoadMetadataDetails; |
| import org.apache.carbondata.core.statusmanager.SegmentStatusManager; |
| import org.apache.carbondata.core.util.CarbonUtil; |
| import org.apache.carbondata.core.util.DataTypeUtil; |
| import org.apache.carbondata.format.BlockletInfo3; |
| import org.apache.carbondata.format.DataChunk2; |
| import org.apache.carbondata.format.DataChunk3; |
| import org.apache.carbondata.format.FileFooter3; |
| import org.apache.carbondata.format.FileHeader; |
| import org.apache.carbondata.format.TableInfo; |
| |
| import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET; |
| |
| import org.apache.commons.cli.CommandLine; |
| |
| /** |
| * Data Summary command implementation for {@link CarbonCli} |
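| * |
| * <p>Illustrative invocation (the driver options {@code -cmd} and {@code -p} are assumed |
| * here; the actual option set is defined by {@link CarbonCli}): |
| * <pre> |
| *   java org.apache.carbondata.tool.CarbonCli -cmd summary -p /path/to/table -a |
| * </pre> |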
| */ |
| class DataSummary implements Command { |
| private String dataFolder; |
| private List<String> outPuts; |
| |
| // map from file path to its DataFile object |
| private LinkedHashMap<String, DataFile> dataFiles; |
| |
| DataSummary(String dataFolder, List<String> outPuts) { |
| this.dataFolder = dataFolder; |
| this.outPuts = outPuts; |
| } |
| |
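| // Entry point invoked by CarbonCli: each recognized option appends one section |
| // to the output. -a prints all default sections; -s schema, -m segments, |
| // -t table properties, -b blocklet detail, -v version; -B, -c (with -k) and -C |
| // print details for a single block, a single column, or all columns. |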
| @Override |
| public void run(CommandLine line) throws IOException, MemoryException { |
| FileCollector collector = new FileCollector(outPuts); |
| collector.collectFiles(dataFolder); |
| collector.printBasicStats(); |
| if (collector.getNumDataFiles() == 0) { |
| return; |
| } |
| dataFiles = collector.getDataFiles(); |
| boolean printAll = line.hasOption("a"); |
| if (line.hasOption("s") || printAll) { |
| if (!dataFiles.isEmpty()) { |
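| // read the schema from the last collected data file |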
| List<String> dataFilesSet = new ArrayList<>(dataFiles.keySet()); |
| Collections.reverse(dataFilesSet); |
| collectSchemaDetails(dataFiles.get(dataFilesSet.get(0))); |
| } |
| } |
| if (line.hasOption("m") || printAll) { |
| collectSegmentsDetails(collector.getTableStatusFile()); |
| } |
| if (line.hasOption("t") || printAll) { |
| collectTableProperties(collector.getSchemaFile()); |
| } |
| if (line.hasOption("b") || printAll) { |
| String limitSize = line.getOptionValue("b"); |
| if (limitSize == null) { |
| // by default, limit the output to two shards; the user can raise this limit |
| limitSize = "2"; |
| } |
| collectBlockletDetail(Integer.parseInt(limitSize)); |
| } |
| if (line.hasOption("v") || printAll) { |
| collectVersionDetails(); |
| } |
| if (line.hasOption("B")) { |
| String blockFileName = line.getOptionValue("B"); |
| collectBlockDetails(blockFileName); |
| } |
| if (line.hasOption("c")) { |
| String columnName = line.getOptionValue("c"); |
| printColumnStats(columnName); |
| if (line.hasOption("k")) { |
| collectColumnChunkMeta(columnName); |
| } |
| } |
| if (line.hasOption("C")) { |
| printAllColumnStats(); |
| } |
| collector.close(); |
| for (DataFile file : dataFiles.values()) { |
| file.close(); |
| } |
| } |
| |
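| // Reads the file header and schema from the given data file and prints |
| // one table row per column. |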
| private void collectSchemaDetails(DataFile dataFile) throws IOException { |
| CarbonFile file = FileFactory.getCarbonFile(dataFile.getFilePath()); |
| outPuts.add(""); |
| outPuts.add("## Schema"); |
| outPuts.add(String.format("schema in %s", file.getName())); |
| CarbonHeaderReader reader = new CarbonHeaderReader(file.getPath()); |
| FileHeader header = reader.readHeader(); |
| outPuts.add("version: V" + header.version); |
| outPuts.add("timestamp: " + new java.sql.Timestamp(header.time_stamp)); |
| List<ColumnSchema> columns = reader.readSchema(); |
| TableFormatter tableFormatter = new TableFormatter( |
| new String[]{"Column Name", "Data Type", "Column Type", |
| "SortColumn", "Encoding", "Ordinal", "Id"}, outPuts); |
| for (ColumnSchema column : columns) { |
| String shortColumnId = "NA"; |
| if (column.getColumnUniqueId() != null && column.getColumnUniqueId().length() > 4) { |
| shortColumnId = "*" + |
| column.getColumnUniqueId().substring(column.getColumnUniqueId().length() - 4); |
| } |
| tableFormatter.addRow(new String[]{ |
| column.getColumnName(), |
| column.getDataType().getName(), |
| column.isDimensionColumn() ? "dimension" : "measure", |
| String.valueOf(column.isSortColumn()), |
| column.getEncodingList().toString(), |
| Integer.toString(column.getSchemaOrdinal()), |
| shortColumnId |
| }); |
| } |
| tableFormatter.printFormatted(); |
| } |
| |
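| // Reads the table status file and prints one table row per segment (load). |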
| private void collectSegmentsDetails(CarbonFile tableStatusFile) throws IOException { |
| outPuts.add(""); |
| outPuts.add("## Segment"); |
| if (tableStatusFile != null) { |
| // first collect all information in memory then print a formatted table |
| LoadMetadataDetails[] segments = |
| SegmentStatusManager.readTableStatusFile(tableStatusFile.getPath()); |
| TableFormatter tableFormatter = new TableFormatter( |
| new String[]{"SegmentID", "Status", "Load Start", "Load End", |
| "Merged To", "Format", "Data Size", "Index Size"}, outPuts); |
| for (LoadMetadataDetails segment : segments) { |
| String dataSize, indexSize; |
| if (segment.getDataSize() == null) { |
| dataSize = "NA"; |
| } else { |
| dataSize = Strings.formatSize(Long.parseLong(segment.getDataSize())); |
| } |
| if (segment.getIndexSize() == null) { |
| indexSize = "NA"; |
| } else { |
| indexSize = Strings.formatSize(Long.parseLong(segment.getIndexSize())); |
| } |
| tableFormatter.addRow(new String[]{ |
| segment.getLoadName(), |
| segment.getSegmentStatus().toString(), |
| new java.sql.Timestamp(segment.getLoadStartTime()).toString(), |
| new java.sql.Timestamp(segment.getLoadEndTime()).toString(), |
| segment.getMergedLoadName() == null ? "NA" : segment.getMergedLoadName(), |
| segment.getFileFormat().toString(), |
| dataSize, |
| indexSize} |
| ); |
| } |
| tableFormatter.printFormatted(); |
| } else { |
| outPuts.add("table status file not found"); |
| } |
| } |
| |
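| // Reads the schema file and prints the table properties as key/value pairs. |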
| private void collectTableProperties(CarbonFile schemaFile) throws IOException { |
| outPuts.add(""); |
| outPuts.add("## Table Properties"); |
| if (schemaFile != null) { |
| TableInfo thriftTableInfo = CarbonUtil.readSchemaFile(schemaFile.getPath()); |
| Map<String, String> tblProperties = thriftTableInfo.fact_table.tableProperties; |
| TableFormatter tableFormatter = new TableFormatter( |
| new String[]{"Property Name", "Property Value"}, outPuts); |
| for (Map.Entry<String, String> entry : tblProperties.entrySet()) { |
| tableFormatter.addRow(new String[] { |
| String.format("'%s'", entry.getKey()), |
| String.format("'%s'", entry.getValue()) |
| }); |
| } |
| tableFormatter.printFormatted(); |
| } else { |
| outPuts.add("schema file not found"); |
| } |
| } |
| |
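| // Prints page, row and size statistics for every blocklet, limited to the |
| // first limitSize data files. |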
| private void collectBlockletDetail(int limitSize) { |
| outPuts.add(""); |
| outPuts.add("## Block Detail"); |
| |
| ShardPrinter printer = |
| new ShardPrinter(new String[] { "BLK", "BLKLT", "NumPages", "NumRows", "Size" }, outPuts); |
| |
| for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) { |
| DataFile file = entry.getValue(); |
| FileFooter3 footer = file.getFooter(); |
| for (int blockletId = 0; blockletId < footer.blocklet_info_list3.size(); blockletId++) { |
| BlockletInfo3 blocklet = footer.blocklet_info_list3.get(blockletId); |
| printer.addRow(file.getShardName(), new String[]{ |
| file.getPartNo(), |
| String.valueOf(blockletId), |
| String.format("%,d", blocklet.number_number_of_pages), |
| String.format("%,d", blocklet.num_rows), |
| Strings.formatSize(file.getBlockletSizeInBytes(blockletId)) |
| }); |
| } |
| limitSize--; |
| if (limitSize == 0) { |
| break; |
| } |
| } |
| printer.collectFormattedData(); |
| } |
| |
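| // Prints blocklet statistics for the single block file given via -B. |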
| private void collectBlockDetails(String blockFilePath) throws IOException { |
| outPuts.add(""); |
| outPuts.add("## Filtered Block Details for: " + blockFilePath |
| .substring(blockFilePath.lastIndexOf(File.separator) + 1, blockFilePath.length())); |
| TableFormatter tableFormatter = |
| new TableFormatter(new String[] { "BLKLT", "NumPages", "NumRows", "Size" }, outPuts); |
| CarbonFile datafile = FileFactory.getCarbonFile(blockFilePath); |
| DataFile dataFile = new DataFile(datafile); |
| dataFile.collectAllMeta(); |
| FileFooter3 footer = dataFile.getFooter(); |
| for (int blockletId = 0; blockletId < footer.blocklet_info_list3.size(); blockletId++) { |
| BlockletInfo3 blocklet = footer.blocklet_info_list3.get(blockletId); |
| tableFormatter.addRow(new String[]{ |
| String.valueOf(blockletId), |
| String.format("%,d", blocklet.number_number_of_pages), |
| String.format("%,d", blocklet.num_rows), |
| Strings.formatSize(dataFile.getBlockletSizeInBytes(blockletId)) |
| }); |
| } |
| tableFormatter.printFormatted(); |
| } |
| |
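| // Prints the writer name and CarbonData version recorded in the footer |
| // of the first data file, if that information is present. |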
| private void collectVersionDetails() { |
| DataFile file = dataFiles.values().iterator().next(); |
| FileFooter3 footer = file.getFooter(); |
| if (null != footer.getExtra_info()) { |
| outPuts.add(""); |
| outPuts.add("## version Details"); |
| TableFormatter tableFormatter = |
| new TableFormatter(new String[] { "written_by", "Version" }, outPuts); |
| tableFormatter.addRow(new String[] { String.format("%s", |
| footer.getExtra_info().get(CarbonCommonConstants.CARBON_WRITTEN_BY_FOOTER_INFO)), |
| String.format("%s", |
| footer.getExtra_info().get(CarbonCommonConstants.CARBON_WRITTEN_VERSION)) }); |
| tableFormatter.printFormatted(); |
| } |
| } |
| |
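| // Looks up the ordinal of the given column in the schema of the first data file. |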
| private int getColumnIndex(String columnName) { |
| if (!dataFiles.isEmpty()) { |
| return dataFiles.values().iterator().next().getColumnIndex(columnName); |
| } |
| throw new RuntimeException("schema for column " + columnName + " not found"); |
| } |
| |
| // true if blocklet stats have already been collected |
| private boolean collected = false; |
| |
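| // Prints per-blocklet statistics for one column. Min%/Max% show where the |
| // blocklet's min/max values lie within the shard-wide min/max range computed |
| // in collectAllBlockletStats. |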
| private void printColumnStats(String columnName) throws IOException, MemoryException { |
| outPuts.add(""); |
| outPuts.add("## Column Statistics for '" + columnName + "'"); |
| collectStats(columnName); |
| |
| int columnIndex = getColumnIndex(columnName); |
| String[] header = new String[]{"BLK", "BLKLT", "Meta Size", "Data Size", |
| "LocalDict", "DictEntries", "DictSize", "AvgPageSize", "Min%", "Max%", "Min", "Max"}; |
| |
| ShardPrinter printer = new ShardPrinter(header, outPuts); |
| for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) { |
| DataFile file = entry.getValue(); |
| for (DataFile.Blocklet blocklet : file.getAllBlocklets()) { |
| String min, max, minPercent, maxPercent; |
| byte[] blockletMin = blocklet.getColumnChunk().min; |
| byte[] blockletMax = blocklet.getColumnChunk().max; |
| if (blocklet.getColumnChunk().getDataType() == DataTypes.STRING) { |
| minPercent = "NA"; |
| maxPercent = "NA"; |
| // min/max is reported as NA for dictionary-encoded and complex columns, |
| // and for varchar columns where min/max is not written |
| if (blocklet.getColumnChunk().column.hasEncoding(Encoding.DICTIONARY) || |
| blocklet.getColumnChunk().column.isComplexColumn() || |
| !blocklet.getColumnChunk().isMinMaxPresent) { |
| min = "NA"; |
| max = "NA"; |
| } else { |
| min = new String(blockletMin, Charset.forName(DEFAULT_CHARSET)); |
| max = new String(blockletMax, Charset.forName(DEFAULT_CHARSET)); |
| } |
| } else { |
| // for columns with a global dictionary and for complex columns, the min |
| // and max percentages are reported as NA |
| if (blocklet.getColumnChunk().column.hasEncoding(Encoding.DICTIONARY) || |
| blocklet.getColumnChunk().column.isComplexColumn() || |
| blocklet.getColumnChunk().column.getDataType().isComplexType()) { |
| minPercent = "NA"; |
| maxPercent = "NA"; |
| } else { |
| minPercent = |
| String.format("%.1f", Math.abs(blocklet.getColumnChunk().getMinPercentage() * 100)); |
| maxPercent = |
| String.format("%.1f", Math.abs(blocklet.getColumnChunk().getMaxPercentage() * 100)); |
| } |
| DataFile.ColumnChunk columnChunk = blocklet.columnChunk; |
| // min/max cannot be decoded for dictionary-encoded and complex columns, |
| // so report NA |
| if (columnChunk.column.hasEncoding(Encoding.DICTIONARY) || |
| blocklet.getColumnChunk().column.isComplexColumn() || |
| blocklet.getColumnChunk().column.getDataType().isComplexType()) { |
| min = "NA"; |
| max = "NA"; |
| } else if (columnChunk.column.isDimensionColumn() && DataTypeUtil |
| .isPrimitiveColumn(columnChunk.column.getDataType())) { |
| min = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(blockletMin, |
| columnChunk.column.getDataType()).toString(); |
| max = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(blockletMax, |
| columnChunk.column.getDataType()).toString(); |
| if (columnChunk.column.getDataType().equals(DataTypes.TIMESTAMP)) { |
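| // the stored value appears to be at microsecond precision; |
| // java.sql.Timestamp expects milliseconds |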
| min = new java.sql.Timestamp(Long.parseLong(min) / 1000).toString(); |
| max = new java.sql.Timestamp(Long.parseLong(max) / 1000).toString(); |
| } |
| } else { |
| min = String.valueOf(DataTypeUtil |
| .getMeasureObjectFromDataType(blockletMin, columnChunk.column.getDataType())); |
| max = String.valueOf(DataTypeUtil |
| .getMeasureObjectFromDataType(blockletMax, columnChunk.column.getDataType())); |
| } |
| } |
| printer.addRow( |
| blocklet.getShardName(), |
| new String[]{ |
| file.getPartNo(), |
| String.valueOf(blocklet.id), |
| Strings.formatSize(file.getColumnMetaSizeInBytes(blocklet.id, columnIndex)), |
| Strings.formatSize(file.getColumnDataSizeInBytes(blocklet.id, columnIndex)), |
| String.valueOf(blocklet.getColumnChunk().localDict), |
| String.valueOf(blocklet.getColumnChunk().blockletDictionaryEntries), |
| Strings.formatSize(blocklet.getColumnChunk().blocketletDictionarySize), |
| Strings.formatSize(blocklet.getColumnChunk().avgPageLengthInBytes), |
| minPercent, |
| maxPercent, |
| min, |
| max} |
| ); |
| } |
| } |
| printer.collectFormattedData(); |
| } |
| |
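| // Prints the meta size and data size of every column in every blocklet. |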
| private void printAllColumnStats() { |
| if (!dataFiles.isEmpty()) { |
| outPuts.add(""); |
| outPuts.add("## Statistics for All Columns"); |
| String[] header = |
| new String[] { "Block", "Blocklet", "Column Name", "Meta Size", "Data Size" }; |
| ShardPrinter printer = new ShardPrinter(header, outPuts); |
| for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) { |
| DataFile dataFile = entry.getValue(); |
| List<ColumnSchema> columns = dataFile.getSchema(); |
| int columnNum = columns.size(); |
| int blockletNum = dataFile.getNumBlocklet(); |
| for (int j = 0; j < blockletNum; j++) { |
| for (int i = 0; i < columnNum; i++) { |
| printer.addRow(dataFile.getShardName(), |
| new String[] { dataFile.getPartNo(), String.valueOf(j), |
| columns.get(i).getColumnName(), |
| Strings.formatSize(dataFile.getColumnMetaSizeInBytes(j, i)), |
| Strings.formatSize(dataFile.getColumnDataSizeInBytes(j, i)) }); |
| } |
| } |
| } |
| printer.collectFormattedData(); |
| } |
| } |
| |
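| // Lazily collects blocklet stats for the given column; stats are collected |
| // only once per run, so later calls (even for another column) are no-ops. |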
| private void collectStats(String columnName) throws IOException, MemoryException { |
| if (!collected) { |
| for (DataFile dataFile : dataFiles.values()) { |
| dataFile.initAllBlockletStats(columnName); |
| } |
| collectAllBlockletStats(dataFiles.values()); |
| collected = true; |
| } |
| } |
| |
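| // Prints the page-level metadata (DataChunk2) of the given column, showing |
| // at most 3 pages per blocklet. |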
| private void collectColumnChunkMeta(String columnName) throws IOException, MemoryException { |
| for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) { |
| DataFile file = entry.getValue(); |
| outPuts.add(""); |
| outPuts.add("## Page Meta for column '" + columnName + "' in file " + file.getFilePath()); |
| collectStats(columnName); |
| for (int i = 0; i < file.getAllBlocklets().size(); i++) { |
| DataFile.Blocklet blocklet = file.getAllBlocklets().get(i); |
| DataChunk3 dataChunk3 = blocklet.getColumnChunk().getDataChunk3(); |
| List<DataChunk2> dataChunk2List = dataChunk3.getData_chunk_list(); |
| outPuts.add(String.format("Blocklet %d:", i)); |
| |
| // there may be many pages; for debugging purposes, printing the first |
| // 3 pages of each blocklet is enough |
| for (int j = 0; j < dataChunk2List.size() && j < 3; j++) { |
| outPuts.add(String |
| .format("Page %d (offset %d, length %d): %s", j, dataChunk3.page_offset.get(j), |
| dataChunk3.page_length.get(j), dataChunk2List.get(j).toString())); |
| } |
| outPuts.add(""); |
| } |
| } |
| } |
| |
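| // Groups blocklets by shard name, computes the shard-wide min/max for the |
| // column initialized in collectStats, then derives each blocklet's min/max |
| // percentage relative to that shard range. |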
| private void collectAllBlockletStats(Collection<DataFile> dataFiles) { |
| // map from shard name to the blocklets belonging to that shard |
| Map<String, List<DataFile.Blocklet>> shards = new HashMap<>(); |
| |
| // collect blocklets based on shard name |
| for (DataFile dataFile : dataFiles) { |
| List<DataFile.Blocklet> blocklets = dataFile.getAllBlocklets(); |
| List<DataFile.Blocklet> existing = shards.get(dataFile.getShardName()); |
| if (existing == null) { |
| existing = new LinkedList<>(); |
| } |
| existing.addAll(blocklets); |
| shards.put(dataFile.getShardName(), existing); |
| } |
| |
| // calculate min/max for each shard |
| Map<String, byte[]> shardMinMap = new HashMap<>(); |
| Map<String, byte[]> shardMaxMap = new HashMap<>(); |
| for (Map.Entry<String, List<DataFile.Blocklet>> shard : shards.entrySet()) { |
| byte[] shardMin = null; |
| byte[] shardMax = null; |
| for (DataFile.Blocklet blocklet : shard.getValue()) { |
| shardMin = blocklet.getColumnChunk().min(shardMin); |
| shardMax = blocklet.getColumnChunk().max(shardMax); |
| } |
| shardMinMap.put(shard.getKey(), shardMin); |
| shardMaxMap.put(shard.getKey(), shardMax); |
| } |
| |
| // calculate min/max percentage for each blocklet |
| for (Map.Entry<String, List<DataFile.Blocklet>> shard : shards.entrySet()) { |
| byte[] shardMin = shardMinMap.get(shard.getKey()); |
| byte[] shardMax = shardMaxMap.get(shard.getKey()); |
| for (DataFile.Blocklet blocklet : shard.getValue()) { |
| blocklet.computePercentage(shardMin, shardMax); |
| } |
| } |
| } |
| |
| } |