/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.tool;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.carbondata.common.Strings;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
import org.apache.carbondata.core.datastore.impl.FileFactory;
import org.apache.carbondata.core.memory.MemoryException;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.encoder.Encoding;
import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
import org.apache.carbondata.core.reader.CarbonHeaderReader;
import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
import org.apache.carbondata.core.statusmanager.SegmentStatusManager;
import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.DataTypeUtil;
import org.apache.carbondata.format.BlockletInfo3;
import org.apache.carbondata.format.DataChunk2;
import org.apache.carbondata.format.DataChunk3;
import org.apache.carbondata.format.FileFooter3;
import org.apache.carbondata.format.FileHeader;
import org.apache.carbondata.format.TableInfo;

import static org.apache.carbondata.core.constants.CarbonCommonConstants.DEFAULT_CHARSET;

import org.apache.commons.cli.CommandLine;

/**
* Data Summary command implementation for {@link CarbonCli}
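*
* A minimal usage sketch (hypothetical table path, for illustration only):
* <pre>{@code
*   List<String> outPuts = new ArrayList<>();
*   new DataSummary("/path/to/carbon/table", outPuts).run(line); // 'line' is a parsed CommandLine
*   outPuts.forEach(System.out::println);
* }</pre>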
*/
class DataSummary implements Command {
private String dataFolder;
private List<String> outPuts;

// maps file path to the corresponding DataFile object
private LinkedHashMap<String, DataFile> dataFiles;

// true if blocklet stats have already been collected, see collectStats()
private boolean collected = false;

DataSummary(String dataFolder, List<String> outPuts) {
this.dataFolder = dataFolder;
this.outPuts = outPuts;
}
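
/**
 * Dispatches on the parsed command line options: -a prints everything,
 * -s schema, -m segments, -t table properties, -b blocklet detail,
 * -v version details, -B a single block, -c column statistics
 * (plus -k column chunk meta), -C statistics for all columns.
 */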
@Override
public void run(CommandLine line) throws IOException, MemoryException {
FileCollector collector = new FileCollector(outPuts);
collector.collectFiles(dataFolder);
collector.printBasicStats();
if (collector.getNumDataFiles() == 0) {
return;
}
dataFiles = collector.getDataFiles();
boolean printAll = line.hasOption("a");
if (line.hasOption("s") || printAll) {
if (dataFiles.size() > 0) {
List<String> dataFilesSet = new ArrayList<>(dataFiles.keySet());
Collections.reverse(dataFilesSet);
collectSchemaDetails(dataFiles.get(dataFilesSet.get(0)));
}
}
if (line.hasOption("m") || printAll) {
collectSegmentsDetails(collector.getTableStatusFile());
}
if (line.hasOption("t") || printAll) {
collectTableProperties(collector.getSchemaFile());
}
if (line.hasOption("b") || printAll) {
String limitSize = line.getOptionValue("b");
if (limitSize == null) {
// by default, limit the output to two shards; the user can pass a larger value to see more
limitSize = "2";
}
collectBlockletDetail(Integer.parseInt(limitSize));
}
if (line.hasOption("v") || printAll) {
collectVersionDetails();
}
if (line.hasOption("B")) {
String blockFileName = line.getOptionValue("B");
collectBlockDetails(blockFileName);
}
if (line.hasOption("c")) {
String columName = line.getOptionValue("c");
printColumnStats(columName);
if (line.hasOption("k")) {
collectColumnChunkMeta(columName);
}
}
if (line.hasOption("C")) {
printAllColumnStats();
}
collector.close();
for (DataFile file : dataFiles.values()) {
file.close();
}
}
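
/**
 * Prints the file version, write timestamp and column schema
 * (name, data type, column type, sort flag, encodings, ordinal, short id)
 * read from the header of the given data file.
 */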
private void collectSchemaDetails(DataFile dataFile) throws IOException {
CarbonFile file = FileFactory.getCarbonFile(dataFile.getFilePath());
outPuts.add("");
outPuts.add("## Schema");
outPuts.add(String.format("schema in %s", file.getName()));
CarbonHeaderReader reader = new CarbonHeaderReader(file.getPath());
FileHeader header = reader.readHeader();
outPuts.add("version: V" + header.version);
outPuts.add("timestamp: " + new java.sql.Timestamp(header.time_stamp));
List<ColumnSchema> columns = reader.readSchema();
TableFormatter tableFormatter = new TableFormatter(
new String[]{"Column Name", "Data Type", "Column Type",
"SortColumn", "Encoding", "Ordinal", "Id"}, outPuts);
for (ColumnSchema column : columns) {
String shortColumnId = "NA";
if (column.getColumnUniqueId() != null && column.getColumnUniqueId().length() > 4) {
shortColumnId = "*" +
column.getColumnUniqueId().substring(column.getColumnUniqueId().length() - 4);
}
tableFormatter.addRow(new String[]{
column.getColumnName(),
column.getDataType().getName(),
column.isDimensionColumn() ? "dimension" : "measure",
String.valueOf(column.isSortColumn()),
column.getEncodingList().toString(),
Integer.toString(column.getSchemaOrdinal()),
shortColumnId
});
}
tableFormatter.printFormatted();
}
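
/**
 * Prints one row per segment (status, load start/end time, merged-to name,
 * file format, data size, index size) read from the table status file.
 */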
private void collectSegmentsDetails(CarbonFile tableStatusFile) throws IOException {
outPuts.add("");
outPuts.add("## Segment");
if (tableStatusFile != null) {
// first collect all information in memory then print a formatted table
LoadMetadataDetails[] segments =
SegmentStatusManager.readTableStatusFile(tableStatusFile.getPath());
TableFormatter tableFormatter = new TableFormatter(
new String[]{"SegmentID", "Status", "Load Start", "Load End",
"Merged To", "Format", "Data Size", "Index Size"}, outPuts);
for (LoadMetadataDetails segment : segments) {
String dataSize, indexSize;
if (segment.getDataSize() == null) {
dataSize = "NA";
} else {
dataSize = Strings.formatSize(Long.parseLong(segment.getDataSize()));
}
if (segment.getIndexSize() == null) {
indexSize = "NA";
} else {
indexSize = Strings.formatSize(Long.parseLong(segment.getIndexSize()));
}
tableFormatter.addRow(new String[]{
segment.getLoadName(),
segment.getSegmentStatus().toString(),
new java.sql.Timestamp(segment.getLoadStartTime()).toString(),
new java.sql.Timestamp(segment.getLoadEndTime()).toString(),
segment.getMergedLoadName() == null ? "NA" : segment.getMergedLoadName(),
segment.getFileFormat().toString(),
dataSize,
indexSize}
);
}
tableFormatter.printFormatted();
} else {
outPuts.add("table status file not found");
}
}
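
/**
 * Prints the table property key/value pairs read from the schema file.
 */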
private void collectTableProperties(CarbonFile schemaFile) throws IOException {
outPuts.add("");
outPuts.add("## Table Properties");
if (schemaFile != null) {
TableInfo thriftTableInfo = CarbonUtil.readSchemaFile(schemaFile.getPath());
Map<String, String> tblProperties = thriftTableInfo.fact_table.tableProperties;
TableFormatter tableFormatter = new TableFormatter(
new String[]{"Property Name", "Property Value"}, outPuts);
for (Map.Entry<String, String> entry : tblProperties.entrySet()) {
tableFormatter.addRow(new String[] {
String.format("'%s'", entry.getKey()),
String.format("'%s'", entry.getValue())
});
}
tableFormatter.printFormatted();
} else {
outPuts.add("schema file not found");
}
}
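
/**
 * Prints per-blocklet detail (page count, row count, size), stopping after
 * 'limitSize' data files have been printed.
 */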
private void collectBlockletDetail(int limitSize) {
outPuts.add("");
outPuts.add("## Block Detail");
ShardPrinter printer =
new ShardPrinter(new String[] { "BLK", "BLKLT", "NumPages", "NumRows", "Size" }, outPuts);
for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) {
DataFile file = entry.getValue();
FileFooter3 footer = file.getFooter();
for (int blockletId = 0; blockletId < footer.blocklet_info_list3.size(); blockletId++) {
BlockletInfo3 blocklet = footer.blocklet_info_list3.get(blockletId);
printer.addRow(file.getShardName(), new String[]{
file.getPartNo(),
String.valueOf(blockletId),
String.format("%,d", blocklet.number_number_of_pages),
String.format("%,d", blocklet.num_rows),
Strings.formatSize(file.getBlockletSizeInBytes(blockletId))
});
}
limitSize--;
if (limitSize == 0) {
break;
}
}
printer.collectFormattedData();
}
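
/**
 * Prints blocklet details for the single block file at 'blockFilePath'.
 */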
private void collectBlockDetails(String blockFilePath) throws IOException {
outPuts.add("");
outPuts.add("## Filtered Block Details for: " + blockFilePath
.substring(blockFilePath.lastIndexOf(File.separator) + 1, blockFilePath.length()));
TableFormatter tableFormatter =
new TableFormatter(new String[] { "BLKLT", "NumPages", "NumRows", "Size" }, outPuts);
CarbonFile datafile = FileFactory.getCarbonFile(blockFilePath);
DataFile dataFile = new DataFile(datafile);
dataFile.collectAllMeta();
FileFooter3 footer = dataFile.getFooter();
for (int blockletId = 0; blockletId < footer.blocklet_info_list3.size(); blockletId++) {
BlockletInfo3 blocklet = footer.blocklet_info_list3.get(blockletId);
tableFormatter.addRow(new String[]{
String.valueOf(blockletId),
String.format("%,d", blocklet.number_number_of_pages),
String.format("%,d", blocklet.num_rows),
Strings.formatSize(dataFile.getBlockletSizeInBytes(blockletId))
});
}
tableFormatter.printFormatted();
}
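
/**
 * Prints the writer name and version recorded in the footer of the first
 * data file, if that extra info is present.
 */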
private void collectVersionDetails() {
DataFile file = dataFiles.entrySet().iterator().next().getValue();
FileFooter3 footer = file.getFooter();
if (null != footer.getExtra_info()) {
outPuts.add("");
outPuts.add("## version Details");
TableFormatter tableFormatter =
new TableFormatter(new String[] { "written_by", "Version" }, outPuts);
tableFormatter.addRow(new String[] { String.format("%s",
footer.getExtra_info().get(CarbonCommonConstants.CARBON_WRITTEN_BY_FOOTER_INFO)),
String.format("%s",
footer.getExtra_info().get(CarbonCommonConstants.CARBON_WRITTEN_VERSION)) });
tableFormatter.printFormatted();
}
}
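
/**
 * Returns the index of the given column in the schema of the first data file.
 */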
private int getColumnIndex(String columnName) {
if (dataFiles.size() > 0) {
return dataFiles.entrySet().iterator().next().getValue().getColumnIndex(columnName);
}
throw new RuntimeException("schema for column " + columnName + " not found");
}
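
/**
 * Prints per-blocklet statistics for the given column: meta and data size,
 * local dictionary info, average page size, min/max values and how far they
 * sit within the shard's min/max range (as a percentage).
 */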
private void printColumnStats(String columnName) throws IOException, MemoryException {
outPuts.add("");
outPuts.add("## Column Statistics for '" + columnName + "'");
collectStats(columnName);
int columnIndex = getColumnIndex(columnName);
String[] header = new String[]{"BLK", "BLKLT", "Meta Size", "Data Size",
"LocalDict", "DictEntries", "DictSize", "AvgPageSize", "Min%", "Max%", "Min", "Max"};
ShardPrinter printer = new ShardPrinter(header, outPuts);
for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) {
DataFile file = entry.getValue();
for (DataFile.Blocklet blocklet : file.getAllBlocklets()) {
String min, max, minPercent, maxPercent;
byte[] blockletMin = blocklet.getColumnChunk().min;
byte[] blockletMax = blocklet.getColumnChunk().max;
if (blocklet.getColumnChunk().getDataType() == DataTypes.STRING) {
minPercent = "NA";
maxPercent = "NA";
// for complex types, and for varchar columns where min/max is not written,
// show NA
if (blocklet.getColumnChunk().column.hasEncoding(Encoding.DICTIONARY) ||
blocklet.getColumnChunk().column.isComplexColumn() ||
!blocklet.getColumnChunk().isMinMaxPresent) {
min = "NA";
max = "NA";
} else {
min = new String(blockletMin, Charset.forName(DEFAULT_CHARSET));
max = new String(blockletMax, Charset.forName(DEFAULT_CHARSET));
}
} else {
// for columns with a global dictionary and for complex columns, min and max
// percentage are shown as NA
if (blocklet.getColumnChunk().column.hasEncoding(Encoding.DICTIONARY) ||
blocklet.getColumnChunk().column.isComplexColumn() ||
blocklet.getColumnChunk().column.getDataType().isComplexType()) {
minPercent = "NA";
maxPercent = "NA";
} else {
minPercent =
String.format("%.1f", Math.abs(blocklet.getColumnChunk().getMinPercentage() * 100));
maxPercent =
String.format("%.1f", Math.abs(blocklet.getColumnChunk().getMaxPercentage() * 100));
}
DataFile.ColumnChunk columnChunk = blocklet.columnChunk;
// dictionary-encoded and complex columns have no comparable min/max to print
if (columnChunk.column.hasEncoding(Encoding.DICTIONARY) ||
blocklet.getColumnChunk().column.isComplexColumn() ||
blocklet.getColumnChunk().column.getDataType().isComplexType()) {
min = "NA";
max = "NA";
} else if (columnChunk.column.isDimensionColumn() && DataTypeUtil
.isPrimitiveColumn(columnChunk.column.getDataType())) {
min = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(blockletMin,
columnChunk.column.getDataType()).toString();
max = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(blockletMax,
columnChunk.column.getDataType()).toString();
if (columnChunk.column.getDataType().equals(DataTypes.TIMESTAMP)) {
min = new java.sql.Timestamp(Long.parseLong(min) / 1000).toString();
max = new java.sql.Timestamp(Long.parseLong(max) / 1000).toString();
}
} else {
min = String.valueOf(DataTypeUtil
.getMeasureObjectFromDataType(blockletMin, columnChunk.column.getDataType()));
max = String.valueOf(DataTypeUtil
.getMeasureObjectFromDataType(blockletMax, columnChunk.column.getDataType()));
}
}
printer.addRow(
blocklet.getShardName(),
new String[]{
file.getPartNo(),
String.valueOf(blocklet.id),
Strings.formatSize(file.getColumnMetaSizeInBytes(blocklet.id, columnIndex)),
Strings.formatSize(file.getColumnDataSizeInBytes(blocklet.id, columnIndex)),
String.valueOf(blocklet.getColumnChunk().localDict),
String.valueOf(blocklet.getColumnChunk().blockletDictionaryEntries),
Strings.formatSize(blocklet.getColumnChunk().blocketletDictionarySize),
Strings.formatSize(blocklet.getColumnChunk().avgPageLengthInBytes),
minPercent,
maxPercent,
min,
max}
);
}
}
printer.collectFormattedData();
}
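
/**
 * Prints meta size and data size of every column in every blocklet.
 */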
private void printAllColumnStats() {
if (!dataFiles.isEmpty()) {
outPuts.add("");
outPuts.add("## Statistics for All Columns");
String[] header =
new String[] { "Block", "Blocklet", "Column Name", "Meta Size", "Data Size" };
ShardPrinter printer = new ShardPrinter(header, outPuts);
for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) {
DataFile dataFile = entry.getValue();
List<ColumnSchema> columns = dataFile.getSchema();
int columnNum = columns.size();
int blockletNum = dataFile.getNumBlocklet();
for (int j = 0; j < blockletNum; j++) {
for (int i = 0; i < columnNum; i++) {
printer.addRow(dataFile.getShardName(),
new String[] { dataFile.getPartNo(), String.valueOf(j),
columns.get(i).getColumnName(),
Strings.formatSize(dataFile.getColumnMetaSizeInBytes(j, i)),
Strings.formatSize(dataFile.getColumnDataSizeInBytes(j, i)) });
}
}
}
printer.collectFormattedData();
}
}
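
/**
 * Collects blocklet-level stats for the given column; guarded by 'collected'
 * so the work is done only once.
 */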
private void collectStats(String columnName) throws IOException, MemoryException {
if (!collected) {
for (DataFile dataFile : dataFiles.values()) {
dataFile.initAllBlockletStats(columnName);
}
collectAllBlockletStats(dataFiles.values());
collected = true;
}
}
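
/**
 * Prints page-level metadata (offset, length and DataChunk2 content) for the
 * given column, limited to the first 3 pages of each blocklet.
 */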
private void collectColumnChunkMeta(String columnName) throws IOException, MemoryException {
for (Map.Entry<String, DataFile> entry : dataFiles.entrySet()) {
DataFile file = entry.getValue();
outPuts.add("");
outPuts.add("## Page Meta for column '" + columnName + "' in file " + file.getFilePath());
collectStats(columnName);
for (int i = 0; i < file.getAllBlocklets().size(); i++) {
DataFile.Blocklet blocklet = file.getAllBlocklets().get(i);
DataChunk3 dataChunk3 = blocklet.getColumnChunk().getDataChunk3();
List<DataChunk2> dataChunk2List = dataChunk3.getData_chunk_list();
outPuts.add(String.format("Blocklet %d:", i));
// There may be many pages; for debugging purposes, printing the first 3 pages
// of each blocklet is enough
for (int j = 0; j < dataChunk2List.size() && j < 3; j++) {
outPuts.add(String
.format("Page %d (offset %d, length %d): %s", j, dataChunk3.page_offset.get(j),
dataChunk3.page_length.get(j), dataChunk2List.get(j).toString()));
}
outPuts.add("");
}
}
}
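
/**
 * Groups blocklets by shard name, computes min/max over each shard, then lets
 * every blocklet compute where its own min/max fall within that shard range.
 */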
private void collectAllBlockletStats(Collection<DataFile> dataFiles) {
// shard name mapped to the blocklets that belong to that shard
Map<String, List<DataFile.Blocklet>> shards = new HashMap<>();
// collect blocklets based on shard name
for (DataFile dataFile : dataFiles) {
List<DataFile.Blocklet> blocklets = dataFile.getAllBlocklets();
List<DataFile.Blocklet> existing = shards.get(dataFile.getShardName());
if (existing == null) {
existing = new LinkedList<>();
}
existing.addAll(blocklets);
shards.put(dataFile.getShardName(), existing);
}
// calculate min/max for each shard
Map<String, byte[]> shardMinMap = new HashMap<>();
Map<String, byte[]> shardMaxMap = new HashMap<>();
for (Map.Entry<String, List<DataFile.Blocklet>> shard : shards.entrySet()) {
byte[] shardMin = null;
byte[] shardMax = null;
for (DataFile.Blocklet blocklet : shard.getValue()) {
shardMin = blocklet.getColumnChunk().min(shardMin);
shardMax = blocklet.getColumnChunk().max(shardMax);
}
shardMinMap.put(shard.getKey(), shardMin);
shardMaxMap.put(shard.getKey(), shardMax);
}
// calculate min/max percentage for each blocklet
for (Map.Entry<String, List<DataFile.Blocklet>> shard : shards.entrySet()) {
byte[] shardMin = shardMinMap.get(shard.getKey());
byte[] shardMax = shardMaxMap.get(shard.getKey());
for (DataFile.Blocklet blocklet : shard.getValue()) {
blocklet.computePercentage(shardMin, shardMax);
}
}
}
}