blob: d232d61e1afc6deb17c1f884a615f616c18d60d9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.core.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.carbondata.core.datastore.block.SegmentProperties;
import org.apache.carbondata.core.datastore.page.EncodedTablePage;
import org.apache.carbondata.core.datastore.page.statistics.TablePageStatistics;
import org.apache.carbondata.core.metadata.ColumnarFormatVersion;
import org.apache.carbondata.core.metadata.datatype.DataType;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.index.BlockIndexInfo;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
import org.apache.carbondata.format.BlockIndex;
import org.apache.carbondata.format.BlockletBTreeIndex;
import org.apache.carbondata.format.BlockletIndex;
import org.apache.carbondata.format.BlockletInfo3;
import org.apache.carbondata.format.BlockletMinMaxIndex;
import org.apache.carbondata.format.ChunkCompressionMeta;
import org.apache.carbondata.format.ColumnSchema;
import org.apache.carbondata.format.CompressionCodec;
import org.apache.carbondata.format.DataChunk2;
import org.apache.carbondata.format.DataChunk3;
import org.apache.carbondata.format.Encoding;
import org.apache.carbondata.format.FileFooter3;
import org.apache.carbondata.format.FileHeader;
import org.apache.carbondata.format.IndexHeader;
import org.apache.carbondata.format.SegmentInfo;
/**
* Util class to convert to thrift metdata classes
*/
public class CarbonMetadataUtil {
/**
* Below method prepares the file footer object for carbon data file version 3
*
* @param infoList
* @param blockletIndexs
* @param cardinalities
* @param numberOfColumns
* @return FileFooter
*/
public static FileFooter3 convertFileFooterVersion3(List<BlockletInfo3> infoList,
List<BlockletIndex> blockletIndexs, int[] cardinalities, int numberOfColumns)
throws IOException {
FileFooter3 footer = getFileFooter3(infoList, blockletIndexs, cardinalities, numberOfColumns);
for (BlockletInfo3 info : infoList) {
footer.addToBlocklet_info_list3(info);
}
return footer;
}
/**
* Below method will be used to get the file footer object
*
* @param infoList blocklet info
* @param blockletIndexs
* @param cardinalities cardinlaity of dimension columns
* @param numberOfColumns
* @return file footer
*/
private static FileFooter3 getFileFooter3(List<BlockletInfo3> infoList,
List<BlockletIndex> blockletIndexs, int[] cardinalities, int numberOfColumns) {
SegmentInfo segmentInfo = new SegmentInfo();
segmentInfo.setNum_cols(numberOfColumns);
segmentInfo.setColumn_cardinalities(CarbonUtil.convertToIntegerList(cardinalities));
FileFooter3 footer = new FileFooter3();
footer.setNum_rows(getNumberOfRowForFooter(infoList));
footer.setSegment_info(segmentInfo);
for (BlockletIndex info : blockletIndexs) {
footer.addToBlocklet_index_list(info);
}
return footer;
}
private static BlockletIndex getBlockletIndex(
org.apache.carbondata.core.metadata.blocklet.index.BlockletIndex info) {
BlockletMinMaxIndex blockletMinMaxIndex = new BlockletMinMaxIndex();
for (int i = 0; i < info.getMinMaxIndex().getMaxValues().length; i++) {
blockletMinMaxIndex.addToMax_values(ByteBuffer.wrap(info.getMinMaxIndex().getMaxValues()[i]));
blockletMinMaxIndex.addToMin_values(ByteBuffer.wrap(info.getMinMaxIndex().getMinValues()[i]));
}
BlockletBTreeIndex blockletBTreeIndex = new BlockletBTreeIndex();
blockletBTreeIndex.setStart_key(info.getBtreeIndex().getStartKey());
blockletBTreeIndex.setEnd_key(info.getBtreeIndex().getEndKey());
BlockletIndex blockletIndex = new BlockletIndex();
blockletIndex.setMin_max_index(blockletMinMaxIndex);
blockletIndex.setB_tree_index(blockletBTreeIndex);
return blockletIndex;
}
/**
* Get total number of rows for the file.
*
* @param infoList
* @return
*/
private static long getNumberOfRowForFooter(List<BlockletInfo3> infoList) {
long numberOfRows = 0;
for (BlockletInfo3 info : infoList) {
numberOfRows += info.num_rows;
}
return numberOfRows;
}
public static BlockletIndex getBlockletIndex(List<EncodedTablePage> encodedTablePageList,
List<CarbonMeasure> carbonMeasureList) {
BlockletMinMaxIndex blockletMinMaxIndex = new BlockletMinMaxIndex();
// Calculating min/max for every each column.
TablePageStatistics stats = new TablePageStatistics(encodedTablePageList.get(0).getDimensions(),
encodedTablePageList.get(0).getMeasures());
byte[][] minCol = stats.getDimensionMinValue().clone();
byte[][] maxCol = stats.getDimensionMaxValue().clone();
for (EncodedTablePage encodedTablePage : encodedTablePageList) {
stats = new TablePageStatistics(encodedTablePage.getDimensions(),
encodedTablePage.getMeasures());
byte[][] columnMaxData = stats.getDimensionMaxValue();
byte[][] columnMinData = stats.getDimensionMinValue();
for (int i = 0; i < maxCol.length; i++) {
if (ByteUtil.UnsafeComparer.INSTANCE.compareTo(columnMaxData[i], maxCol[i]) > 0) {
maxCol[i] = columnMaxData[i];
}
if (ByteUtil.UnsafeComparer.INSTANCE.compareTo(columnMinData[i], minCol[i]) < 0) {
minCol[i] = columnMinData[i];
}
}
}
// Writing min/max to thrift file
for (byte[] max : maxCol) {
blockletMinMaxIndex.addToMax_values(ByteBuffer.wrap(max));
}
for (byte[] min : minCol) {
blockletMinMaxIndex.addToMin_values(ByteBuffer.wrap(min));
}
stats = new TablePageStatistics(encodedTablePageList.get(0).getDimensions(),
encodedTablePageList.get(0).getMeasures());
byte[][] measureMaxValue = stats.getMeasureMaxValue().clone();
byte[][] measureMinValue = stats.getMeasureMinValue().clone();
byte[] minVal = null;
byte[] maxVal = null;
for (int i = 1; i < encodedTablePageList.size(); i++) {
for (int j = 0; j < measureMinValue.length; j++) {
stats = new TablePageStatistics(
encodedTablePageList.get(i).getDimensions(), encodedTablePageList.get(i).getMeasures());
minVal = stats.getMeasureMinValue()[j];
maxVal = stats.getMeasureMaxValue()[j];
if (compareMeasureData(measureMaxValue[j], maxVal, carbonMeasureList.get(j).getDataType())
< 0) {
measureMaxValue[j] = maxVal.clone();
}
if (compareMeasureData(measureMinValue[j], minVal, carbonMeasureList.get(j).getDataType())
> 0) {
measureMinValue[j] = minVal.clone();
}
}
}
for (byte[] max : measureMaxValue) {
blockletMinMaxIndex.addToMax_values(ByteBuffer.wrap(max));
}
for (byte[] min : measureMinValue) {
blockletMinMaxIndex.addToMin_values(ByteBuffer.wrap(min));
}
BlockletBTreeIndex blockletBTreeIndex = new BlockletBTreeIndex();
byte[] startKey = encodedTablePageList.get(0).getPageKey().serializeStartKey();
blockletBTreeIndex.setStart_key(startKey);
byte[] endKey = encodedTablePageList.get(
encodedTablePageList.size() - 1).getPageKey().serializeEndKey();
blockletBTreeIndex.setEnd_key(endKey);
BlockletIndex blockletIndex = new BlockletIndex();
blockletIndex.setMin_max_index(blockletMinMaxIndex);
blockletIndex.setB_tree_index(blockletBTreeIndex);
return blockletIndex;
}
/**
* @param blockIndex
* @param encoding
* @param columnSchemas
* @param segmentProperties
* @return return true if given encoding is present in column
*/
private static boolean containsEncoding(int blockIndex, Encoding encoding,
List<ColumnSchema> columnSchemas, SegmentProperties segmentProperties) {
Set<Integer> dimOrdinals = segmentProperties.getDimensionOrdinalForBlock(blockIndex);
// column groups will always have dictionary encoding
if (dimOrdinals.size() > 1 && Encoding.DICTIONARY == encoding) {
return true;
}
for (Integer dimOrdinal : dimOrdinals) {
if (columnSchemas.get(dimOrdinal).encoders.contains(encoding)) {
return true;
}
}
return false;
}
/**
* Right now it is set to default values. We may use this in future
*/
public static ChunkCompressionMeta getSnappyChunkCompressionMeta() {
ChunkCompressionMeta chunkCompressionMeta = new ChunkCompressionMeta();
chunkCompressionMeta.setCompression_codec(CompressionCodec.SNAPPY);
chunkCompressionMeta.setTotal_compressed_size(0);
chunkCompressionMeta.setTotal_uncompressed_size(0);
return chunkCompressionMeta;
}
/**
* Below method will be used to get the index header
*
* @param columnCardinality cardinality of each column
* @param columnSchemaList list of column present in the table
* @return Index header object
*/
public static IndexHeader getIndexHeader(int[] columnCardinality,
List<ColumnSchema> columnSchemaList, int bucketNumber) {
// create segment info object
SegmentInfo segmentInfo = new SegmentInfo();
// set the number of columns
segmentInfo.setNum_cols(columnSchemaList.size());
// setting the column cardinality
segmentInfo.setColumn_cardinalities(CarbonUtil.convertToIntegerList(columnCardinality));
// create index header object
IndexHeader indexHeader = new IndexHeader();
ColumnarFormatVersion version = CarbonProperties.getInstance().getFormatVersion();
indexHeader.setVersion(version.number());
// set the segment info
indexHeader.setSegment_info(segmentInfo);
// set the column names
indexHeader.setTable_columns(columnSchemaList);
// set the bucket number
indexHeader.setBucket_id(bucketNumber);
return indexHeader;
}
/**
* Below method will be used to get the block index info thrift object for
* each block present in the segment
*
* @param blockIndexInfoList block index info list
* @return list of block index
*/
public static List<BlockIndex> getBlockIndexInfo(List<BlockIndexInfo> blockIndexInfoList) {
List<BlockIndex> thriftBlockIndexList = new ArrayList<BlockIndex>();
BlockIndex blockIndex = null;
// below code to create block index info object for each block
for (BlockIndexInfo blockIndexInfo : blockIndexInfoList) {
blockIndex = new BlockIndex();
blockIndex.setNum_rows(blockIndexInfo.getNumberOfRows());
blockIndex.setOffset(blockIndexInfo.getOffset());
blockIndex.setFile_name(blockIndexInfo.getFileName());
blockIndex.setBlock_index(getBlockletIndex(blockIndexInfo.getBlockletIndex()));
if (blockIndexInfo.getBlockletInfo() != null) {
blockIndex.setBlocklet_info(getBlocletInfo3(blockIndexInfo.getBlockletInfo()));
}
thriftBlockIndexList.add(blockIndex);
}
return thriftBlockIndexList;
}
private static BlockletInfo3 getBlocletInfo3(
org.apache.carbondata.core.metadata.blocklet.BlockletInfo blockletInfo) {
List<Long> dimensionChunkOffsets = blockletInfo.getDimensionChunkOffsets();
dimensionChunkOffsets.addAll(blockletInfo.getMeasureChunkOffsets());
List<Integer> dimensionChunksLength = blockletInfo.getDimensionChunksLength();
dimensionChunksLength.addAll(blockletInfo.getMeasureChunksLength());
return new BlockletInfo3(blockletInfo.getNumberOfRows(), dimensionChunkOffsets,
dimensionChunksLength, blockletInfo.getDimensionOffset(), blockletInfo.getMeasureOffsets(),
blockletInfo.getNumberOfPages());
}
/**
* return DataChunk3 that contains the input DataChunk2 list
*/
public static DataChunk3 getDataChunk3(List<DataChunk2> dataChunksList) {
int offset = 0;
DataChunk3 dataChunk = new DataChunk3();
List<Integer> pageOffsets = new ArrayList<>();
List<Integer> pageLengths = new ArrayList<>();
int length = 0;
for (DataChunk2 dataChunk2 : dataChunksList) {
pageOffsets.add(offset);
length = dataChunk2.getData_page_length() + dataChunk2.getRle_page_length() +
dataChunk2.getRowid_page_length();
pageLengths.add(length);
offset += length;
}
dataChunk.setData_chunk_list(dataChunksList);
dataChunk.setPage_length(pageLengths);
dataChunk.setPage_offset(pageOffsets);
return dataChunk;
}
/**
* return DataChunk3 for the dimension column (specifed by `columnIndex`)
* in `encodedTablePageList`
*/
public static DataChunk3 getDimensionDataChunk3(List<EncodedTablePage> encodedTablePageList,
int columnIndex) throws IOException {
List<DataChunk2> dataChunksList = new ArrayList<>(encodedTablePageList.size());
for (EncodedTablePage encodedTablePage : encodedTablePageList) {
dataChunksList.add(encodedTablePage.getDimension(columnIndex).getPageMetadata());
}
return CarbonMetadataUtil.getDataChunk3(dataChunksList);
}
/**
* return DataChunk3 for the measure column (specifed by `columnIndex`)
* in `encodedTablePageList`
*/
public static DataChunk3 getMeasureDataChunk3(List<EncodedTablePage> encodedTablePageList,
int columnIndex) throws IOException {
List<DataChunk2> dataChunksList = new ArrayList<>(encodedTablePageList.size());
for (EncodedTablePage encodedTablePage : encodedTablePageList) {
dataChunksList.add(encodedTablePage.getMeasure(columnIndex).getPageMetadata());
}
return CarbonMetadataUtil.getDataChunk3(dataChunksList);
}
private static int compareMeasureData(byte[] first, byte[] second, DataType dataType) {
ByteBuffer firstBuffer = null;
ByteBuffer secondBuffer = null;
if (dataType == DataTypes.BOOLEAN) {
return first[0] - second[0];
} else if (dataType == DataTypes.DOUBLE) {
firstBuffer = ByteBuffer.allocate(8);
firstBuffer.put(first);
secondBuffer = ByteBuffer.allocate(8);
secondBuffer.put(second);
firstBuffer.flip();
secondBuffer.flip();
return (int) (firstBuffer.getDouble() - secondBuffer.getDouble());
} else if (dataType == DataTypes.LONG || dataType == DataTypes.INT
|| dataType == DataTypes.SHORT) {
firstBuffer = ByteBuffer.allocate(8);
firstBuffer.put(first);
secondBuffer = ByteBuffer.allocate(8);
secondBuffer.put(second);
firstBuffer.flip();
secondBuffer.flip();
return (int) (firstBuffer.getLong() - secondBuffer.getLong());
} else if (dataType == DataTypes.DECIMAL) {
return DataTypeUtil.byteToBigDecimal(first).compareTo(DataTypeUtil.byteToBigDecimal(second));
} else {
throw new IllegalArgumentException("Invalid data type");
}
}
/**
* Below method will be used to prepare the file header object for carbondata file
*
* @param isFooterPresent is footer present in carbon data file
* @param columnSchemaList list of column schema
* @param schemaUpdatedTimeStamp schema updated time stamp to be used for restructure scenarios
* @return file header thrift object
*/
public static FileHeader getFileHeader(boolean isFooterPresent,
List<ColumnSchema> columnSchemaList, long schemaUpdatedTimeStamp) {
FileHeader fileHeader = new FileHeader();
ColumnarFormatVersion version = CarbonProperties.getInstance().getFormatVersion();
fileHeader.setIs_footer_present(isFooterPresent);
fileHeader.setColumn_schema(columnSchemaList);
fileHeader.setVersion(version.number());
fileHeader.setTime_stamp(schemaUpdatedTimeStamp);
return fileHeader;
}
}