tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java - carbondata - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.carbondata.tool;

 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
 import java.util.LinkedList;
 import java.util.List;

 import org.apache.carbondata.core.datastore.FileReader;
 import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk;
 import org.apache.carbondata.core.datastore.compression.Compressor;
 import org.apache.carbondata.core.datastore.compression.CompressorFactory;
 import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
 import org.apache.carbondata.core.datastore.impl.FileFactory;
 import org.apache.carbondata.core.memory.MemoryException;
 import org.apache.carbondata.core.metadata.datatype.DataType;
 import org.apache.carbondata.core.metadata.datatype.DataTypes;
 import org.apache.carbondata.core.metadata.encoder.Encoding;
 import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
 import org.apache.carbondata.core.reader.CarbonFooterReaderV3;
 import org.apache.carbondata.core.reader.CarbonHeaderReader;
 import org.apache.carbondata.core.scan.result.vector.CarbonDictionary;
 import org.apache.carbondata.core.util.ByteUtil;
 import org.apache.carbondata.core.util.CarbonMetadataUtil;
 import org.apache.carbondata.core.util.CarbonUtil;
 import org.apache.carbondata.core.util.DataTypeUtil;
 import org.apache.carbondata.core.util.path.CarbonTablePath;
 import org.apache.carbondata.format.BlockletIndex;
 import org.apache.carbondata.format.BlockletInfo3;
 import org.apache.carbondata.format.DataChunk3;
 import org.apache.carbondata.format.FileFooter3;
 import org.apache.carbondata.format.FileHeader;
 import org.apache.carbondata.format.LocalDictionaryChunk;

 import static org.apache.carbondata.core.constants.CarbonCommonConstants.FILE_SEPARATOR;

 /**
  * Contains information extracted from a .carbondata file
  */
 class DataFile {
   private CarbonFile dataFile;

   // file full path
   private String filePath;

   // reader for this file
   private FileReader fileReader;

   // shard name
   private String shardName;

   // part id
   private String partNo;

   private long fileSizeInBytes;
   private long footerSizeInBytes;

   // size in bytes of each blocklet
   private LinkedList<Long> blockletSizeInBytes = new LinkedList<>();

   // data size in bytes of each column in each blocklet
   private LinkedList<LinkedList<Long>> columnDataSizeInBytes = new LinkedList<>();
   // meta size (DataChunk3) in bytes of each column in each blocklet
   private LinkedList<LinkedList<Long>> columnMetaSizeInBytes = new LinkedList<>();

   private FileHeader header;
   private FileFooter3 footer;
   private long footerOffset;
   private List<ColumnSchema> schema;
   private List<Blocklet> blocklets;

   DataFile(CarbonFile dataFile) {
     this.dataFile = dataFile;
     this.filePath = dataFile.getPath();
     this.fileSizeInBytes = dataFile.getSize();
   }

   void collectAllMeta() throws IOException {
     FileHeader header = null;
     FileFooter3 footer = null;
     try {
       header = readHeader();
     } catch (IOException e) {
       throw new IOException("failed to read header in " + dataFile.getPath(), e);
     }
     if (header.isSetSync_marker()) {
       // if sync_marker is set, it is a streaming format file
       throw new UnsupportedOperationException("streaming file is not supported");
     }
     try {
       footer = readFooter();
     } catch (IOException e) {
       throw new IOException("failed to read footer in " + dataFile.getPath(), e);
     }

     this.header = header;
     this.footer = footer;
     String filePath = dataFile.getPath();
     // folder path that contains this file
     String fileName =
         filePath.substring(filePath.replaceAll("\\\\", FILE_SEPARATOR).lastIndexOf(FILE_SEPARATOR));
     this.shardName = CarbonTablePath.getShardName(fileName);
     this.partNo = CarbonTablePath.DataFileUtil.getPartNo(fileName);

     // calculate blocklet size and column size
     // first calculate the header size, it equals the offset of first
     // column chunk in first blocklet
     long headerSizeInBytes = footer.blocklet_info_list3.get(0).column_data_chunks_offsets.get(0);
     long previousOffset = headerSizeInBytes;
     for (BlockletInfo3 blockletInfo3 : footer.blocklet_info_list3) {
       // calculate blocklet size in bytes
       long blockletOffset = blockletInfo3.column_data_chunks_offsets.get(0);
       blockletSizeInBytes.add(blockletOffset - previousOffset);
       previousOffset = blockletOffset;

       // calculate column size in bytes for each column
       LinkedList<Long> columnDataSize = new LinkedList<>();
       LinkedList<Long> columnMetaSize = new LinkedList<>();
       long previousChunkOffset = blockletInfo3.column_data_chunks_offsets.get(0);
       for (int i = 0; i < schema.size(); i++) {
         columnDataSize.add(blockletInfo3.column_data_chunks_offsets.get(i) - previousChunkOffset);
         columnMetaSize.add(blockletInfo3.column_data_chunks_length.get(i).longValue());
         previousChunkOffset = blockletInfo3.column_data_chunks_offsets.get(i);
       }
       // last column chunk data size
       columnDataSize.add(fileSizeInBytes - footerSizeInBytes - previousChunkOffset);
       columnDataSize.removeFirst();
       this.columnDataSizeInBytes.add(columnDataSize);
       this.columnMetaSizeInBytes.add(columnMetaSize);

     }
     // last blocklet size
     blockletSizeInBytes.add(
         fileSizeInBytes - footerSizeInBytes - headerSizeInBytes - previousOffset);
     this.blockletSizeInBytes.removeFirst();

     assert (blockletSizeInBytes.size() == getNumBlocklets());
   }

   FileHeader readHeader() throws IOException {
     CarbonHeaderReader reader = new CarbonHeaderReader(dataFile.getPath());
     this.schema = reader.readSchema();
     return reader.readHeader();
   }

   FileFooter3 readFooter() throws IOException {
     if (this.fileReader == null) {
       this.fileReader = FileFactory.getFileHolder(FileFactory.getFileType(dataFile.getPath()));
     }
     ByteBuffer buffer = fileReader.readByteBuffer(FileFactory.getUpdatedFilePath(
         dataFile.getPath()), dataFile.getSize() - 8, 8);
     this.footerOffset = buffer.getLong();
     this.footerSizeInBytes = this.fileSizeInBytes - footerOffset;
     CarbonFooterReaderV3 footerReader =
         new CarbonFooterReaderV3(dataFile.getAbsolutePath(), footerOffset);
     return footerReader.readFooterVersion3();
   }

   String getFilePath() {
     return filePath;
   }

   String getShardName() {
     return shardName;
   }

   String getPartNo() {
     return partNo;
   }

   FileHeader getHeader() {
     return header;
   }

   FileFooter3 getFooter() {
     return footer;
   }

   List<ColumnSchema> getSchema() {
     return schema;
   }

   FileReader getFileReader() {
     return fileReader;
   }

   long getFooterOffset() {
     return footerOffset;
   }

   int getNumBlocklet() {
     return blockletSizeInBytes.size();
   }

   long getFileSizeInBytes() {
     return fileSizeInBytes;
   }

   int getColumnIndex(String columnName) {
     List<ColumnSchema> columns = getSchema();
     for (int i = 0; i < columns.size(); i++) {
       if (columns.get(i).getColumnName().equalsIgnoreCase(columnName)) {
         return i;
       }
     }
     throw new IllegalArgumentException(columnName + " not found");
   }

   ColumnSchema getColumn(String columnName) {
     List<ColumnSchema> columns = getSchema();
     for (int i = 0; i < columns.size(); i++) {
       if (columns.get(i).getColumnName().equalsIgnoreCase(columnName)) {
         return columns.get(i);
       }
     }
     throw new IllegalArgumentException(columnName + " not found");
   }

   int numDimensions() {
     int numDimensions = 0;
     List<ColumnSchema> columns = getSchema();
     for (ColumnSchema column : columns) {
       if (column.isDimensionColumn()) {
         numDimensions++;
       }
     }
     return numDimensions;
   }

   private int getNumBlocklets() {
     return footer.blocklet_info_list3.size();
   }

   Long getBlockletSizeInBytes(int blockletId) {
     if (blockletId < 0 || blockletId >= getNumBlocklets()) {
       throw new IllegalArgumentException("invalid blockletId: " + blockletId);
     }
     return blockletSizeInBytes.get(blockletId);
   }

   Long getColumnDataSizeInBytes(int blockletId, int columnIndex) {
     if (blockletId < 0 || blockletId >= getNumBlocklets()) {
       throw new IllegalArgumentException("invalid blockletId: " + blockletId);
     }
     LinkedList<Long> columnSize = this.columnDataSizeInBytes.get(blockletId);
     if (columnIndex >= columnSize.size()) {
       throw new IllegalArgumentException("invalid columnIndex: " + columnIndex);
     }
     return columnSize.get(columnIndex);
   }

   Long getColumnMetaSizeInBytes(int blockletId, int columnIndex) {
     if (blockletId < 0 || blockletId >= getNumBlocklets()) {
       throw new IllegalArgumentException("invalid blockletId: " + blockletId);
     }
     LinkedList<Long> columnSize = this.columnMetaSizeInBytes.get(blockletId);
     if (columnIndex >= columnSize.size()) {
       throw new IllegalArgumentException("invalid columnIndex: " + columnIndex);
     }
     return columnSize.get(columnIndex);
   }

   void initAllBlockletStats(String columnName) throws IOException, MemoryException {
     int columnIndex = -1;
     ColumnSchema column = null;
     for (int i = 0; i < schema.size(); i++) {
       if (schema.get(i).getColumnName().equalsIgnoreCase(columnName)) {
         columnIndex = i;
         column = schema.get(i);
       }
     }
     if (column == null) {
       throw new IllegalArgumentException("column name " + columnName + " not exist");
     }
     List<Blocklet> blocklets = new LinkedList<>();
     for (int blockletId = 0; blockletId < footer.blocklet_index_list.size(); blockletId++) {
       blocklets.add(new Blocklet(this, blockletId, column, columnIndex, footer));
     }
     this.blocklets = blocklets;
   }

   List<Blocklet> getAllBlocklets() {
     return blocklets;
   }

   // Column chunk in one blocklet
   class ColumnChunk {

     ColumnSchema column;

     // true if local dictionary is used in this column chunk
     boolean localDict;

     // average length in bytes for all pages in this column chunk
     long avgPageLengthInBytes;

     // the size in bytes of local dictionary for this column chunk
     long blocketletDictionarySize;

     // the number of entry in local dictionary for this column chunk
     long blockletDictionaryEntries;

     // min/max stats of this column chunk
     byte[] min, max;

     // to set whether min max is present for the column chunck, as we may not right min max after
     // specific size
     boolean isMinMaxPresent;

     // percentage of min/max comparing to min/max scope collected in all blocklets
     // they are set after calculation in DataSummary
     double minPercentage, maxPercentage;

     DataChunk3 dataChunk;

     /**
      * Constructor
      * @param blockletInfo blocklet info which this column chunk belongs to
      * @param index blocklet index which this column chunk belongs to
      * @param column column schema of this column chunk
      * @param columnIndex column index of this column chunk
      */
     ColumnChunk(BlockletInfo3 blockletInfo, BlockletIndex index, ColumnSchema column,
         int columnIndex) throws IOException, MemoryException {
       this.column = column;
       min = index.min_max_index.min_values.get(columnIndex).array();
       max = index.min_max_index.max_values.get(columnIndex).array();
       isMinMaxPresent = index.min_max_index.min_max_presence.get(columnIndex);

       // read the column chunk metadata: DataChunk3
       ByteBuffer buffer = fileReader.readByteBuffer(
           filePath, blockletInfo.column_data_chunks_offsets.get(columnIndex),
           blockletInfo.column_data_chunks_length.get(columnIndex));
       dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array()));
       this.localDict = dataChunk.isSetLocal_dictionary();
       if (this.localDict) {
         String compressorName = CarbonMetadataUtil.getCompressorNameFromChunkMeta(
             dataChunk.data_chunk_list.get(0).chunk_meta);
         LocalDictionaryChunk dictionaryChunk = dataChunk.local_dictionary;
         Compressor comp = CompressorFactory.getInstance().getCompressor(compressorName);
         CarbonDictionary dictionary = DimensionRawColumnChunk.getDictionary(dictionaryChunk, comp);
         blockletDictionaryEntries = dictionary.getDictionaryActualSize();
         blocketletDictionarySize = dataChunk.local_dictionary.dictionary_data.array().length;
       }
       long pageLength = 0;
       for (int size : dataChunk.page_length) {
         pageLength += size;
       }
       avgPageLengthInBytes = pageLength / dataChunk.page_length.size();
     }

     void setMinPercentage(double minPercentage) {
       this.minPercentage = minPercentage;
     }

     void setMaxPercentage(double maxPercentage) {
       this.maxPercentage = maxPercentage;
     }

     double getMinPercentage() {
       return minPercentage;
     }

     double getMaxPercentage() {
       return maxPercentage;
     }

     DataType getDataType() {
       return column.getDataType();
     }

     public DataChunk3 getDataChunk3() {
       return dataChunk;
     }

     byte[] min(byte[] minValue) {
       if (minValue == null) {
         return min;
       } else {
         return ByteUtil.UnsafeComparer.INSTANCE.compareTo(minValue, min) < 0 ? minValue : min;
       }
     }

     byte[] max(byte[] maxValue) {
       if (maxValue == null) {
         return max;
       } else {
         return ByteUtil.UnsafeComparer.INSTANCE.compareTo(maxValue, max) > 0 ? maxValue : max;
       }
     }
   }

   class Blocklet {
     DataFile file;
     int id;
     ColumnChunk columnChunk;

     Blocklet(DataFile file, int blockletId, ColumnSchema column, int columnIndex,
         FileFooter3 footer) throws IOException, MemoryException {
       this.file = file;
       this.id = blockletId;
       BlockletIndex index = footer.blocklet_index_list.get(blockletId);
       BlockletInfo3 info = footer.blocklet_info_list3.get(blockletId);
       this.columnChunk = new ColumnChunk(info, index, column, columnIndex);
     }

     String getShardName() {
       return file.getShardName();
     }

     ColumnChunk getColumnChunk() {
       return columnChunk;
     }

     // compute and set min and max percentage for this blocklet
     void computePercentage(byte[] shardMin, byte[] shardMax) {
       double min = computePercentage(columnChunk.min, shardMin, shardMax, columnChunk.column);
       double max = computePercentage(columnChunk.max, shardMin, shardMax, columnChunk.column);
       columnChunk.setMinPercentage(min);
       columnChunk.setMaxPercentage(max);
     }

     /**
      * Calculate data percentage in [min, max] scope based on data type
      * @param data data to calculate the percentage
      * @param min min value
      * @param max max value
      * @param column column schema including data type
      * @return result
      */
     private double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) {
       if (column.getDataType() == DataTypes.STRING || column.getDataType() == DataTypes.BOOLEAN
           || column.hasEncoding(Encoding.DICTIONARY) || column.getDataType().isComplexType()) {
         // for string, we do not calculate
         return 0;
       } else if (DataTypes.isDecimal(column.getDataType())) {
         BigDecimal minValue = DataTypeUtil.byteToBigDecimal(min);
         BigDecimal dataValue = DataTypeUtil.byteToBigDecimal(data).subtract(minValue);
         BigDecimal factorValue = DataTypeUtil.byteToBigDecimal(max).subtract(minValue);
         return dataValue.divide(factorValue).doubleValue();
       }
       double dataValue, minValue, factorValue;
       if (columnChunk.column.isDimensionColumn() &&
           DataTypeUtil.isPrimitiveColumn(columnChunk.column.getDataType())) {
         minValue = Double.valueOf(String.valueOf(
             DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(min, column.getDataType())));
         dataValue = Double.valueOf(String.valueOf(
             DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(data, column.getDataType())));
         factorValue = Double.valueOf(String.valueOf(
             DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(max, column.getDataType())))
             - minValue;
       } else if (column.getDataType() == DataTypes.SHORT) {
         minValue = ByteUtil.toShort(min, 0);
         dataValue = ByteUtil.toShort(data, 0) - minValue;
         factorValue = ByteUtil.toShort(max, 0) - ByteUtil.toShort(min, 0);
       } else if (column.getDataType() == DataTypes.INT) {
         if (column.isSortColumn()) {
           minValue = ByteUtil.toXorInt(min, 0, min.length);
           dataValue = ByteUtil.toXorInt(data, 0, data.length) - minValue;
           factorValue =
               ByteUtil.toXorInt(max, 0, max.length) - ByteUtil.toXorInt(min, 0, min.length);
         } else {
           minValue = ByteUtil.toLong(min, 0, min.length);
           dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
           factorValue =
               ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
         }
       } else if (column.getDataType() == DataTypes.LONG) {
         minValue = ByteUtil.toLong(min, 0, min.length);
         dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
         factorValue =
             ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
       } else if (column.getDataType() == DataTypes.DATE) {
         minValue = ByteUtil.toInt(min, 0, min.length);
         dataValue = ByteUtil.toInt(data, 0, data.length) - minValue;
         factorValue =
             ByteUtil.toInt(max, 0, max.length) - ByteUtil.toInt(min, 0, min.length);
       } else if (column.getDataType() == DataTypes.TIMESTAMP) {
         minValue = ByteUtil.toLong(min, 0, min.length);
         dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
         factorValue =
             ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
       } else if (column.getDataType() == DataTypes.DOUBLE) {
         minValue = ByteUtil.toDouble(min, 0, min.length);
         dataValue = ByteUtil.toDouble(data, 0, data.length) - minValue;
         factorValue =
             ByteUtil.toDouble(max, 0, max.length) - ByteUtil.toDouble(min, 0, min.length);
       } else {
         throw new UnsupportedOperationException("data type: " + column.getDataType());
       }

       if (factorValue == 0d) {
         return 1;
       }
       return dataValue / factorValue;
     }
   }

   public void close() throws IOException {
     this.fileReader.finish();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.carbondata.tool;

	import java.io.ByteArrayInputStream;
	import java.io.IOException;
	import java.math.BigDecimal;
	import java.nio.ByteBuffer;
	import java.util.LinkedList;
	import java.util.List;

	import org.apache.carbondata.core.datastore.FileReader;
	import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk;
	import org.apache.carbondata.core.datastore.compression.Compressor;
	import org.apache.carbondata.core.datastore.compression.CompressorFactory;
	import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
	import org.apache.carbondata.core.datastore.impl.FileFactory;
	import org.apache.carbondata.core.memory.MemoryException;
	import org.apache.carbondata.core.metadata.datatype.DataType;
	import org.apache.carbondata.core.metadata.datatype.DataTypes;
	import org.apache.carbondata.core.metadata.encoder.Encoding;
	import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
	import org.apache.carbondata.core.reader.CarbonFooterReaderV3;
	import org.apache.carbondata.core.reader.CarbonHeaderReader;
	import org.apache.carbondata.core.scan.result.vector.CarbonDictionary;
	import org.apache.carbondata.core.util.ByteUtil;
	import org.apache.carbondata.core.util.CarbonMetadataUtil;
	import org.apache.carbondata.core.util.CarbonUtil;
	import org.apache.carbondata.core.util.DataTypeUtil;
	import org.apache.carbondata.core.util.path.CarbonTablePath;
	import org.apache.carbondata.format.BlockletIndex;
	import org.apache.carbondata.format.BlockletInfo3;
	import org.apache.carbondata.format.DataChunk3;
	import org.apache.carbondata.format.FileFooter3;
	import org.apache.carbondata.format.FileHeader;
	import org.apache.carbondata.format.LocalDictionaryChunk;

	import static org.apache.carbondata.core.constants.CarbonCommonConstants.FILE_SEPARATOR;

	/**
	* Contains information extracted from a .carbondata file
	*/
	class DataFile {
	private CarbonFile dataFile;

	// file full path
	private String filePath;

	// reader for this file
	private FileReader fileReader;

	// shard name
	private String shardName;

	// part id
	private String partNo;

	private long fileSizeInBytes;
	private long footerSizeInBytes;

	// size in bytes of each blocklet
	private LinkedList<Long> blockletSizeInBytes = new LinkedList<>();

	// data size in bytes of each column in each blocklet
	private LinkedList<LinkedList<Long>> columnDataSizeInBytes = new LinkedList<>();
	// meta size (DataChunk3) in bytes of each column in each blocklet
	private LinkedList<LinkedList<Long>> columnMetaSizeInBytes = new LinkedList<>();

	private FileHeader header;
	private FileFooter3 footer;
	private long footerOffset;
	private List<ColumnSchema> schema;
	private List<Blocklet> blocklets;

	DataFile(CarbonFile dataFile) {
	this.dataFile = dataFile;
	this.filePath = dataFile.getPath();
	this.fileSizeInBytes = dataFile.getSize();
	}

	void collectAllMeta() throws IOException {
	FileHeader header = null;
	FileFooter3 footer = null;
	try {
	header = readHeader();
	} catch (IOException e) {
	throw new IOException("failed to read header in " + dataFile.getPath(), e);
	}
	if (header.isSetSync_marker()) {
	// if sync_marker is set, it is a streaming format file
	throw new UnsupportedOperationException("streaming file is not supported");
	}
	try {
	footer = readFooter();
	} catch (IOException e) {
	throw new IOException("failed to read footer in " + dataFile.getPath(), e);
	}

	this.header = header;
	this.footer = footer;
	String filePath = dataFile.getPath();
	// folder path that contains this file
	String fileName =
	filePath.substring(filePath.replaceAll("\\\\", FILE_SEPARATOR).lastIndexOf(FILE_SEPARATOR));
	this.shardName = CarbonTablePath.getShardName(fileName);
	this.partNo = CarbonTablePath.DataFileUtil.getPartNo(fileName);

	// calculate blocklet size and column size
	// first calculate the header size, it equals the offset of first
	// column chunk in first blocklet
	long headerSizeInBytes = footer.blocklet_info_list3.get(0).column_data_chunks_offsets.get(0);
	long previousOffset = headerSizeInBytes;
	for (BlockletInfo3 blockletInfo3 : footer.blocklet_info_list3) {
	// calculate blocklet size in bytes
	long blockletOffset = blockletInfo3.column_data_chunks_offsets.get(0);
	blockletSizeInBytes.add(blockletOffset - previousOffset);
	previousOffset = blockletOffset;

	// calculate column size in bytes for each column
	LinkedList<Long> columnDataSize = new LinkedList<>();
	LinkedList<Long> columnMetaSize = new LinkedList<>();
	long previousChunkOffset = blockletInfo3.column_data_chunks_offsets.get(0);
	for (int i = 0; i < schema.size(); i++) {
	columnDataSize.add(blockletInfo3.column_data_chunks_offsets.get(i) - previousChunkOffset);
	columnMetaSize.add(blockletInfo3.column_data_chunks_length.get(i).longValue());
	previousChunkOffset = blockletInfo3.column_data_chunks_offsets.get(i);
	}
	// last column chunk data size
	columnDataSize.add(fileSizeInBytes - footerSizeInBytes - previousChunkOffset);
	columnDataSize.removeFirst();
	this.columnDataSizeInBytes.add(columnDataSize);
	this.columnMetaSizeInBytes.add(columnMetaSize);

	}
	// last blocklet size
	blockletSizeInBytes.add(
	fileSizeInBytes - footerSizeInBytes - headerSizeInBytes - previousOffset);
	this.blockletSizeInBytes.removeFirst();

	assert (blockletSizeInBytes.size() == getNumBlocklets());
	}

	FileHeader readHeader() throws IOException {
	CarbonHeaderReader reader = new CarbonHeaderReader(dataFile.getPath());
	this.schema = reader.readSchema();
	return reader.readHeader();
	}

	FileFooter3 readFooter() throws IOException {
	if (this.fileReader == null) {
	this.fileReader = FileFactory.getFileHolder(FileFactory.getFileType(dataFile.getPath()));
	}
	ByteBuffer buffer = fileReader.readByteBuffer(FileFactory.getUpdatedFilePath(
	dataFile.getPath()), dataFile.getSize() - 8, 8);
	this.footerOffset = buffer.getLong();
	this.footerSizeInBytes = this.fileSizeInBytes - footerOffset;
	CarbonFooterReaderV3 footerReader =
	new CarbonFooterReaderV3(dataFile.getAbsolutePath(), footerOffset);
	return footerReader.readFooterVersion3();
	}

	String getFilePath() {
	return filePath;
	}

	String getShardName() {
	return shardName;
	}

	String getPartNo() {
	return partNo;
	}

	FileHeader getHeader() {
	return header;
	}

	FileFooter3 getFooter() {
	return footer;
	}

	List<ColumnSchema> getSchema() {
	return schema;
	}

	FileReader getFileReader() {
	return fileReader;
	}

	long getFooterOffset() {
	return footerOffset;
	}

	int getNumBlocklet() {
	return blockletSizeInBytes.size();
	}

	long getFileSizeInBytes() {
	return fileSizeInBytes;
	}

	int getColumnIndex(String columnName) {
	List<ColumnSchema> columns = getSchema();
	for (int i = 0; i < columns.size(); i++) {
	if (columns.get(i).getColumnName().equalsIgnoreCase(columnName)) {
	return i;
	}
	}
	throw new IllegalArgumentException(columnName + " not found");
	}

	ColumnSchema getColumn(String columnName) {
	List<ColumnSchema> columns = getSchema();
	for (int i = 0; i < columns.size(); i++) {
	if (columns.get(i).getColumnName().equalsIgnoreCase(columnName)) {
	return columns.get(i);
	}
	}
	throw new IllegalArgumentException(columnName + " not found");
	}

	int numDimensions() {
	int numDimensions = 0;
	List<ColumnSchema> columns = getSchema();
	for (ColumnSchema column : columns) {
	if (column.isDimensionColumn()) {
	numDimensions++;
	}
	}
	return numDimensions;
	}

	private int getNumBlocklets() {
	return footer.blocklet_info_list3.size();
	}

	Long getBlockletSizeInBytes(int blockletId) {
	if (blockletId < 0 \|\| blockletId >= getNumBlocklets()) {
	throw new IllegalArgumentException("invalid blockletId: " + blockletId);
	}
	return blockletSizeInBytes.get(blockletId);
	}

	Long getColumnDataSizeInBytes(int blockletId, int columnIndex) {
	if (blockletId < 0 \|\| blockletId >= getNumBlocklets()) {
	throw new IllegalArgumentException("invalid blockletId: " + blockletId);
	}
	LinkedList<Long> columnSize = this.columnDataSizeInBytes.get(blockletId);
	if (columnIndex >= columnSize.size()) {
	throw new IllegalArgumentException("invalid columnIndex: " + columnIndex);
	}
	return columnSize.get(columnIndex);
	}

	Long getColumnMetaSizeInBytes(int blockletId, int columnIndex) {
	if (blockletId < 0 \|\| blockletId >= getNumBlocklets()) {
	throw new IllegalArgumentException("invalid blockletId: " + blockletId);
	}
	LinkedList<Long> columnSize = this.columnMetaSizeInBytes.get(blockletId);
	if (columnIndex >= columnSize.size()) {
	throw new IllegalArgumentException("invalid columnIndex: " + columnIndex);
	}
	return columnSize.get(columnIndex);
	}

	void initAllBlockletStats(String columnName) throws IOException, MemoryException {
	int columnIndex = -1;
	ColumnSchema column = null;
	for (int i = 0; i < schema.size(); i++) {
	if (schema.get(i).getColumnName().equalsIgnoreCase(columnName)) {
	columnIndex = i;
	column = schema.get(i);
	}
	}
	if (column == null) {
	throw new IllegalArgumentException("column name " + columnName + " not exist");
	}
	List<Blocklet> blocklets = new LinkedList<>();
	for (int blockletId = 0; blockletId < footer.blocklet_index_list.size(); blockletId++) {
	blocklets.add(new Blocklet(this, blockletId, column, columnIndex, footer));
	}
	this.blocklets = blocklets;
	}

	List<Blocklet> getAllBlocklets() {
	return blocklets;
	}

	// Column chunk in one blocklet
	class ColumnChunk {

	ColumnSchema column;

	// true if local dictionary is used in this column chunk
	boolean localDict;

	// average length in bytes for all pages in this column chunk
	long avgPageLengthInBytes;

	// the size in bytes of local dictionary for this column chunk
	long blocketletDictionarySize;

	// the number of entry in local dictionary for this column chunk
	long blockletDictionaryEntries;

	// min/max stats of this column chunk
	byte[] min, max;

	// to set whether min max is present for the column chunck, as we may not right min max after
	// specific size
	boolean isMinMaxPresent;

	// percentage of min/max comparing to min/max scope collected in all blocklets
	// they are set after calculation in DataSummary
	double minPercentage, maxPercentage;

	DataChunk3 dataChunk;

	/**
	* Constructor
	* @param blockletInfo blocklet info which this column chunk belongs to
	* @param index blocklet index which this column chunk belongs to
	* @param column column schema of this column chunk
	* @param columnIndex column index of this column chunk
	*/
	ColumnChunk(BlockletInfo3 blockletInfo, BlockletIndex index, ColumnSchema column,
	int columnIndex) throws IOException, MemoryException {
	this.column = column;
	min = index.min_max_index.min_values.get(columnIndex).array();
	max = index.min_max_index.max_values.get(columnIndex).array();
	isMinMaxPresent = index.min_max_index.min_max_presence.get(columnIndex);

	// read the column chunk metadata: DataChunk3
	ByteBuffer buffer = fileReader.readByteBuffer(
	filePath, blockletInfo.column_data_chunks_offsets.get(columnIndex),
	blockletInfo.column_data_chunks_length.get(columnIndex));
	dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array()));
	this.localDict = dataChunk.isSetLocal_dictionary();
	if (this.localDict) {
	String compressorName = CarbonMetadataUtil.getCompressorNameFromChunkMeta(
	dataChunk.data_chunk_list.get(0).chunk_meta);
	LocalDictionaryChunk dictionaryChunk = dataChunk.local_dictionary;
	Compressor comp = CompressorFactory.getInstance().getCompressor(compressorName);
	CarbonDictionary dictionary = DimensionRawColumnChunk.getDictionary(dictionaryChunk, comp);
	blockletDictionaryEntries = dictionary.getDictionaryActualSize();
	blocketletDictionarySize = dataChunk.local_dictionary.dictionary_data.array().length;
	}
	long pageLength = 0;
	for (int size : dataChunk.page_length) {
	pageLength += size;
	}
	avgPageLengthInBytes = pageLength / dataChunk.page_length.size();
	}

	void setMinPercentage(double minPercentage) {
	this.minPercentage = minPercentage;
	}

	void setMaxPercentage(double maxPercentage) {
	this.maxPercentage = maxPercentage;
	}

	double getMinPercentage() {
	return minPercentage;
	}

	double getMaxPercentage() {
	return maxPercentage;
	}

	DataType getDataType() {
	return column.getDataType();
	}

	public DataChunk3 getDataChunk3() {
	return dataChunk;
	}

	byte[] min(byte[] minValue) {
	if (minValue == null) {
	return min;
	} else {
	return ByteUtil.UnsafeComparer.INSTANCE.compareTo(minValue, min) < 0 ? minValue : min;
	}
	}

	byte[] max(byte[] maxValue) {
	if (maxValue == null) {
	return max;
	} else {
	return ByteUtil.UnsafeComparer.INSTANCE.compareTo(maxValue, max) > 0 ? maxValue : max;
	}
	}
	}

	class Blocklet {
	DataFile file;
	int id;
	ColumnChunk columnChunk;

	Blocklet(DataFile file, int blockletId, ColumnSchema column, int columnIndex,
	FileFooter3 footer) throws IOException, MemoryException {
	this.file = file;
	this.id = blockletId;
	BlockletIndex index = footer.blocklet_index_list.get(blockletId);
	BlockletInfo3 info = footer.blocklet_info_list3.get(blockletId);
	this.columnChunk = new ColumnChunk(info, index, column, columnIndex);
	}

	String getShardName() {
	return file.getShardName();
	}

	ColumnChunk getColumnChunk() {
	return columnChunk;
	}

	// compute and set min and max percentage for this blocklet
	void computePercentage(byte[] shardMin, byte[] shardMax) {
	double min = computePercentage(columnChunk.min, shardMin, shardMax, columnChunk.column);
	double max = computePercentage(columnChunk.max, shardMin, shardMax, columnChunk.column);
	columnChunk.setMinPercentage(min);
	columnChunk.setMaxPercentage(max);
	}

	/**
	* Calculate data percentage in [min, max] scope based on data type
	* @param data data to calculate the percentage
	* @param min min value
	* @param max max value
	* @param column column schema including data type
	* @return result
	*/
	private double computePercentage(byte[] data, byte[] min, byte[] max, ColumnSchema column) {
	if (column.getDataType() == DataTypes.STRING \|\| column.getDataType() == DataTypes.BOOLEAN
	\|\| column.hasEncoding(Encoding.DICTIONARY) \|\| column.getDataType().isComplexType()) {
	// for string, we do not calculate
	return 0;
	} else if (DataTypes.isDecimal(column.getDataType())) {
	BigDecimal minValue = DataTypeUtil.byteToBigDecimal(min);
	BigDecimal dataValue = DataTypeUtil.byteToBigDecimal(data).subtract(minValue);
	BigDecimal factorValue = DataTypeUtil.byteToBigDecimal(max).subtract(minValue);
	return dataValue.divide(factorValue).doubleValue();
	}
	double dataValue, minValue, factorValue;
	if (columnChunk.column.isDimensionColumn() &&
	DataTypeUtil.isPrimitiveColumn(columnChunk.column.getDataType())) {
	minValue = Double.valueOf(String.valueOf(
	DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(min, column.getDataType())));
	dataValue = Double.valueOf(String.valueOf(
	DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(data, column.getDataType())));
	factorValue = Double.valueOf(String.valueOf(
	DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(max, column.getDataType())))
	- minValue;
	} else if (column.getDataType() == DataTypes.SHORT) {
	minValue = ByteUtil.toShort(min, 0);
	dataValue = ByteUtil.toShort(data, 0) - minValue;
	factorValue = ByteUtil.toShort(max, 0) - ByteUtil.toShort(min, 0);
	} else if (column.getDataType() == DataTypes.INT) {
	if (column.isSortColumn()) {
	minValue = ByteUtil.toXorInt(min, 0, min.length);
	dataValue = ByteUtil.toXorInt(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toXorInt(max, 0, max.length) - ByteUtil.toXorInt(min, 0, min.length);
	} else {
	minValue = ByteUtil.toLong(min, 0, min.length);
	dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
	}
	} else if (column.getDataType() == DataTypes.LONG) {
	minValue = ByteUtil.toLong(min, 0, min.length);
	dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
	} else if (column.getDataType() == DataTypes.DATE) {
	minValue = ByteUtil.toInt(min, 0, min.length);
	dataValue = ByteUtil.toInt(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toInt(max, 0, max.length) - ByteUtil.toInt(min, 0, min.length);
	} else if (column.getDataType() == DataTypes.TIMESTAMP) {
	minValue = ByteUtil.toLong(min, 0, min.length);
	dataValue = ByteUtil.toLong(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toLong(max, 0, max.length) - ByteUtil.toLong(min, 0, min.length);
	} else if (column.getDataType() == DataTypes.DOUBLE) {
	minValue = ByteUtil.toDouble(min, 0, min.length);
	dataValue = ByteUtil.toDouble(data, 0, data.length) - minValue;
	factorValue =
	ByteUtil.toDouble(max, 0, max.length) - ByteUtil.toDouble(min, 0, min.length);
	} else {
	throw new UnsupportedOperationException("data type: " + column.getDataType());
	}

	if (factorValue == 0d) {
	return 1;
	}
	return dataValue / factorValue;
	}
	}

	public void close() throws IOException {
	this.fileReader.finish();
	}
	}