blob: 53de54a8a4b851aff6327315cb355dee0cdee682 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_EXEC_PARQUET_COMMON_H
#define IMPALA_EXEC_PARQUET_COMMON_H
#include <limits>

#include "common/compiler-util.h"
#include "gen-cpp/Descriptors_types.h"
#include "gen-cpp/parquet_types.h"
#include "runtime/decimal-value.h"
#include "runtime/string-value.h"
#include "util/bit-util.h"
#include "util/decimal-util.h"
/// This file contains common elements between the parquet Writer and Scanner.
namespace impala {
/// TimestampValue is only used by reference in this header (see the ByteSize()
/// specialization further down), so a forward declaration is sufficient.
class TimestampValue;
/// The four ASCII bytes 'P', 'A', 'R', '1'.
/// NOTE(review): presumably the magic number that marks a Parquet file -- confirm
/// against the Parquet format specification and the users of this constant.
const uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
/// NOTE(review): presumably the format version recorded by the writer -- confirm
/// against the users of this constant.
const uint32_t PARQUET_CURRENT_VERSION = 1;
/// Mapping of Impala types to Parquet storage types. The table is indexed by the
/// PrimitiveType enum, so its entries must stay in the same order as that enum.
/// Several Impala types share one storage type (e.g. three entries map to INT32).
/// NOTE(review): the unlabelled entries presumably correspond, in order, to BOOLEAN,
/// TINYINT, SMALLINT, INT, BIGINT, FLOAT and DOUBLE, and "NYI" presumably means "not
/// yet implemented" -- confirm against the PrimitiveType definition.
const parquet::Type::type IMPALA_TO_PARQUET_TYPES[] = {
parquet::Type::BOOLEAN, // Invalid
parquet::Type::BOOLEAN, // NULL type
parquet::Type::BOOLEAN,
parquet::Type::INT32,
parquet::Type::INT32,
parquet::Type::INT32,
parquet::Type::INT64,
parquet::Type::FLOAT,
parquet::Type::DOUBLE,
parquet::Type::INT96, // Timestamp
parquet::Type::BYTE_ARRAY, // String
parquet::Type::BYTE_ARRAY, // Date, NYI
parquet::Type::BYTE_ARRAY, // DateTime, NYI
parquet::Type::BYTE_ARRAY, // Binary NYI
parquet::Type::FIXED_LEN_BYTE_ARRAY, // Decimal
parquet::Type::BYTE_ARRAY, // VARCHAR(N)
parquet::Type::BYTE_ARRAY, // CHAR(N)
};
/// Mapping of Parquet codec enums to Impala codec enums.
/// NOTE(review): presumably indexed by a parquet::CompressionCodec value, with the
/// entries in that enum's order -- confirm. The table has only four entries, so
/// callers need to range-check a codec value before using it as an index.
const THdfsCompression::type PARQUET_TO_IMPALA_CODEC[] = {
THdfsCompression::NONE,
THdfsCompression::SNAPPY,
THdfsCompression::GZIP,
THdfsCompression::LZO
};
/// Mapping of Impala codec enums to Parquet codec enums. Several Impala codecs map to
/// the same Parquet codec (e.g. the entries labelled GZIP and DEFLATE both map to
/// GZIP).
/// NOTE(review): presumably indexed by THdfsCompression::type. Only some entries carry
/// a label, so verify against the enum definition that the table has exactly one
/// entry per enum value, in enum order, and that each trailing comment sits on the
/// entry it describes.
const parquet::CompressionCodec::type IMPALA_TO_PARQUET_CODEC[] = {
parquet::CompressionCodec::UNCOMPRESSED,
parquet::CompressionCodec::SNAPPY, // DEFAULT
parquet::CompressionCodec::GZIP, // GZIP
parquet::CompressionCodec::GZIP, // DEFLATE
parquet::CompressionCodec::SNAPPY,
parquet::CompressionCodec::SNAPPY, // SNAPPY_BLOCKED
parquet::CompressionCodec::LZO,
};
/// Helpers implementing Parquet's plain encoding. The encoding keeps no state between
/// values, so every member is a static function.
/// TODO: templates are used to offer a type-generic interface without paying for
/// virtual calls. This complicates the code and should be removed once codegen can
/// inline virtual calls.
class ParquetPlainEncoder {
 public:
  /// Returns the number of bytes 'v' occupies when plain encoded. The generic version
  /// is simply sizeof(T); types whose encoded size differs are specialized after the
  /// class.
  template <typename T>
  static int ByteSize(const T& v) { return sizeof(T); }

  /// Returns the encoded size in bytes of a value of type 't', or -1 if values of
  /// that type are variable length. The result may differ from the slot size of the
  /// type.
  static int EncodedByteSize(const ColumnType& t) {
    int encoded_size = -1;
    switch (t.type) {
      case TYPE_TINYINT:
      case TYPE_SMALLINT:
      case TYPE_INT:
      case TYPE_FLOAT:
        encoded_size = 4;
        break;
      case TYPE_BIGINT:
      case TYPE_DOUBLE:
        encoded_size = 8;
        break;
      case TYPE_TIMESTAMP:
        encoded_size = 12;
        break;
      case TYPE_DECIMAL:
        encoded_size = DecimalSize(t);
        break;
      case TYPE_STRING:
      case TYPE_VARCHAR:
      case TYPE_CHAR:
        // Variable length: keep -1. CHAR belongs here because its padding is not
        // written to the file.
        break;
      case TYPE_NULL:
      case TYPE_BOOLEAN:
        // These types are not plain encoded; treat them like any unexpected type.
      default:
        DCHECK(false);
        break;
    }
    return encoded_size;
  }

  /// Returns the minimum number of bytes needed to store a decimal of precision
  /// 't.precision', or -1 if the precision is outside [1, 38].
  static int DecimalSize(const ColumnType& t) {
    DCHECK(t.type == TYPE_DECIMAL);
    // Indexed by precision. Each row holds the precisions that need the same number
    // of bytes; the trailing comment is the largest positive value X representable
    // in that many bytes (the most negative value is -(X + 1)).
    // TODO: use closed form for this?
    static const int BYTES_FOR_PRECISION[] = {
      -1,          // precision 0 is not valid
      1, 1,        // 127
      2, 2,        // 32,767
      3, 3,        // 8,388,607
      4, 4, 4,     // 2,147,483,647
      5, 5,        // 549,755,813,887
      6, 6, 6,     // 140,737,488,355,327
      7, 7,        // 36,028,797,018,963,967
      8, 8,        // 9,223,372,036,854,775,807
      9, 9, 9,     // 2,361,183,241,434,822,606,847
      10, 10,      // 604,462,909,807,314,587,353,087
      11, 11, 11,  // 154,742,504,910,672,534,362,390,527
      12, 12,      // 39,614,081,257,132,168,796,771,975,167
      13, 13, 13,  // 10,141,204,801,825,835,211,973,625,643,007
      14, 14,      // 2,596,148,429,267,413,814,265,248,164,610,047
      15, 15,      // 664,613,997,892,457,936,451,903,530,140,172,287
      16, 16, 16,  // 170,141,183,460,469,231,731,687,303,715,884,105,727
    };
    const int max_precision =
        sizeof(BYTES_FOR_PRECISION) / sizeof(BYTES_FOR_PRECISION[0]) - 1;
    if (t.precision >= 1 && t.precision <= max_precision) {
      return BYTES_FOR_PRECISION[t.precision];
    }
    DCHECK(false);
    return -1;
  }

  /// Encodes 't' into 'buffer' and returns the number of bytes written. 'buffer' must
  /// already be allocated and large enough; it does not have to be aligned.
  /// 'fixed_len_size' only matters for data encoded as FIXED_LEN_BYTE_ARRAY, where it
  /// is the number of bytes the encoder should use. The generic version ignores it.
  template <typename T>
  static int Encode(const T& t, int fixed_len_size, uint8_t* buffer) {
    const int num_bytes = ByteSize(t);
    memcpy(buffer, &t, num_bytes);
    return num_bytes;
  }

  /// Decodes one value from 'buffer' into '*v', reading no further than the byte
  /// before 'buffer_end'. 'buffer' does not have to be aligned. For types stored as
  /// FIXED_LEN_BYTE_ARRAY, 'fixed_len_size' is the size of the object; otherwise it
  /// is unused. Returns the number of bytes read, or -1 if the value could not be
  /// decoded.
  template <typename T>
  static int Decode(const uint8_t* buffer, const uint8_t* buffer_end, int fixed_len_size,
      T* v) {
    const int num_bytes = ByteSize(*v);
    const bool truncated = buffer_end - buffer < num_bytes;
    if (UNLIKELY(truncated)) return -1;
    memcpy(v, buffer, num_bytes);
    return num_bytes;
  }
};
/// Calling this with arguments of type ColumnType is certainly a programmer error, so we
/// disallow it. The specialization is declared here without a body, which stops the
/// generic sizeof(T) version from being instantiated for ColumnType.
/// NOTE(review): no definition is visible in this header; presumably none exists, so
/// a call would fail to link -- confirm.
template <> int ParquetPlainEncoder::ByteSize(const ColumnType& t);
/// Disable for bools. Plain encoding is not used for booleans. As above, these are
/// declarations only, so the generic ByteSize/Encode/Decode are not used for bool.
template <> int ParquetPlainEncoder::ByteSize(const bool& b);
template <> int ParquetPlainEncoder::Encode(const bool&, int fixed_len_size, uint8_t*);
template <> int ParquetPlainEncoder::Decode(const uint8_t*, const uint8_t*,
int fixed_len_size, bool* v);
/// Not used for decimals since the plain encoding encodes them using
/// FIXED_LEN_BYTE_ARRAY.
template <>
inline int ParquetPlainEncoder::ByteSize(const Decimal4Value&) {
DCHECK(false);
return -1;
}
template <>
inline int ParquetPlainEncoder::ByteSize(const Decimal8Value&) {
DCHECK(false);
return -1;
}
template <>
inline int ParquetPlainEncoder::ByteSize(const Decimal16Value&) {
DCHECK(false);
return -1;
}
/// Parquet doesn't have 8-bit or 16-bit ints. They are converted to 32-bit.
template <>
inline int ParquetPlainEncoder::ByteSize(const int8_t& v) { return sizeof(int32_t); }
template <>
inline int ParquetPlainEncoder::ByteSize(const int16_t& v) { return sizeof(int32_t); }
template <>
inline int ParquetPlainEncoder::ByteSize(const StringValue& v) {
return sizeof(int32_t) + v.len;
}
template <>
inline int ParquetPlainEncoder::ByteSize(const TimestampValue& v) {
return 12;
}
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
int fixed_len_size, int8_t* v) {
int byte_size = ByteSize(*v);
if (UNLIKELY(buffer_end - buffer < byte_size)) return -1;
*v = *buffer;
return byte_size;
}
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
int fixed_len_size, int16_t* v) {
int byte_size = ByteSize(*v);
if (UNLIKELY(buffer_end - buffer < byte_size)) return -1;
memcpy(v, buffer, sizeof(int16_t));
return byte_size;
}
template <>
inline int ParquetPlainEncoder::Encode(
const int8_t& v, int fixed_len_size, uint8_t* buffer) {
int32_t val = v;
memcpy(buffer, &val, sizeof(int32_t));
return ByteSize(v);
}
template <>
inline int ParquetPlainEncoder::Encode(
const int16_t& v, int fixed_len_size, uint8_t* buffer) {
int32_t val = v;
memcpy(buffer, &val, sizeof(int32_t));
return ByteSize(v);
}
template <>
inline int ParquetPlainEncoder::Encode(
const StringValue& v, int fixed_len_size, uint8_t* buffer) {
memcpy(buffer, &v.len, sizeof(int32_t));
memcpy(buffer + sizeof(int32_t), v.ptr, v.len);
return ByteSize(v);
}
/// Decodes a string stored as a 4-byte length followed by that many bytes of data.
/// On success 'v->ptr' points at the data inside 'buffer' (nothing is copied) and
/// 'v->len' is the stored length, capped at 'fixed_len_size' when that is positive.
/// Returns the number of bytes the value occupies in 'buffer' (length prefix plus the
/// full, uncapped data), or -1 if the length prefix or the data does not fit between
/// 'buffer' and 'buffer_end', the stored length is negative, or the encoded size
/// cannot be represented in the int return type.
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
    int fixed_len_size, StringValue* v) {
  // All size arithmetic is done in signed 64 bits. The length comes straight from
  // the file and cannot be trusted: in 32-bit arithmetic a length close to INT_MAX
  // makes 'prefix + length' wrap negative, which would slip past the bounds check.
  // Keeping the comparison signed also rejects 'buffer_end' < 'buffer'.
  const int64_t bytes_available = buffer_end - buffer;
  const int64_t prefix_size = sizeof(int32_t);
  if (UNLIKELY(bytes_available < prefix_size)) return -1;
  memcpy(&v->len, buffer, sizeof(int32_t));
  if (UNLIKELY(v->len < 0)) return -1;
  const int64_t byte_size = prefix_size + v->len;
  if (UNLIKELY(bytes_available < byte_size
      || byte_size > std::numeric_limits<int>::max())) {
    return -1;
  }
  v->ptr = reinterpret_cast<char*>(const_cast<uint8_t*>(buffer)) + sizeof(int32_t);
  if (fixed_len_size > 0) v->len = std::min(v->len, fixed_len_size);
  // We still consume byte_size bytes, even if the value was truncated above.
  return static_cast<int>(byte_size);
}
/// Write decimals as big endian (byte comparable) to benefit from common prefixes.
/// fixed_len_size can be less than sizeof(Decimal*Value) for space savings. This means
/// that the value in the in-memory format has leading zeros or negative 1's.
/// For example, precision 2 fits in 1 byte. All decimals stored as Decimal4Value
/// will have 3 bytes of leading zeros, we will only store the interesting byte.
template <>
inline int ParquetPlainEncoder::Encode(
const Decimal4Value& v, int fixed_len_size, uint8_t* buffer) {
DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
template <>
inline int ParquetPlainEncoder::Encode(
const Decimal8Value& v, int fixed_len_size, uint8_t* buffer) {
DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
template <>
inline int ParquetPlainEncoder::Encode(
const Decimal16Value& v, int fixed_len_size, uint8_t* buffer) {
DecimalUtil::EncodeToFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
int fixed_len_size, Decimal4Value* v) {
if (UNLIKELY(buffer_end - buffer < fixed_len_size)) return -1;
DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
int fixed_len_size, Decimal8Value* v) {
if (UNLIKELY(buffer_end - buffer < fixed_len_size)) return -1;
DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
template <>
inline int ParquetPlainEncoder::Decode(const uint8_t* buffer, const uint8_t* buffer_end,
int fixed_len_size, Decimal16Value* v) {
if (UNLIKELY(buffer_end - buffer < fixed_len_size)) return -1;
DecimalUtil::DecodeFromFixedLenByteArray(buffer, fixed_len_size, v);
return fixed_len_size;
}
}
#endif