blob: 38015c4d6399bb88efdcebda790ffe4f98c41700 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef PARQUET_TYPES_H
#define PARQUET_TYPES_H
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <sstream>
#include <string>
#include "arrow/util/compiler-util.h"
#include "parquet/util/visibility.h"
namespace parquet {
// ----------------------------------------------------------------------
// Metadata enums to match Thrift metadata
//
// The reason we maintain our own enums is to avoid transitive dependency on
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
// public API. After building parquet-cpp, you should not need to include
// Thrift headers in your application. This means some boilerplate to convert
// between our types and Parquet's Thrift types.
//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As examples, consider ConvertedType and
// CompressionCodec.
// Mirrors parquet::Type
//
// Physical type identifiers. Every enumerator carries an explicit numeric
// value; the type_traits specializations in this header are keyed on these
// values.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7
  };
};
// Mirrors parquet::ConvertedType
//
// Numeric values are spelled out explicitly. NONE is the first enumerator
// (value 0). Note the gap before NA: values 23 and 24 have no enumerator.
struct LogicalType {
  enum type {
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    NA = 25
  };
};
// Mirrors parquet::FieldRepetitionType
//
// Three repetition kinds with explicit values 0, 1 and 2.
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2
  };
};
// Data encodings. Mirrors parquet::Encoding
//
// Every enumerator has an explicit value. Note that no enumerator uses the
// value 1: the numbering jumps from PLAIN (0) to PLAIN_DICTIONARY (2).
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8
  };
};
// Compression, mirrors parquet::CompressionCodec
//
// Values are consecutive starting at 0 (UNCOMPRESSED).
struct Compression {
  enum type {
    UNCOMPRESSED = 0,
    SNAPPY = 1,
    GZIP = 2,
    LZO = 3,
    BROTLI = 4
  };
};
// parquet::PageType
//
// Values are consecutive starting at 0 (DATA_PAGE).
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3
  };
};
// ----------------------------------------------------------------------
// A (length, pointer) view over a run of bytes.
//
// The struct only stores the pointer: it declares no destructor or copy
// logic, so it does not manage the lifetime of the bytes it refers to.
// A default-constructed ByteArray has len == 0 and ptr == nullptr.
struct ByteArray {
  ByteArray() : len(0), ptr(nullptr) {}
  ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}

  uint32_t len;        // number of bytes referenced by ptr
  const uint8_t* ptr;  // first byte; may be nullptr when len == 0

  // Two arrays are equal when they have the same length and the same bytes.
  bool operator==(const ByteArray& other) const {
    if (this->len != other.len) return false;
    // Zero-length arrays are equal regardless of their pointers. The check
    // also keeps null pointers away from memcmp, which is undefined
    // behaviour even when the byte count is zero.
    if (this->len == 0) return true;
    return 0 == std::memcmp(this->ptr, other.ptr, this->len);
  }

  // Defined as the negation of operator== so the two cannot disagree.
  bool operator!=(const ByteArray& other) const { return !(*this == other); }
};
// Pointer-only view over a fixed-length run of bytes. The length is not
// stored here; functions such as FixedLenByteArrayToString take it as a
// separate argument.
struct FixedLenByteArray {
  const uint8_t* ptr;

  // Default construction yields a null pointer.
  FixedLenByteArray() : ptr{nullptr} {}

  // Wraps an existing byte pointer; explicit to avoid implicit conversion
  // from a raw pointer.
  explicit FixedLenByteArray(const uint8_t* ptr) : ptr{ptr} {}
};

// Short alias for FixedLenByteArray.
using FLBA = FixedLenByteArray;
// 96-bit value stored as three 32-bit words.
// NOTE(review): MANUALLY_ALIGNED_STRUCT / STRUCT_END are defined outside this
// file (presumably in arrow/util/compiler-util.h); they appear to control
// packing and check that the struct is 12 bytes -- confirm against that header.
MANUALLY_ALIGNED_STRUCT(1) Int96 {
  uint32_t value[3];

  // Equal when all three words match.
  bool operator==(const Int96& other) const {
    return value[0] == other.value[0] && value[1] == other.value[1] &&
           value[2] == other.value[2];
  }

  bool operator!=(const Int96& other) const { return !operator==(other); }
};
STRUCT_END(Int96, 12);
// Copies the a.len bytes starting at a.ptr into a std::string, byte for byte.
static inline std::string ByteArrayToString(const ByteArray& a) {
  const char* chars = reinterpret_cast<const char*>(a.ptr);
  std::string text(chars, a.len);
  return text;
}
// Renders the three 32-bit words of `a` in index order as decimal numbers,
// each followed by a single space (so the result ends with a space).
static inline std::string Int96ToString(const Int96& a) {
  std::stringstream out;
  out << a.value[0] << " " << a.value[1] << " " << a.value[2] << " ";
  return out.str();
}
// Renders the first `len` bytes of `a` as unsigned decimal numbers, each
// followed by a single space. A non-positive `len` yields an empty string.
static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
  std::stringstream out;
  const uint8_t* cursor = a.ptr;
  for (int remaining = len; remaining > 0; --remaining, ++cursor) {
    // Widen to uint32_t so the byte prints as a number, not a character.
    out << static_cast<uint32_t>(*cursor) << " ";
  }
  return out.str();
}
// Three-way lexicographic comparison of two byte arrays.
//
// The common prefix is compared with memcmp (bytes are compared as unsigned
// values). If the prefixes are equal, the shorter array orders first.
//
// Returns a negative value if x1 < x2, zero if they are equal, and a positive
// value if x1 > x2.
static inline int ByteCompare(const ByteArray& x1, const ByteArray& x2) {
  const uint32_t common_len = std::min(x1.len, x2.len);
  // Skip memcmp when there is nothing to compare: an empty ByteArray may hold
  // a null pointer, and passing null to memcmp is undefined behaviour even
  // with a zero byte count.
  if (common_len > 0) {
    const int cmp = std::memcmp(x1.ptr, x2.ptr, common_len);
    if (cmp != 0) return cmp;
  }
  if (common_len < x1.len) return 1;
  if (common_len < x2.len) return -1;
  return 0;
}
// Compile-time properties of each physical type, keyed by its Type::type
// value. Each specialization provides:
//   value_type      - the C++ type used to hold one value
//   value_byte_size - a per-type size constant (sizeof the view struct for
//                     BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY)
//   printf_code     - a printf conversion suffix; format_fwf() in this header
//                     prefixes it with "%-<width>"
// The primary template is intentionally empty, so using an unsupported value
// fails to compile when a member is accessed.
template <int TYPE>
struct type_traits {};

template <>
struct type_traits<Type::BOOLEAN> {
  using value_type = bool;
  static constexpr int value_byte_size = 1;
  static constexpr const char* printf_code = "d";
};

template <>
struct type_traits<Type::INT32> {
  using value_type = int32_t;
  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "d";
};

template <>
struct type_traits<Type::INT64> {
  using value_type = int64_t;
  static constexpr int value_byte_size = 8;
  // PRId64 is the portable conversion for int64_t. A hard-coded "ld" is only
  // correct where long is 64 bits wide and is wrong on platforms where long
  // is 32 bits.
  static constexpr const char* printf_code = PRId64;
};

template <>
struct type_traits<Type::INT96> {
  using value_type = Int96;
  static constexpr int value_byte_size = 12;
  static constexpr const char* printf_code = "s";
};

template <>
struct type_traits<Type::FLOAT> {
  using value_type = float;
  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "f";
};

template <>
struct type_traits<Type::DOUBLE> {
  using value_type = double;
  static constexpr int value_byte_size = 8;
  static constexpr const char* printf_code = "lf";
};

template <>
struct type_traits<Type::BYTE_ARRAY> {
  using value_type = ByteArray;
  // Size of the ByteArray view struct itself, not of the bytes it points to.
  static constexpr int value_byte_size = sizeof(ByteArray);
  static constexpr const char* printf_code = "s";
};

template <>
struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
  using value_type = FixedLenByteArray;
  // Size of the FixedLenByteArray view struct itself, not of the pointed-to
  // bytes.
  static constexpr int value_byte_size = sizeof(FixedLenByteArray);
  static constexpr const char* printf_code = "s";
};
// Tag type tying a Type::type value to its C++ value type:
//   type_num - the enum value this tag was instantiated with
//   c_type   - type_traits<TYPE>::value_type
template <Type::type TYPE>
struct DataType {
  using c_type = typename type_traits<TYPE>::value_type;
  static constexpr Type::type type_num = TYPE;
};
// Convenience aliases: one DataType instantiation per physical type.
using BooleanType = DataType<Type::BOOLEAN>;
using Int32Type = DataType<Type::INT32>;
using Int64Type = DataType<Type::INT64>;
using Int96Type = DataType<Type::INT96>;
using FloatType = DataType<Type::FLOAT>;
using DoubleType = DataType<Type::DOUBLE>;
using ByteArrayType = DataType<Type::BYTE_ARRAY>;
using FLBAType = DataType<Type::FIXED_LEN_BYTE_ARRAY>;
// Builds a left-justified, fixed-width printf format specifier for the given
// DataType tag: "%-" + width + the type's printf_code (for example "%-10d").
// The template argument must expose a `type_num` member, as DataType does.
template <typename DType>
inline std::string format_fwf(int width) {
  const char* code = type_traits<DType::type_num>::printf_code;
  std::stringstream spec;
  spec << "%-";
  spec << width;
  spec << code;
  return spec.str();
}
// Exported helpers. Only the declarations are visible in this header; the
// definitions live elsewhere.

// Enum-to-string conversions for the metadata enums declared above.
// NOTE(review): presumably these return the enumerator's name -- confirm
// against the definitions.
PARQUET_EXPORT std::string CompressionToString(Compression::type t);
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t);
PARQUET_EXPORT std::string TypeToString(Type::type t);
// Produces a string for a statistics value `val` of physical type
// `parquet_type`.
// NOTE(review): the expected layout/encoding of `val` is not visible here --
// confirm against the definition.
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);
// NOTE(review): presumably the per-value size in bytes for physical type `t`
// (compare type_traits<...>::value_byte_size) -- confirm against the
// definition.
PARQUET_EXPORT int GetTypeByteSize(Type::type t);
} // namespace parquet
#endif // PARQUET_TYPES_H