blob: c0f88246e8049f1a7e7b0af5074da36db314126b [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ConvertColumnReader.hh"
#include "Utils.hh"
#include <optional>
namespace orc {
// Assume that we are using tight numeric vector batch
// Within this file BooleanVectorBatch is simply another name for ByteVectorBatch,
// so boolean source/destination batches are accessed through the byte batch type.
using BooleanVectorBatch = ByteVectorBatch;
// Sets up the two pieces every converting reader needs: a reader for the type
// stored in the file, and an intermediate batch of that file type which the
// wrapped reader fills before the values are converted to the read type.
ConvertColumnReader::ConvertColumnReader(const Type& readType, const Type& fileType,
                                         StripeStreams& stripe, bool throwOnOverflow)
    : ColumnReader(readType, stripe), readType(readType), throwOnOverflow(throwOnOverflow) {
  // Both the wrapped reader and the intermediate batch use tight numeric vectors.
  constexpr bool kTightNumericVector = true;

  // The wrapped reader is created with overflow throwing and read-type
  // conversion switched off; this class and its subclasses handle both.
  reader = buildReader(fileType, stripe, kTightNumericVector,
                       /*throwOnOverflow=*/false, /*convertToReadType*/ false);

  data = fileType.createRowBatch(0, memoryPool, /*encoded=*/false, kTightNumericVector);
}
// Reads the next `numValues` values of the file type into the intermediate
// batch, then mirrors that batch's capacity, element count and null mask onto
// `rowBatch`. Subclasses call this first and convert the values afterwards.
void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
  reader->next(*data, numValues, notNull);

  rowBatch.resize(data->capacity);
  rowBatch.numElements = data->numElements;
  rowBatch.hasNulls = data->hasNulls;

  char* const targetMask = rowBatch.notNull.data();
  const auto maskBytes = data->notNull.size();
  if (rowBatch.hasNulls) {
    // Carry the source null mask over unchanged.
    memcpy(targetMask, data->notNull.data(), maskBytes);
  } else {
    // No nulls: mark every slot as present.
    memset(targetMask, 1, maskBytes);
  }
}
// Skipping is delegated to the wrapped file-type reader; its result is
// returned unchanged.
uint64_t ConvertColumnReader::skip(uint64_t numValues) {
  const uint64_t skipped = reader->skip(numValues);
  return skipped;
}
// Seeking is delegated to the wrapped file-type reader.
void ConvertColumnReader::seekToRowGroup(
    std::unordered_map<uint64_t, PositionProvider>& positions) {
  auto& wrapped = *reader;
  wrapped.seekToRowGroup(positions);
}
// Returns true when `value` lies in [-2^63, 2^63). NaN and infinities fail
// one of the two comparisons and therefore yield false.
static inline bool canFitInLong(double value) {
  constexpr double kLowerBound = -0x1p63;
  constexpr double kUpperBoundExclusive = 0x1p63;
  if (!(kLowerBound - value < 1.0)) {
    return false;
  }
  return value < kUpperBoundExclusive;
}
// Reports an overflow at slot `idx`: either throws SchemaEvolutionError
// (shouldThrow == true) or marks the slot as null in `dstBatch`.
// FileType/ReadType are only used to build the error message.
template <typename FileType, typename ReadType>
static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
  if (shouldThrow) {
    std::ostringstream message;
    message << "Overflow when convert from " << typeid(FileType).name() << " to "
            << typeid(ReadType).name();
    throw SchemaEvolutionError(message.str());
  }
  dstBatch.notNull.data()[idx] = 0;
  dstBatch.hasNulls = true;
}
// Reports a string that could not be parsed as `typeName` at slot `idx`:
// either throws SchemaEvolutionError (shouldThrow == true) or marks the slot
// as null. A non-empty `expectedFormat` is appended to the error message.
static inline void handleParseFromStringError(ColumnVectorBatch& dstBatch, uint64_t idx,
                                              bool shouldThrow, const std::string& typeName,
                                              const std::string& str,
                                              const std::string& expectedFormat = "") {
  if (shouldThrow) {
    std::ostringstream message;
    message << "Failed to parse " << typeName << " from string:" << str;
    if (!expectedFormat.empty()) {
      message << " the following format \"" << expectedFormat << "\" is expected";
    }
    throw SchemaEvolutionError(message.str());
  }
  dstBatch.notNull.data()[idx] = 0;
  dstBatch.hasNulls = true;
}
// Narrows `inputLong` into `dstValue` (which is always written) and reports
// whether the value survived the round trip; returns false on overflow.
template <typename ReadType>
static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
  dstValue = static_cast<ReadType>(inputLong);
  if constexpr (std::is_same<ReadType, int64_t>::value) {
    // Same width: nothing can be lost.
    return true;
  } else {
    return static_cast<int64_t>(dstValue) == inputLong;
  }
}
// dynamic_cast wrapper that never returns nullptr: a failed cast raises
// InvalidArgument naming the (non-const, non-pointer) destination batch type.
template <typename DestBatchPtrType>
static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) {
  if (auto casted = dynamic_cast<DestBatchPtrType>(batch)) {
    return casted;
  }
  using TargetBatch =
      typename std::remove_const<typename std::remove_pointer<DestBatchPtrType>::type>::type;
  std::ostringstream message;
  message << "Bad cast when convert from ColumnVectorBatch to " << typeid(TargetBatch).name();
  throw InvalidArgument(message.str());
}
// Converts one numeric value from FileType to ReadType.
//
// On overflow the slot `idx` of `destBatch` is either set to null or a
// SchemaEvolutionError is thrown, depending on `shouldThrow`.
//
//  - floating -> floating: plain cast.
//  - floating -> integer : the value must lie in the int64 range and survive
//                          narrowing to ReadType.
//  - integer  -> floating: plain cast (followed by a NaN sanity check).
//  - integer  -> integer : the value must survive narrowing to ReadType.
template <typename ReadType, typename FileType>
static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
                                         ColumnVectorBatch& destBatch, uint64_t idx,
                                         bool shouldThrow) {
  constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
  constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
  if constexpr (isFileTypeFloatingPoint) {
    if constexpr (isReadTypeFloatingPoint) {
      destValue = static_cast<ReadType>(srcValue);
    } else {
      // The cast to int64_t is only evaluated once the range check has passed:
      // converting an out-of-range (or NaN) floating value to an integer is
      // undefined behaviour, so it must not happen unconditionally up front.
      if (!canFitInLong(static_cast<double>(srcValue)) ||
          !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
        handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
      }
    }
  } else {
    if constexpr (isReadTypeFloatingPoint) {
      destValue = static_cast<ReadType>(srcValue);
      if (destValue != destValue) {  // check is NaN
        handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
      }
    } else {
      if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
        handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
      }
    }
  }
}
// { boolean, byte, short, int, long, float, double } ->
// { byte, short, int, long, float, double }
//
// Reads the file-type batch through the base class and converts every present
// element with convertNumericElement, which nulls the slot or throws on
// overflow according to throwOnOverflow.
template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
class NumericConvertColumnReader : public ConvertColumnReader {
 public:
  NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                             bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& target = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);

    // Whether the null mask is consulted is decided once, before conversion:
    // overflow handling may switch hasNulls on while the loop is running.
    const bool consultMask = rowBatch.hasNulls;
    for (uint64_t row = 0; row < rowBatch.numElements; ++row) {
      if (consultMask && !rowBatch.notNull[row]) {
        continue;
      }
      convertNumericElement<ReadType>(source.data[row], target.data[row], rowBatch, row,
                                      throwOnOverflow);
    }
  }
};
// { boolean, byte, short, int, long, float, double } -> { boolean }
//
// A source value becomes 0 when its int64 cast is zero and 1 otherwise;
// there is no overflow case for this conversion.
template <typename FileTypeBatch>
class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
    : public ConvertColumnReader {
 public:
  NumericConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                             bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& target = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);

    const bool consultMask = rowBatch.hasNulls;
    for (uint64_t row = 0; row < rowBatch.numElements; ++row) {
      if (consultMask && !rowBatch.notNull[row]) {
        continue;
      }
      const bool isZero = static_cast<int64_t>(source.data[row]) == 0;
      target.data[row] = (isZero ? 0 : 1);
    }
  }
};
// Common base for converters that produce a string batch. next() (defined out
// of line after this class) asks the subclass to render every value into
// strBuffer and then copies those strings into the destination batch's blob.
class ConvertToStringVariantColumnReader : public ConvertColumnReader {
public:
ConvertToStringVariantColumnReader(const Type& readType, const Type& fileType,
StripeStreams& stripe, bool throwOnOverflow)
: ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
// Implemented by subclasses. The returned value is used by next() as the
// size of the destination blob.
virtual uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
protected:
// Per-row converted strings; next() clears this buffer after copying.
std::vector<std::string> strBuffer;
};
// Reads the source batch, lets the subclass render each value into strBuffer,
// then concatenates the rendered strings into the destination blob and points
// every present row's data/length at its slice of that blob.
void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
                                              char* notNull) {
  ConvertColumnReader::next(rowBatch, numValues, notNull);

  // Render first: the subclass reports how many bytes the blob must hold.
  const auto blobBytes = convertToStrBuffer(rowBatch, numValues);

  auto& target = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
  target.blob.resize(blobBytes);

  char* cursor = target.blob.data();
  for (uint64_t row = 0; row < numValues; ++row) {
    if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
      continue;
    }
    const std::string& text = strBuffer[row];
    const auto size = text.size();
    ::memcpy(cursor, text.c_str(), size);
    target.data[row] = cursor;
    target.length[row] = static_cast<int32_t>(size);
    cursor += size;
  }
  strBuffer.clear();
}
// boolean -> { string, char, varchar }
//
// Values are rendered as "TRUE" / "FALSE". For CHAR and VARCHAR the maximum
// length must be at least 5; for CHAR both literals are space-padded to the
// maximum length.
class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
 public:
  BooleanToStringVariantColumnReader(const Type& readType, const Type& fileType,
                                     StripeStreams& stripe, bool throwOnOverflow)
      : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow),
        trueValue_("TRUE"),
        falseValue_("FALSE") {
    const auto kind = readType.getKind();
    if (kind != CHAR && kind != VARCHAR) {
      return;
    }
    if (readType.getMaximumLength() < 5) {
      throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
                                 std::to_string(readType.getMaximumLength()));
    }
    if (kind == CHAR) {
      trueValue_.resize(readType.getMaximumLength(), ' ');
      falseValue_.resize(readType.getMaximumLength(), ' ');
    }
  }

  uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;

 private:
  std::string trueValue_;
  std::string falseValue_;
};
// Fills strBuffer with the prepared true/false literal for every present row
// and returns the combined length of those strings.
uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
                                                                uint64_t numValues) {
  strBuffer.resize(numValues);
  const auto& source = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());

  uint64_t totalBytes = 0;
  for (uint64_t row = 0; row < numValues; ++row) {
    if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
      continue;
    }
    if (source.data[row]) {
      strBuffer[row] = trueValue_;
    } else {
      strBuffer[row] = falseValue_;
    }
    totalBytes += strBuffer[row].size();
  }
  return totalBytes;
}
// Numeric -> string variant converter, parameterised on the source batch type.
// The conversion itself lives in convertToStrBuffer, defined out of line after
// this class.
template <typename FileTypeBatch>
class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
public:
NumericToStringVariantColumnReader(const Type& readType, const Type& fileType,
StripeStreams& stripe, bool throwOnOverflow)
: ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {}
uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
};
// Renders every present numeric value with std::to_string into strBuffer and
// returns the combined length of the strings that were kept.
//
//  - STRING : no length limit.
//  - VARCHAR: a rendering longer than the maximum length is an overflow.
//  - CHAR   : as VARCHAR, and shorter renderings are space-padded to the
//             maximum length.
// Any other read type raises SchemaEvolutionError.
template <typename FileTypeBatch>
uint64_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
    ColumnVectorBatch& rowBatch, uint64_t numValues) {
  uint64_t totalBytes = 0;
  strBuffer.resize(numValues);
  const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());

  const auto kind = readType.getKind();
  if (kind != STRING && kind != VARCHAR && kind != CHAR) {
    throw SchemaEvolutionError("Invalid type for numeric to string conversion: " +
                               readType.toString());
  }
  const bool lengthLimited = (kind != STRING);
  const bool spacePadded = (kind == CHAR);
  // The maximum length is only queried for the length-limited kinds.
  const uint64_t maxLength = lengthLimited ? readType.getMaximumLength() : 0;

  for (uint64_t i = 0; i < numValues; ++i) {
    if (rowBatch.hasNulls && !rowBatch.notNull[i]) {
      continue;
    }
    std::string& rendered = strBuffer[i];
    rendered = std::to_string(srcBatch.data[i]);
    if (lengthLimited && rendered.size() > maxLength) {
      // Overflowing rows become null (or throw) and add nothing to the total.
      handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
      continue;
    }
    if (spacePadded) {
      rendered.resize(maxLength, ' ');
    }
    totalBytes += rendered.size();
  }
  return totalBytes;
}
template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType>
class NumericToDecimalColumnReader : public ConvertColumnReader {
public:
NumericToDecimalColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
bool throwOnOverflow)
: ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
precision_ = static_cast<int32_t>(readType.getPrecision());
scale_ = static_cast<int32_t>(readType.getScale());
bool overflow = false;
upperBound_ = scaleUpInt128ByPowerOfTen(1, precision_, overflow);
}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertColumnReader::next(rowBatch, numValues, notNull);
const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
dstBatch.precision = precision_;
dstBatch.scale = scale_;
for (uint64_t i = 0; i < numValues; ++i) {
if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
if constexpr (isFloatingFileType) {
convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
} else {
convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
}
}
}
}
private:
template <typename SrcType>
void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
const auto result = convertDecimal(value, precision_, scale_);
Int128 i128 = result.second;
if (result.first) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
return;
}
if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
if (!i128.fitsInLong()) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
} else {
dstBatch.values[idx] = i128.toLong();
}
} else {
dstBatch.values[idx] = i128;
}
}
template <typename SrcType>
void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
int fromScale = 0;
auto result = convertDecimal(value, fromScale, precision_, scale_);
if (result.first) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
} else {
if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
if (!result.second.fitsInLong()) {
handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
} else {
dstBatch.values[idx] = result.second.toLong();
}
} else {
dstBatch.values[idx] = result.second;
}
}
}
int32_t precision_;
int32_t scale_;
int64_t scaleMultiplier_;
Int128 upperBound_;
};
// Common base for converters that produce timestamps. It records which
// timezone applies to the read type and whether a timezone conversion step is
// needed after a value has been converted.
class ConvertToTimestampColumnReader : public ConvertColumnReader {
public:
ConvertToTimestampColumnReader(const Type& readType, const Type& fileType,
StripeStreams& stripe, bool throwOnOverflow)
: ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
isInstant(readType.getKind() == TIMESTAMP_INSTANT),
readerTimezone(isInstant ? &getTimezoneByName("GMT") : &stripe.getReaderTimezone()),
needConvertTimezone(readerTimezone != &getTimezoneByName("GMT")) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
protected:
// NOTE: the initializer list above reads isInstant and readerTimezone while
// initializing later members, so the declaration order below must be kept.
// True when the read type is TIMESTAMP_INSTANT.
const bool isInstant;
// GMT for TIMESTAMP_INSTANT, otherwise the stripe's reader timezone.
const orc::Timezone* readerTimezone;
// True when readerTimezone is not the object returned for "GMT"
// (the comparison is by address, not by name).
const bool needConvertTimezone;
};
// avoid emitting vtable in every translation unit
// Out-of-line definition that simply forwards to ConvertColumnReader::next;
// subclasses perform the actual value conversion.
void ConvertToTimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
char* notNull) {
ConvertColumnReader::next(rowBatch, numValues, notNull);
}
// { integer, floating } -> timestamp
//
// Each present source value is handed to convertToTimestamp, which is defined
// out of line after this class.
template <typename FileTypeBatch>
class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
 public:
  NumericToTimestampColumnReader(const Type& readType, const Type& fileType,
                                 StripeStreams& stripe, bool throwOnOverflow)
      : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
    const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& target = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
    for (uint64_t row = 0; row < numValues; ++row) {
      if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
        continue;
      }
      convertToTimestamp(target, row, source.data[row]);
    }
  }

 private:
  template <typename FileType>
  void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
};
// Converts one numeric value, interpreted as seconds since the epoch, into
// slot `idx` of `dstBatch`.
//
// Floating values are split into whole seconds and nanoseconds; a negative
// fractional part is normalised so that nanoseconds end up in [0, 1e9).
// Values outside [-2^63, 2^63), including NaN, are reported through
// handleOverflow (null or SchemaEvolutionError, depending on throwOnOverflow).
// Integer values are stored as seconds with zero nanoseconds.
// When needConvertTimezone is set, the seconds are finally passed through
// readerTimezone->convertFromUTC.
template <typename FileTypeBatch>
template <typename FileType>
void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
    TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
  if constexpr (std::is_floating_point<FileType>::value) {
    // int64 max is not representable in float/double and rounds up to 2^63,
    // so the upper bound has to be exclusive. The comparison is written so
    // that NaN fails it as well; casting either case to int64_t would be
    // undefined behaviour.
    constexpr FileType kMinSeconds = static_cast<FileType>(-0x1p63);
    constexpr FileType kMaxSecondsExclusive = static_cast<FileType>(0x1p63);
    if (!(value >= kMinSeconds && value < kMaxSecondsExclusive)) {
      handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
      return;
    }
    dstBatch.data[idx] = static_cast<int64_t>(value);
    dstBatch.nanoseconds[idx] = static_cast<int32_t>(
        static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9);
    if (dstBatch.nanoseconds[idx] < 0) {
      // Borrow one second so that the nanosecond part is non-negative.
      dstBatch.data[idx] -= 1;
      dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
    }
  } else {
    dstBatch.data[idx] = value;
    dstBatch.nanoseconds[idx] = 0;
  }
  if (needConvertTimezone) {
    dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]);
  }
}
// decimal -> { byte, short, int, long, float, double }
//
// Integer targets: the decimal is scaled down by its scale and then narrowed
// with convertNumericElement; overflow is nulled or raises depending on
// throwOnOverflow.
// Floating targets: the unscaled value is converted to floating point and
// divided by 10^scale.
template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
class DecimalToNumericColumnReader : public ConvertColumnReader {
 public:
  DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                               bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
    precision_ = static_cast<int32_t>(fileType.getPrecision());
    scale_ = static_cast<int32_t>(fileType.getScale());
    // 10^scale is accumulated in floating point: a 64-bit integer overflows
    // (signed overflow, undefined behaviour) once the scale reaches 19.
    // For smaller scales the double value is exact.
    factor_ = 1.0;
    for (int32_t i = 0; i < scale_; ++i) {
      factor_ *= 10.0;
    }
  }

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
    for (uint64_t i = 0; i < numValues; ++i) {
      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
        if constexpr (std::is_floating_point_v<ReadType>) {
          convertDecimalToDouble(dstBatch, i, srcBatch);
        } else {
          convertDecimalToInteger(dstBatch, i, srcBatch);
        }
      }
    }
  }

 private:
  // Drops the fractional digits, then narrows the integral part to ReadType.
  void convertDecimalToInteger(ReadTypeBatch& dstBatch, uint64_t idx,
                               const FileTypeBatch& srcBatch) {
    using FileType = decltype(srcBatch.values[idx]);
    Int128 result = scaleDownInt128ByPowerOfTen(srcBatch.values[idx], scale_);
    if (!result.fitsInLong()) {
      handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
      return;
    }
    convertNumericElement<ReadType, int64_t>(result.toLong(), dstBatch.data[idx], dstBatch, idx,
                                             throwOnOverflow);
  }

  // Converts the unscaled value to floating point and applies the scale.
  void convertDecimalToDouble(ReadTypeBatch& dstBatch, uint64_t idx,
                              const FileTypeBatch& srcBatch) {
    double doubleValue = Int128(srcBatch.values[idx]).toDouble();
    dstBatch.data[idx] = static_cast<ReadType>(doubleValue) / static_cast<ReadType>(factor_);
  }

  int32_t precision_;
  int32_t scale_;
  double factor_;  // 10^scale_
};
// decimal -> boolean
//
// A decimal whose unscaled value equals 0 becomes 0; every other value
// becomes 1.
template <typename FileTypeBatch>
class DecimalToNumericColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
    : public ConvertColumnReader {
 public:
  DecimalToNumericColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                               bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& target = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
    for (uint64_t row = 0; row < numValues; ++row) {
      if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
        continue;
      }
      target.data[row] = source.values[row] == 0 ? 0 : 1;
    }
  }
};
// decimal -> decimal (different precision/scale and/or storage width)
//
// Each present value is rescaled with convertDecimal from the file scale to
// the read precision/scale. A value that overflows the target precision, or
// that does not fit into a long when the target is a Decimal64 batch, is
// nulled or raises depending on throwOnOverflow.
template <typename FileTypeBatch, typename ReadTypeBatch>
class DecimalConvertColumnReader : public ConvertColumnReader {
 public:
  DecimalConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe,
                             bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {
    fromPrecision_ = static_cast<int32_t>(fileType.getPrecision());
    fromScale_ = static_cast<int32_t>(fileType.getScale());
    toPrecision_ = static_cast<int32_t>(readType.getPrecision());
    toScale_ = static_cast<int32_t>(readType.getScale());
  }

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
    auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
    dstBatch.precision = toPrecision_;
    dstBatch.scale = toScale_;
    for (uint64_t i = 0; i < numValues; ++i) {
      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
        convertDecimalToDecimal(dstBatch, i, srcBatch);
      }
    }
  }

 private:
  void convertDecimalToDecimal(ReadTypeBatch& dstBatch, uint64_t idx,
                               const FileTypeBatch& srcBatch) {
    using FileType = decltype(srcBatch.values[idx]);
    using ReadType = decltype(dstBatch.values[idx]);
    auto [overflows, resultI128] =
        convertDecimal(srcBatch.values[idx], fromScale_, toPrecision_, toScale_);
    if (overflows) {
      handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
      // Stop here, as the other converters do: the slot is already null, so
      // nothing must be stored and the overflow must not be reported twice.
      return;
    }
    if constexpr (std::is_same_v<ReadTypeBatch, Decimal64VectorBatch>) {
      if (!resultI128.fitsInLong()) {
        handleOverflow<FileType, ReadType>(dstBatch, idx, throwOnOverflow);
      } else {
        dstBatch.values[idx] = resultI128.toLong();
      }
    } else {
      dstBatch.values[idx] = resultI128;
    }
  }

  int32_t fromPrecision_;
  int32_t fromScale_;
  int32_t toPrecision_;
  int32_t toScale_;
};
template <typename FileTypeBatch>
class DecimalToTimestampColumnReader : public ConvertToTimestampColumnReader {
public:
DecimalToTimestampColumnReader(const Type& readType, const Type& fileType,
StripeStreams& stripe, bool throwOnOverflow)
: ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow),
precision_(static_cast<int32_t>(fileType.getPrecision())),
scale_(static_cast<int32_t>(fileType.getScale())) {}
void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
ConvertColumnReader::next(rowBatch, numValues, notNull);
const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
convertDecimalToTimestamp(dstBatch, i, srcBatch);
}
}
}
private:
void convertDecimalToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
const FileTypeBatch& srcBatch) {
constexpr int SecondToNanoFactor = 9;
// Following constant comes from java.time.Instant
// '-1000000000-01-01T00:00Z'
constexpr int64_t MIN_EPOCH_SECONDS = -31557014167219200L;
// '1000000000-12-31T23:59:59.999999999Z'
constexpr int64_t MAX_EPOCH_SECONDS = 31556889864403199L;
// dummy variable, there's no risk of overflow
bool overflow = false;
Int128 i128(srcBatch.values[idx]);
Int128 integerPortion = scaleDownInt128ByPowerOfTen(i128, scale_);
if (integerPortion < MIN_EPOCH_SECONDS || integerPortion > MAX_EPOCH_SECONDS) {
handleOverflow<Decimal, int64_t>(dstBatch, idx, throwOnOverflow);
return;
}
i128 -= scaleUpInt128ByPowerOfTen(integerPortion, scale_, overflow);
Int128 fractionPortion = std::move(i128);
if (scale_ < SecondToNanoFactor) {
fractionPortion =
scaleUpInt128ByPowerOfTen(fractionPortion, SecondToNanoFactor - scale_, overflow);
} else {
fractionPortion = scaleDownInt128ByPowerOfTen(fractionPortion, scale_ - SecondToNanoFactor);
}
if (fractionPortion < 0) {
fractionPortion += 1e9;
integerPortion -= 1;
}
// line 630 has guaranteed toLong() will not overflow
dstBatch.data[idx] = integerPortion.toLong();
dstBatch.nanoseconds[idx] = fractionPortion.toLong();
if (needConvertTimezone) {
dstBatch.data[idx] = readerTimezone->convertFromUTC(dstBatch.data[idx]);
}
}
const int32_t precision_;
const int32_t scale_;
};
// decimal -> { string, char, varchar }
//
// Present values are rendered with Int128::toDecimalString. For read types
// other than STRING, every buffer slot longer than the maximum length is
// truncated to it.
template <typename FileTypeBatch>
class DecimalToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
 public:
  DecimalToStringVariantColumnReader(const Type& readType, const Type& fileType,
                                     StripeStreams& stripe, bool throwOnOverflow)
      : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow),
        scale_(fileType.getScale()) {}

  uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override {
    uint64_t totalBytes = 0;
    strBuffer.resize(numValues);
    const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());

    if (readType.getKind() == STRING) {
      for (uint64_t row = 0; row < rowBatch.numElements; ++row) {
        if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
          continue;
        }
        strBuffer[row] = Int128(source.values[row]).toDecimalString(scale_, true);
        totalBytes += strBuffer[row].size();
      }
      return totalBytes;
    }

    const auto maxLength = readType.getMaximumLength();
    for (uint64_t row = 0; row < rowBatch.numElements; ++row) {
      std::string& slot = strBuffer[row];
      if (!rowBatch.hasNulls || rowBatch.notNull[row]) {
        slot = Int128(source.values[row]).toDecimalString(scale_, true);
      }
      // Truncation and size accounting run for every slot, present or not.
      if (slot.size() > maxLength) {
        slot.resize(maxLength);
      }
      totalBytes += slot.size();
    }
    return totalBytes;
  }

 private:
  const int32_t scale_;
};
// { string, char, varchar } -> { boolean, byte, short, int, long, float, double }
//
// Integer targets are parsed with std::stoll and then narrowed; floating
// targets are parsed with std::stof / std::stod. A string that cannot be
// parsed, or an integer that does not fit, is nulled or raises depending on
// throwOnOverflow.
template <typename ReadTypeBatch, typename ReadType>
class StringVariantToNumericColumnReader : public ConvertColumnReader {
 public:
  StringVariantToNumericColumnReader(const Type& readType, const Type& fileType,
                                     StripeStreams& stripe, bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& source = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
    auto& target = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
    for (uint64_t row = 0; row < numValues; ++row) {
      if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
        continue;
      }
      if constexpr (std::is_floating_point_v<ReadType>) {
        convertToDouble(target, source, row);
      } else {
        convertToInteger(target, source, row);
      }
    }
  }

 private:
  void convertToInteger(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch,
                        uint64_t idx) {
    const std::string text(srcBatch.data[idx], srcBatch.length[idx]);
    int64_t parsed = 0;
    try {
      parsed = std::stoll(text);
    } catch (...) {
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Long", text);
      return;
    }
    if constexpr (std::is_same_v<ReadType, bool>) {
      // Any non-zero number counts as true.
      dstBatch.data[idx] = parsed == 0 ? 0 : 1;
    } else {
      const bool fits = downCastToInteger(dstBatch.data[idx], parsed);
      if (!fits) {
        handleOverflow<std::string, ReadType>(dstBatch, idx, throwOnOverflow);
      }
    }
  }

  void convertToDouble(ReadTypeBatch& dstBatch, const StringVectorBatch& srcBatch, uint64_t idx) {
    const std::string text(srcBatch.data[idx], srcBatch.length[idx]);
    try {
      if constexpr (std::is_same_v<ReadType, float>) {
        dstBatch.data[idx] = std::stof(text);
      } else {
        dstBatch.data[idx] = std::stod(text);
      }
    } catch (...) {
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, typeid(readType).name(), text);
    }
  }
};
// { string, char, varchar } -> { string, char, varchar }
//
//  - STRING : the source bytes are copied unchanged.
//  - VARCHAR: the value is cut with Utf8Utils::truncateBytesTo using the
//             maximum length.
//  - CHAR   : as VARCHAR, and values with fewer characters than the maximum
//             length are padded with one space per missing character.
// Any other read type raises SchemaEvolutionError.
class StringVariantConvertColumnReader : public ConvertToStringVariantColumnReader {
 public:
  StringVariantConvertColumnReader(const Type& readType, const Type& fileType,
                                   StripeStreams& stripe, bool throwOnOverflow)
      : ConvertToStringVariantColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override {
    uint64_t size = 0;
    strBuffer.resize(numValues);
    const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
    const auto maxLength = readType.getMaximumLength();
    if (readType.getKind() == STRING) {
      for (uint64_t i = 0; i < numValues; ++i) {
        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
          strBuffer[i] = std::string(srcBatch.data[i], srcBatch.length[i]);
          size += strBuffer[i].size();
        }
      }
    } else if (readType.getKind() == VARCHAR) {
      for (uint64_t i = 0; i < numValues; ++i) {
        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
          const char* charData = srcBatch.data[i];
          uint64_t originLength = srcBatch.length[i];
          uint64_t itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
          strBuffer[i] = std::string(charData, itemLength);
          size += strBuffer[i].length();
        }
      }
    } else if (readType.getKind() == CHAR) {
      for (uint64_t i = 0; i < numValues; ++i) {
        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
          const char* charData = srcBatch.data[i];
          uint64_t originLength = srcBatch.length[i];
          uint64_t charLength = Utf8Utils::charLength(charData, originLength);
          auto itemLength = Utf8Utils::truncateBytesTo(maxLength, charData, originLength);
          strBuffer[i] = std::string(srcBatch.data[i], itemLength);
          // the padding is exactly 1 byte per char
          if (charLength < maxLength) {
            strBuffer[i].resize(itemLength + maxLength - charLength, ' ');
          }
          size += strBuffer[i].length();
        }
      }
    } else {
      // This reader converts between string variants, so the message names
      // that conversion rather than the numeric one.
      throw SchemaEvolutionError("Invalid type for string to string conversion: " +
                                 readType.toString());
    }
    return size;
  }
};
// { string, char, varchar } -> { timestamp, timestamp_instant }
//
// Expected input: "yyyy-mm-dd hh:mm:ss[.fraction]", followed by " timezone"
// when the read type is TIMESTAMP_INSTANT. A string that cannot be parsed is
// nulled or raises SchemaEvolutionError depending on throwOnOverflow.
class StringVariantToTimestampColumnReader : public ConvertToTimestampColumnReader {
 public:
  StringVariantToTimestampColumnReader(const Type& readType, const Type& fileType,
                                       StripeStreams& stripe, bool throwOnOverflow)
      : ConvertToTimestampColumnReader(readType, fileType, stripe, throwOnOverflow) {}

  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
    const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
    auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
    for (uint64_t i = 0; i < numValues; ++i) {
      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
        convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
      }
    }
  }

 private:
  // Algorithm: http://howardhinnant.github.io/date_algorithms.html
  // The algorithm implements a proleptic Gregorian calendar.
  // Returns the number of days between 1970-01-01 and the given civil date.
  int64_t daysFromProlepticGregorianCalendar(int32_t y, int32_t m, int32_t d) {
    y -= m <= 2;
    // Floor division: plain `y / 400` truncates toward zero and would put
    // negative years into the wrong era, breaking the [0, 399] range of yoe.
    const int32_t era = (y >= 0 ? y : y - 399) / 400;
    const int32_t yoe = y - era * 400;                                   // [0, 399]
    const int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;  // [0, 365]
    const int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;           // [0, 146096]
    return 1ll * era * 146097 + doe - 719468;
  }

  // Parses the leading "yyyy-mm-dd hh:mm:ss[.fraction]" part of `timeStr`.
  // Returns {seconds since epoch, nanoseconds}, or std::nullopt when the six
  // date/time fields cannot be read or the fraction has more than 9 digits.
  // Anything after the parsed part is ignored here; a '.' that is not
  // followed by a digit is treated as "no fraction".
  std::optional<std::pair<int64_t, int64_t>> tryBestToParseFromString(
      const std::string& timeStr) {
    int32_t year = 0;
    int32_t month = 0;
    int32_t day = 0;
    int32_t hour = 0;
    int32_t min = 0;
    int32_t sec = 0;
    int consumed = 0;
    const int matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d %2d:%2d:%2d%n", &year, &month,
                                    &day, &hour, &min, &sec, &consumed);
    if (matched != 6) {
      return std::nullopt;
    }

    // The fraction is read digit by digit so that leading zeros keep their
    // weight: ".001" is 1'000'000 ns. Reading it as a plain integer would
    // lose the zeros and turn it into 100'000'000 ns.
    int64_t nanos = 0;
    size_t pos = static_cast<size_t>(consumed);
    if (pos < timeStr.size() && timeStr[pos] == '.') {
      ++pos;
      int32_t digits = 0;
      while (pos < timeStr.size() && timeStr[pos] >= '0' && timeStr[pos] <= '9') {
        if (digits == 9) {
          // Finer than nanosecond precision cannot be represented.
          return std::nullopt;
        }
        nanos = nanos * 10 + (timeStr[pos] - '0');
        ++digits;
        ++pos;
      }
      // Right-pad to nine digits, e.g. ".5" -> 500'000'000 ns.
      for (; digits > 0 && digits < 9; ++digits) {
        nanos *= 10;
      }
    }

    int64_t daysSinceEpoch = daysFromProlepticGregorianCalendar(year, month, day);
    int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + min) + sec;
    return std::make_optional(std::pair<int64_t, int64_t>{secondSinceEpoch, nanos});
  }

  // Parses `timeStr` and stores the result into slot `idx` of `dstBatch`.
  void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
                          const std::string& timeStr) {
    // Expected timestamp_instant format string : yyyy-mm-dd hh:mm:ss[.xxx] timezone
    // Eg. "2019-07-09 13:11:00 America/Los_Angeles"
    // Expected timestamp format string         : yyyy-mm-dd hh:mm:ss[.xxx]
    // Eg. "2019-07-09 13:11:00"
    static const std::string expectedTimestampInstantFormat =
        "yyyy-mm-dd hh:mm:ss[.xxx] timezone";
    static const std::string expectedTimestampFormat = "yyyy-mm-dd hh:mm:ss[.xxx]";
    auto timestamp = tryBestToParseFromString(timeStr);
    if (!timestamp.has_value()) {
      if (!isInstant) {
        handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp", timeStr,
                                   expectedTimestampFormat);
        return;
      }
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
                                 expectedTimestampInstantFormat);
      return;
    }
    auto& [second, nanos] = timestamp.value();
    if (isInstant) {
      // The timezone name is everything after the second space
      // ("date time zone").
      size_t pos = 0;
      pos = timeStr.find(' ', pos) + 1;
      pos = timeStr.find(' ', pos);
      if (pos == std::string::npos) {
        handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
                                   expectedTimestampInstantFormat);
        return;
      }
      pos += 1;
      size_t subStrLength = timeStr.length() - pos;
      try {
        second = getTimezoneByName(timeStr.substr(pos, subStrLength)).convertFromUTC(second);
      } catch (const TimezoneError&) {
        handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Timestamp_Instant", timeStr,
                                   expectedTimestampInstantFormat);
        return;
      }
    } else {
      if (needConvertTimezone) {
        second = readerTimezone->convertFromUTC(second);
      }
    }
    dstBatch.data[idx] = second;
    dstBatch.nanoseconds[idx] = nanos;
  }
};
/**
 * Column reader that reads a string-variant file column and converts every non-null value
 * into a decimal with the precision and scale of the read type.
 *
 * ReadTypeBatch is the destination batch type. For Decimal128VectorBatch the converted
 * Int128 is stored as-is; for any other instantiation the value is stored through
 * Int128::toLong() after a fitsInLong() check.
 */
template <typename ReadTypeBatch>
class StringVariantToDecimalColumnReader : public ConvertColumnReader {
 public:
  StringVariantToDecimalColumnReader(const Type& readType, const Type& fileType,
                                     StripeStreams& stripe, bool throwOnOverflow)
      : ConvertColumnReader(readType, fileType, stripe, throwOnOverflow),
        precision_(static_cast<int32_t>(readType.getPrecision())),
        scale_(static_cast<int32_t>(readType.getScale())) {}

  /**
   * Reads `numValues` rows through the base reader, then converts each row that is not
   * flagged null from its string form into `rowBatch`.
   */
  void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
    ConvertColumnReader::next(rowBatch, numValues, notNull);
    const auto& srcBatch = *SafeCastBatchTo<const StringVectorBatch*>(data.get());
    auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
    for (uint64_t i = 0; i < numValues; ++i) {
      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
        convertToDecimal(dstBatch, i, std::string(srcBatch.data[i], srcBatch.length[i]));
      }
    }
  }

 private:
  // True when every character of `text` is an ASCII digit '0'..'9' (vacuously true for an
  // empty string). An explicit range check is used instead of ::isdigit because passing a
  // negative `char` (any byte >= 0x80 where char is signed) to ::isdigit is undefined
  // behaviour, and because the range check does not depend on the C locale.
  static bool isAllDigits(const std::string& text) {
    return std::all_of(text.begin(), text.end(), [](char c) { return c >= '0' && c <= '9'; });
  }

  /**
   * Parses `decimalStr` of the form [-]digits[.digits] and stores the value, rescaled to
   * (precision_, scale_), at row `idx` of `dstBatch`.
   *
   * Rejected through handleParseFromStringError: empty input, a trailing '.', an empty or
   * non-digit integer part, and more than 38 digits in total. Rejected through
   * handleOverflow: a non-digit fraction, a value convertDecimal reports as overflowing,
   * and (for non-128-bit destinations) a result that does not fit in a long.
   */
  void convertToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, const std::string& decimalStr) {
    constexpr int32_t MAX_PRECISION_128 = 38;
    int32_t fromPrecision = 0;
    int32_t fromScale = 0;
    uint32_t start = 0;
    bool negative = false;
    if (decimalStr.empty()) {
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
      return;
    }

    // Derive the digit count (precision) and fraction length (scale) from the text layout.
    auto dotPos = decimalStr.find('.');
    if (dotPos == std::string::npos) {
      fromScale = 0;
      fromPrecision = static_cast<int32_t>(decimalStr.length());
      dotPos = decimalStr.length();
    } else {
      if (dotPos + 1 == decimalStr.length()) {
        // A '.' with nothing after it is not accepted.
        handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
        return;
      }
      fromPrecision = static_cast<int32_t>(decimalStr.length() - 1);
      fromScale = static_cast<int32_t>(decimalStr.length() - dotPos - 1);
    }
    if (decimalStr.front() == '-') {
      negative = true;
      start++;
      fromPrecision--;
    }

    const std::string integerPortion = decimalStr.substr(start, dotPos - start);
    if (dotPos == start || fromPrecision > MAX_PRECISION_128 || fromPrecision <= 0 ||
        !isAllDigits(integerPortion)) {
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
      return;
    }

    // Build the unscaled magnitude: integerPortion * 10^fromScale + fractionPortion.
    Int128 i128;
    try {
      bool overflow = false;
      i128 = Int128(integerPortion);
      // overflow won't happen: the total digit count was capped at MAX_PRECISION_128 above
      i128 *= scaleUpInt128ByPowerOfTen(Int128(1), fromScale, overflow);
    } catch (const std::exception&) {
      handleParseFromStringError(dstBatch, idx, throwOnOverflow, "Decimal", decimalStr);
      return;
    }
    if (dotPos + 1 < decimalStr.length()) {
      const std::string fractionPortion = decimalStr.substr(dotPos + 1, fromScale);
      if (!isAllDigits(fractionPortion)) {
        // NOTE(review): a malformed fraction is reported as an overflow while a malformed
        // integer part is reported as a parse error; kept as-is to preserve behaviour.
        handleOverflow<std::string, Int128>(dstBatch, idx, throwOnOverflow);
        return;
      }
      i128 += Int128(fractionPortion);
    }

    auto [overflow, result] = convertDecimal(i128, fromScale, precision_, scale_);
    if (overflow) {
      handleOverflow<std::string, Int128>(dstBatch, idx, throwOnOverflow);
      return;
    }
    // The sign is applied last; everything above works on the magnitude.
    if (negative) {
      result.negate();
    }

    if constexpr (std::is_same_v<ReadTypeBatch, Decimal128VectorBatch>) {
      dstBatch.values[idx] = result;
    } else {
      if (!result.fitsInLong()) {
        handleOverflow<std::string, decltype(dstBatch.values[idx])>(dstBatch, idx,
                                                                    throwOnOverflow);
      } else {
        dstBatch.values[idx] = result.toLong();
      }
    }
  }

  const int32_t precision_;  // precision of the read (target) decimal type
  const int32_t scale_;      // scale of the read (target) decimal type
};
// Alias generators. Each macro expands to one or two `using` declarations that name a
// concrete <Source>To<Target>ColumnReader after the reader template instantiated for that
// source/target pair. The generated names are the ones the reader factory pastes together.

// <SRC>To<DST>ColumnReader = NumericConvertColumnReader over the two batch types.
#define DEFINE_NUMERIC_CONVERT_READER(SRC, DST, VALUE_T) \
  using SRC##To##DST##ColumnReader = \
      NumericConvertColumnReader<SRC##VectorBatch, DST##VectorBatch, VALUE_T>;

// <SRC>To<DST>ColumnReader = NumericToStringVariantColumnReader; DST only affects the name.
#define DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(SRC, DST) \
  using SRC##To##DST##ColumnReader = NumericToStringVariantColumnReader<SRC##VectorBatch>;

// <SRC>ToDecimal64ColumnReader and <SRC>ToDecimal128ColumnReader.
#define DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(SRC, SRC_IS_FLOATING) \
  using SRC##ToDecimal64ColumnReader = \
      NumericToDecimalColumnReader<SRC##VectorBatch, Decimal64VectorBatch, SRC_IS_FLOATING>; \
  using SRC##ToDecimal128ColumnReader = \
      NumericToDecimalColumnReader<SRC##VectorBatch, Decimal128VectorBatch, SRC_IS_FLOATING>;

// <SRC>ToTimestampColumnReader.
#define DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(SRC) \
  using SRC##ToTimestampColumnReader = NumericToTimestampColumnReader<SRC##VectorBatch>;

// Decimal64To<DST>ColumnReader and Decimal128To<DST>ColumnReader for a numeric target.
#define DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(DST, VALUE_T) \
  using Decimal64To##DST##ColumnReader = \
      DecimalToNumericColumnReader<Decimal64VectorBatch, DST##VectorBatch, VALUE_T>; \
  using Decimal128To##DST##ColumnReader = \
      DecimalToNumericColumnReader<Decimal128VectorBatch, DST##VectorBatch, VALUE_T>;

// Decimal64To<DST>ColumnReader and Decimal128To<DST>ColumnReader for a decimal target.
#define DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(DST) \
  using Decimal64To##DST##ColumnReader = \
      DecimalConvertColumnReader<Decimal64VectorBatch, DST##VectorBatch>; \
  using Decimal128To##DST##ColumnReader = \
      DecimalConvertColumnReader<Decimal128VectorBatch, DST##VectorBatch>;

// Decimal64ToTimestampColumnReader and Decimal128ToTimestampColumnReader (no arguments).
#define DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER \
  using Decimal64ToTimestampColumnReader = \
      DecimalToTimestampColumnReader<Decimal64VectorBatch>; \
  using Decimal128ToTimestampColumnReader = \
      DecimalToTimestampColumnReader<Decimal128VectorBatch>;

// Decimal64To<DST>ColumnReader and Decimal128To<DST>ColumnReader for a string-variant target.
#define DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(DST) \
  using Decimal64To##DST##ColumnReader = \
      DecimalToStringVariantColumnReader<Decimal64VectorBatch>; \
  using Decimal128To##DST##ColumnReader = \
      DecimalToStringVariantColumnReader<Decimal128VectorBatch>;

// <SRC>To<DST>ColumnReader = StringVariantToNumericColumnReader; SRC only affects the name.
#define DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(SRC, DST, VALUE_T) \
  using SRC##To##DST##ColumnReader = \
      StringVariantToNumericColumnReader<DST##VectorBatch, VALUE_T>;

// <SRC>To<DST>ColumnReader = StringVariantConvertColumnReader for every pair.
#define DEFINE_STRING_VARIANT_CONVERT_READER(SRC, DST) \
  using SRC##To##DST##ColumnReader = StringVariantConvertColumnReader;

// <SRC>To<DST>ColumnReader = StringVariantToTimestampColumnReader for every pair.
#define DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(SRC, DST) \
  using SRC##To##DST##ColumnReader = StringVariantToTimestampColumnReader;

// <SRC>To<DST>ColumnReader = StringVariantToDecimalColumnReader over the target batch type.
#define DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(SRC, DST) \
  using SRC##To##DST##ColumnReader = StringVariantToDecimalColumnReader<DST##VectorBatch>;
// Concrete reader aliases. Every line below declares one (or, for the decimal macros, two)
// <Source>To<Target>ColumnReader alias via the DEFINE_* macros; the reader factory refers
// to these names when it pastes "<Source>To<Target>" with "ColumnReader".
// Widening conversions between boolean/integer types, and Float to Double
DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
// Narrowing conversions between integer/boolean types, and Double to Float
DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
// Floating to integer
DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
// Integer to Floating
DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
// Numeric to String/Char/Varchar
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
// Boolean to String/Char/Varchar: declared directly because it uses its own reader class
// rather than the NumericToStringVariantColumnReader template
using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
// Numeric to Decimal (second argument: whether the source type is floating point)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
// Numeric to Timestamp
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
// Decimal to Numeric
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Boolean, bool)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Byte, int8_t)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Short, int16_t)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Int, int32_t)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Long, int64_t)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Float, float)
DEFINE_DECIMAL_CONVERT_TO_NUMERIC_READER(Double, double)
// Decimal to Decimal
DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal64)
DEFINE_DECIMAL_CONVERT_TO_DECIMAL_READER(Decimal128)
// Decimal to Timestamp
DEFINE_DECIMAL_CONVERT_TO_TIMESTAMP_READER
// Decimal to String/Char/Varchar
DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(String)
DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Char)
DEFINE_DECIMAL_CONVERT_TO_STRING_VARINT_READER(Varchar)
// String variant to numeric
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Boolean, bool)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Byte, int8_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Short, int16_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Int, int32_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Long, int64_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Float, float)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(String, Double, double)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Boolean, bool)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Byte, int8_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Short, int16_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Int, int32_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Long, int64_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Float, float)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Char, Double, double)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Boolean, bool)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Byte, int8_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Short, int16_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Int, int32_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Long, int64_t)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Float, float)
DEFINE_STRING_VARIANT_CONVERT_TO_NUMERIC_READER(Varchar, Double, double)
// String variant to string variant
DEFINE_STRING_VARIANT_CONVERT_READER(String, String)
DEFINE_STRING_VARIANT_CONVERT_READER(String, Char)
DEFINE_STRING_VARIANT_CONVERT_READER(String, Varchar)
DEFINE_STRING_VARIANT_CONVERT_READER(Char, Char)
DEFINE_STRING_VARIANT_CONVERT_READER(Char, String)
DEFINE_STRING_VARIANT_CONVERT_READER(Char, Varchar)
DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, String)
DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Char)
DEFINE_STRING_VARIANT_CONVERT_READER(Varchar, Varchar)
// String variant to timestamp
DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(String, Timestamp)
DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Char, Timestamp)
DEFINE_STRING_VARIANT_CONVERT_TO_TIMESTAMP_READER(Varchar, Timestamp)
// String variant to decimal
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal64)
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(String, Decimal128)
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal64)
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Char, Decimal128)
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal64)
DEFINE_STRING_VARIANT_CONVERT_CONVERT_TO_DECIMAL_READER(Varchar, Decimal128)
// Expands to a complete `return` statement (trailing ';' included) that constructs
// READER_T. The expansion site must have `readType`, `fileType`, `stripe` and
// `throwOnOverflow` in scope.
#define CREATE_READER(READER_T) \
  return std::make_unique<READER_T>(readType, fileType, stripe, throwOnOverflow);

// Expands to `case KIND:` followed by the return of a <CONVERSION>ColumnReader.
#define CASE_CREATE_READER(KIND, CONVERSION) \
  case KIND: \
    CREATE_READER(CONVERSION##ColumnReader)
// Upper precision bound for a decimal type to be classified as Decimal64.
static const int32_t MAX_PRECISION_64 = 18;

// Returns true when `type` reports a precision in the range (0, MAX_PRECISION_64].
static inline bool isDecimal64(const Type& type) {
  const auto precision = type.getPrecision();
  if (!(precision > 0)) {
    return false;
  }
  return precision <= MAX_PRECISION_64;
}
// `case KIND:` for a decimal source column: returns the Decimal64To<DST> reader when the
// file type passes isDecimal64(), otherwise the Decimal128To<DST> reader.
#define CASE_CREATE_FROM_DECIMAL_READER(KIND, DST) \
  case KIND: { \
    if (!isDecimal64(fileType)) { \
      CREATE_READER(Decimal128To##DST##ColumnReader) \
    } \
    CREATE_READER(Decimal64To##DST##ColumnReader) \
  }

// `case DECIMAL:` for a decimal target column: returns the <SRC>ToDecimal64 reader when
// the read type passes isDecimal64(), otherwise the <SRC>ToDecimal128 reader.
#define CASE_CREATE_DECIMAL_READER(SRC) \
  case DECIMAL: { \
    if (!isDecimal64(readType)) { \
      CREATE_READER(SRC##ToDecimal128ColumnReader) \
    } \
    CREATE_READER(SRC##ToDecimal64ColumnReader) \
  }

// `default:` label that rejects the conversion. Because it supplies the default label,
// any `case` labels written immediately before it share the same throw.
#define CASE_EXCEPTION \
  default: \
    throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
                               readType.toString());
/**
 * Creates the converting column reader for a column whose file type differs from the
 * type it is read as.
 *
 * The read type is looked up from the stripe's schema evolution for `fileType`; the
 * function then dispatches on the (file kind, read kind) pair and returns the matching
 * <File>To<Read>ColumnReader.
 *
 * @param fileType the type of the column as stored in the file
 * @param stripe the stripe streams, also the source of the schema evolution
 * @param useTightNumericVector must be true; conversion is only implemented for tight
 *        numeric vector batches
 * @param throwOnOverflow forwarded to the created reader
 * @return the newly created reader
 * @throws SchemaEvolutionError if useTightNumericVector is false or the
 *         (file kind, read kind) pair has no conversion
 */
std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
bool useTightNumericVector,
bool throwOnOverflow) {
if (!useTightNumericVector) {
throw SchemaEvolutionError(
"SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
"option useTightNumericVector");
}
const auto& readType = *stripe.getSchemaEvolution()->getReadType(fileType);
// Outer switch: kind stored in the file. Inner switch: kind requested by the reader.
// The outer cases carry no `break`: every inner switch either returns a reader or
// throws, because CASE_EXCEPTION supplies the inner `default:` label.
// For every source kind, a TIMESTAMP_INSTANT read type is mapped to the same
// ...ToTimestamp reader as TIMESTAMP.
switch (fileType.getKind()) {
// boolean -> *
case BOOLEAN: {
switch (readType.getKind()) {
CASE_CREATE_READER(BYTE, BooleanToByte)
CASE_CREATE_READER(SHORT, BooleanToShort)
CASE_CREATE_READER(INT, BooleanToInt)
CASE_CREATE_READER(LONG, BooleanToLong)
CASE_CREATE_READER(FLOAT, BooleanToFloat)
CASE_CREATE_READER(DOUBLE, BooleanToDouble)
CASE_CREATE_READER(STRING, BooleanToString)
CASE_CREATE_READER(CHAR, BooleanToChar)
CASE_CREATE_READER(VARCHAR, BooleanToVarchar)
CASE_CREATE_DECIMAL_READER(Boolean)
CASE_CREATE_READER(TIMESTAMP, BooleanToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, BooleanToTimestamp)
case BOOLEAN:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// byte -> *
case BYTE: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
CASE_CREATE_READER(SHORT, ByteToShort)
CASE_CREATE_READER(INT, ByteToInt)
CASE_CREATE_READER(LONG, ByteToLong)
CASE_CREATE_READER(FLOAT, ByteToFloat)
CASE_CREATE_READER(DOUBLE, ByteToDouble)
CASE_CREATE_READER(STRING, ByteToString)
CASE_CREATE_READER(CHAR, ByteToChar)
CASE_CREATE_READER(VARCHAR, ByteToVarchar)
CASE_CREATE_DECIMAL_READER(Byte)
CASE_CREATE_READER(TIMESTAMP, ByteToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, ByteToTimestamp)
case BYTE:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// short -> *
case SHORT: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
CASE_CREATE_READER(BYTE, ShortToByte)
CASE_CREATE_READER(INT, ShortToInt)
CASE_CREATE_READER(LONG, ShortToLong)
CASE_CREATE_READER(FLOAT, ShortToFloat)
CASE_CREATE_READER(DOUBLE, ShortToDouble)
CASE_CREATE_READER(STRING, ShortToString)
CASE_CREATE_READER(CHAR, ShortToChar)
CASE_CREATE_READER(VARCHAR, ShortToVarchar)
CASE_CREATE_DECIMAL_READER(Short)
CASE_CREATE_READER(TIMESTAMP, ShortToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, ShortToTimestamp)
case SHORT:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// int -> *
case INT: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, IntToBoolean)
CASE_CREATE_READER(BYTE, IntToByte)
CASE_CREATE_READER(SHORT, IntToShort)
CASE_CREATE_READER(LONG, IntToLong)
CASE_CREATE_READER(FLOAT, IntToFloat)
CASE_CREATE_READER(DOUBLE, IntToDouble)
CASE_CREATE_READER(STRING, IntToString)
CASE_CREATE_READER(CHAR, IntToChar)
CASE_CREATE_READER(VARCHAR, IntToVarchar)
CASE_CREATE_DECIMAL_READER(Int)
CASE_CREATE_READER(TIMESTAMP, IntToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, IntToTimestamp)
case INT:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// long -> *
case LONG: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, LongToBoolean)
CASE_CREATE_READER(BYTE, LongToByte)
CASE_CREATE_READER(SHORT, LongToShort)
CASE_CREATE_READER(INT, LongToInt)
CASE_CREATE_READER(FLOAT, LongToFloat)
CASE_CREATE_READER(DOUBLE, LongToDouble)
CASE_CREATE_READER(STRING, LongToString)
CASE_CREATE_READER(CHAR, LongToChar)
CASE_CREATE_READER(VARCHAR, LongToVarchar)
CASE_CREATE_DECIMAL_READER(Long)
CASE_CREATE_READER(TIMESTAMP, LongToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, LongToTimestamp)
case LONG:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// float -> *
case FLOAT: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, FloatToBoolean)
CASE_CREATE_READER(BYTE, FloatToByte)
CASE_CREATE_READER(SHORT, FloatToShort)
CASE_CREATE_READER(INT, FloatToInt)
CASE_CREATE_READER(LONG, FloatToLong)
CASE_CREATE_READER(DOUBLE, FloatToDouble)
CASE_CREATE_READER(STRING, FloatToString)
CASE_CREATE_READER(CHAR, FloatToChar)
CASE_CREATE_READER(VARCHAR, FloatToVarchar)
CASE_CREATE_DECIMAL_READER(Float)
CASE_CREATE_READER(TIMESTAMP, FloatToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, FloatToTimestamp)
case FLOAT:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// double -> *
case DOUBLE: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, DoubleToBoolean)
CASE_CREATE_READER(BYTE, DoubleToByte)
CASE_CREATE_READER(SHORT, DoubleToShort)
CASE_CREATE_READER(INT, DoubleToInt)
CASE_CREATE_READER(LONG, DoubleToLong)
CASE_CREATE_READER(FLOAT, DoubleToFloat)
CASE_CREATE_READER(STRING, DoubleToString)
CASE_CREATE_READER(CHAR, DoubleToChar)
CASE_CREATE_READER(VARCHAR, DoubleToVarchar)
CASE_CREATE_DECIMAL_READER(Double)
CASE_CREATE_READER(TIMESTAMP, DoubleToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, DoubleToTimestamp)
case DOUBLE:
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// decimal -> *: the source reader variant (Decimal64/Decimal128) is chosen from the
// file type; for decimal -> decimal the target variant is chosen from the read type too
case DECIMAL: {
switch (readType.getKind()) {
CASE_CREATE_FROM_DECIMAL_READER(BOOLEAN, Boolean)
CASE_CREATE_FROM_DECIMAL_READER(BYTE, Byte)
CASE_CREATE_FROM_DECIMAL_READER(SHORT, Short)
CASE_CREATE_FROM_DECIMAL_READER(INT, Int)
CASE_CREATE_FROM_DECIMAL_READER(LONG, Long)
CASE_CREATE_FROM_DECIMAL_READER(FLOAT, Float)
CASE_CREATE_FROM_DECIMAL_READER(DOUBLE, Double)
CASE_CREATE_FROM_DECIMAL_READER(STRING, String)
CASE_CREATE_FROM_DECIMAL_READER(CHAR, Char)
CASE_CREATE_FROM_DECIMAL_READER(VARCHAR, Varchar)
CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP, Timestamp)
CASE_CREATE_FROM_DECIMAL_READER(TIMESTAMP_INSTANT, Timestamp)
case DECIMAL: {
if (isDecimal64(fileType)) {
if (isDecimal64(readType)) {
CREATE_READER(Decimal64ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal64ToDecimal128ColumnReader)
}
} else {
if (isDecimal64(readType)) {
CREATE_READER(Decimal128ToDecimal64ColumnReader)
} else {
CREATE_READER(Decimal128ToDecimal128ColumnReader)
}
}
}
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// string -> *
case STRING: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, StringToBoolean)
CASE_CREATE_READER(BYTE, StringToByte)
CASE_CREATE_READER(SHORT, StringToShort)
CASE_CREATE_READER(INT, StringToInt)
CASE_CREATE_READER(LONG, StringToLong)
CASE_CREATE_READER(FLOAT, StringToFloat)
CASE_CREATE_READER(DOUBLE, StringToDouble)
CASE_CREATE_READER(STRING, StringToString)
CASE_CREATE_READER(CHAR, StringToChar)
CASE_CREATE_READER(VARCHAR, StringToVarchar)
CASE_CREATE_READER(TIMESTAMP, StringToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, StringToTimestamp)
case DECIMAL: {
if (isDecimal64(readType)) {
CREATE_READER(StringToDecimal64ColumnReader)
} else {
CREATE_READER(StringToDecimal128ColumnReader)
}
}
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// char -> *
case CHAR: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, CharToBoolean)
CASE_CREATE_READER(BYTE, CharToByte)
CASE_CREATE_READER(SHORT, CharToShort)
CASE_CREATE_READER(INT, CharToInt)
CASE_CREATE_READER(LONG, CharToLong)
CASE_CREATE_READER(FLOAT, CharToFloat)
CASE_CREATE_READER(DOUBLE, CharToDouble)
CASE_CREATE_READER(STRING, CharToString)
CASE_CREATE_READER(CHAR, CharToChar)
CASE_CREATE_READER(VARCHAR, CharToVarchar)
CASE_CREATE_READER(TIMESTAMP, CharToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, CharToTimestamp)
case DECIMAL: {
if (isDecimal64(readType)) {
CREATE_READER(CharToDecimal64ColumnReader)
} else {
CREATE_READER(CharToDecimal128ColumnReader)
}
}
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// varchar -> *
case VARCHAR: {
switch (readType.getKind()) {
CASE_CREATE_READER(BOOLEAN, VarcharToBoolean)
CASE_CREATE_READER(BYTE, VarcharToByte)
CASE_CREATE_READER(SHORT, VarcharToShort)
CASE_CREATE_READER(INT, VarcharToInt)
CASE_CREATE_READER(LONG, VarcharToLong)
CASE_CREATE_READER(FLOAT, VarcharToFloat)
CASE_CREATE_READER(DOUBLE, VarcharToDouble)
CASE_CREATE_READER(STRING, VarcharToString)
CASE_CREATE_READER(CHAR, VarcharToChar)
CASE_CREATE_READER(VARCHAR, VarcharToVarchar)
CASE_CREATE_READER(TIMESTAMP, VarcharToTimestamp)
CASE_CREATE_READER(TIMESTAMP_INSTANT, VarcharToTimestamp)
case DECIMAL: {
if (isDecimal64(readType)) {
CREATE_READER(VarcharToDecimal64ColumnReader)
} else {
CREATE_READER(VarcharToDecimal128ColumnReader)
}
}
case BINARY:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
CASE_EXCEPTION
}
}
// File kinds with no conversion to any read kind; CASE_EXCEPTION also provides the
// outer `default:` label.
case BINARY:
case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DATE:
case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
} // namespace orc