blob: 442c43c228ead765434631e175c54682556d0272 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "SchemaEvolution.hh"
#include "orc/Exceptions.hh"
#include "orc/Type.hh"
namespace orc {
/**
 * Builds the schema evolution state for reading a file.
 *
 * With a read type, the read/file type pairs are validated and recorded via
 * buildConversion(). Without one, every column id from 0 up to and including
 * the file type's maximum column id is registered in safePPDConversionMap_.
 *
 * @param readType the requested read schema; may be empty
 * @param fileType the schema stored in the file
 */
SchemaEvolution::SchemaEvolution(const std::shared_ptr<Type>& readType, const Type* fileType)
    : readType_(readType) {
  if (!readType_) {
    // No read schema was requested, so every file column is registered as-is.
    const auto lastColumnId = fileType->getMaximumColumnId();
    for (uint64_t columnId = 0; columnId <= lastColumnId; ++columnId) {
      safePPDConversionMap_.insert(columnId);
    }
    return;
  }

  buildConversion(readType_.get(), fileType);
}
/**
 * Looks up the type that should be used when reading the given file column.
 *
 * @param fileType the type of the column as stored in the file
 * @return the entry recorded in readTypeMap_ for the column id of fileType,
 *         or the address of fileType itself when no entry exists
 */
const Type* SchemaEvolution::getReadType(const Type& fileType) const {
  const auto entry = readTypeMap_.find(fileType.getColumnId());
  if (entry != readTypeMap_.cend()) {
    return entry->second;
  }
  return &fileType;
}
/**
 * Reports an unsupported read/file type pair.
 *
 * Always throws SchemaEvolutionError; the message names the file type first
 * and the requested read type second.
 */
inline void invalidConversion(const Type* readType, const Type* fileType) {
  std::string message = "Cannot convert from ";
  message += fileType->toString();
  message += " to ";
  message += readType->toString();
  throw SchemaEvolutionError(message);
}
/**
 * Hash functor that maps a value to std::size_t through a static_cast.
 * The call operator is a template, so it accepts any type that can be
 * static_cast to std::size_t (for example an enum class).
 */
struct EnumClassHash {
  template <typename Enum>
  std::size_t operator()(Enum value) const {
    const auto hashed = static_cast<std::size_t>(value);
    return hashed;
  }
};
/**
 * Tells whether the type's kind is one of BOOLEAN, BYTE, SHORT, INT, LONG,
 * FLOAT or DOUBLE.
 */
bool isNumeric(const Type& type) {
  const auto kind = type.getKind();
  const bool isIntegerLike =
      kind == BOOLEAN || kind == BYTE || kind == SHORT || kind == INT || kind == LONG;
  const bool isFloatingPoint = kind == FLOAT || kind == DOUBLE;
  return isIntegerLike || isFloatingPoint;
}
/**
 * Tells whether the type's kind is one of STRING, CHAR or VARCHAR.
 */
bool isStringVariant(const Type& type) {
  const auto kind = type.getKind();
  if (kind == STRING) {
    return true;
  }
  if (kind == CHAR) {
    return true;
  }
  return kind == VARCHAR;
}
/**
 * Tells whether the type's kind is DECIMAL.
 */
bool isDecimal(const Type& type) {
  return type.getKind() == DECIMAL;
}
/**
 * Tells whether the type's kind is TIMESTAMP or TIMESTAMP_INSTANT.
 */
bool isTimestamp(const Type& type) {
  const auto kind = type.getKind();
  if (kind == TIMESTAMP) {
    return true;
  }
  return kind == TIMESTAMP_INSTANT;
}
// Result of comparing a requested read type with the type stored in the file,
// as produced by checkConversion().
struct ConversionCheckResult {
// true when the file type may be read as the requested read type;
// buildConversion() raises an error when this is false.
bool isValid;
// true when the read type differs from the file type in a way that requires a
// conversion; buildConversion() then records the read type instead of the
// file type for the column.
bool needConvert;
};
/**
 * Decides whether data stored as fileType may be read as readType and whether
 * doing so needs a conversion.
 *
 * Equal kinds are always valid. They need a conversion only for CHAR/VARCHAR
 * with a different maximum length, or DECIMAL with a different precision or
 * scale. For different kinds, validity and the need to convert are always the
 * same value:
 *   - numeric file types convert to numeric, string variant, decimal or
 *     timestamp read types;
 *   - DECIMAL converts to numeric, string variant or timestamp read types;
 *   - string variants convert to string variant, numeric, timestamp or
 *     decimal read types;
 *   - every other file kind is rejected.
 *
 * @param readType the type requested by the reader
 * @param fileType the type stored in the file
 * @return the {isValid, needConvert} pair described above
 */
ConversionCheckResult checkConversion(const Type& readType, const Type& fileType) {
  const auto fileKind = fileType.getKind();

  if (readType.getKind() == fileKind) {
    // Same kind: only the type parameters can force a conversion.
    bool parametersDiffer = false;
    if (fileKind == CHAR || fileKind == VARCHAR) {
      parametersDiffer = readType.getMaximumLength() != fileType.getMaximumLength();
    } else if (fileKind == DECIMAL) {
      parametersDiffer = readType.getPrecision() != fileType.getPrecision() ||
                         readType.getScale() != fileType.getScale();
    }
    return {true, parametersDiffer};
  }

  // Different kinds: a permitted pair is both valid and in need of conversion.
  bool convertible = false;
  switch (fileKind) {
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
      convertible = isNumeric(readType) || isStringVariant(readType) || isDecimal(readType) ||
                    isTimestamp(readType);
      break;
    case DECIMAL:
      convertible = isNumeric(readType) || isStringVariant(readType) || isTimestamp(readType);
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      convertible = isStringVariant(readType) || isNumeric(readType) || isTimestamp(readType) ||
                    isDecimal(readType);
      break;
    // Conversions from these primitive kinds are not supported.
    case TIMESTAMP:
    case TIMESTAMP_INSTANT:
    case DATE:
    case BINARY:
    case GEOMETRY:
    case GEOGRAPHY:
    // Compound kinds never convert to a different kind.
    case STRUCT:
    case LIST:
    case MAP:
    case UNION:
    default:
      break;
  }
  return {convertible, convertible};
}
void SchemaEvolution::buildConversion(const Type* readType, const Type* fileType) {
if (fileType == nullptr) {
throw SchemaEvolutionError("File does not have " + readType->toString());
}
auto [valid, convert] = checkConversion(*readType, *fileType);
if (!valid) {
invalidConversion(readType, fileType);
}
readTypeMap_.emplace(readType->getColumnId(), convert ? readType : fileType);
// check whether PPD conversion is safe
buildSafePPDConversionMap(readType, fileType);
for (uint64_t i = 0; i < readType->getSubtypeCount(); ++i) {
auto subType = readType->getSubtype(i);
if (subType) {
// null subType means that this is a sub column of map/list type
// and it does not exist in the file. simply skip it.
buildConversion(subType, fileType->getTypeByColumnId(subType->getColumnId()));
}
}
}
bool SchemaEvolution::needConvert(const Type& fileType) const {
auto _readType = getReadType(fileType);
if (_readType == &fileType) {
return false;
}
// it does not check valid here as verified by buildConversion()
return checkConversion(*_readType, fileType).needConvert;
}
/**
 * Tells whether the type is primitive, i.e. its kind is none of STRUCT, MAP,
 * LIST or UNION. The pointer must not be null.
 */
inline bool isPrimitive(const Type* type) {
  const auto kind = type->getKind();
  const bool isCompound = kind == STRUCT || kind == MAP || kind == LIST || kind == UNION;
  return !isCompound;
}
void SchemaEvolution::buildSafePPDConversionMap(const Type* readType, const Type* fileType) {
if (readType == nullptr || !isPrimitive(readType) || fileType == nullptr ||
!isPrimitive(fileType)) {
return;
}
bool isSafe = false;
if (readType == fileType) {
// short cut for same type
isSafe = true;
} else if (readType->getKind() == DECIMAL && fileType->getKind() == DECIMAL) {
// for decimals alone do equality check to not mess up with precision change
if (fileType->getPrecision() == readType_->getPrecision() &&
fileType->getScale() == readType_->getScale()) {
isSafe = true;
}
} else {
// only integer and string evolutions are safe
// byte -> short -> int -> long
// string <-> char <-> varchar
// NOTE: Float to double evolution is not safe as floats are stored as
// doubles in ORC's internal index, but when doing predicate evaluation
// for queries like "select * from orc_float where f = 74.72" the constant
// on the filter is converted from string -> double so the precisions will
// be different and the comparison will fail.
// Soon, we should convert all sargs that compare equality between floats
// or doubles to range predicates.
// Similarly string -> char and varchar -> char and vice versa is impossible
// as ORC stores char with padded spaces in its internal index.
switch (fileType->getKind()) {
case BYTE: {
if (readType_->getKind() == SHORT || readType_->getKind() == INT ||
readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case SHORT: {
if (readType_->getKind() == INT || readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case INT: {
if (readType_->getKind() == LONG) {
isSafe = true;
}
break;
}
case STRING: {
if (readType_->getKind() == VARCHAR) {
isSafe = true;
}
break;
}
case VARCHAR: {
if (readType_->getKind() == STRING) {
isSafe = true;
}
break;
}
case BOOLEAN:
case LONG:
case FLOAT:
case DOUBLE:
case BINARY:
case GEOMETRY:
case GEOGRAPHY:
case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
case DECIMAL:
case DATE:
case CHAR:
case TIMESTAMP_INSTANT:
break;
}
}
if (isSafe) {
safePPDConversionMap_.insert(fileType->getColumnId());
}
}
/**
 * Tells whether the given column id has been recorded in
 * safePPDConversionMap_.
 *
 * @param columnId the column id to look up
 * @return true when the id is present in safePPDConversionMap_
 */
bool SchemaEvolution::isSafePPDConversion(uint64_t columnId) const {
  const auto position = safePPDConversionMap_.find(columnId);
  const bool recorded = position != safePPDConversionMap_.cend();
  return recorded;
}
} // namespace orc