blob: 7dca501890596a02e4d77d793b0b7a2629193e0f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "parquet-column-stats.inline.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include "common/names.h"
namespace impala {
// Maps the name of a comparison function onto the statistics field needed to evaluate
// it: "lt" and "le" select StatsField::MIN, "gt" and "ge" select StatsField::MAX.
// On success stores the field in '*stats_field' and returns true. Any other name hits
// a DCHECK and returns false, leaving '*stats_field' untouched.
bool ColumnStatsReader::GetRequiredStatsField(const string& fn_name,
    StatsField* stats_field) {
  const bool wants_min = (fn_name == "lt") || (fn_name == "le");
  const bool wants_max = !wants_min && ((fn_name == "gt") || (fn_name == "ge"));

  if (!wants_min && !wants_max) {
    DCHECK(false) << "Unsupported function name for statistics evaluation: "
                  << fn_name;
    return false;
  }

  *stats_field = wants_min ? StatsField::MIN : StatsField::MAX;
  return true;
}
// Reads the requested min or max statistic of the column chunk into 'slot'.
// Returns false if the chunk carries no statistics, if no usable value is set for
// 'stats_field', or if decoding the value fails.
//
// For each field the newer 'min_value'/'max_value' entry is preferred and used only if
// CanUseStats() allows it. Otherwise the old 'min'/'max' entry is tried, subject to
// CanUseDeprecatedStats().
bool ColumnStatsReader::ReadFromThrift(StatsField stats_field, void* slot) const {
  const bool has_statistics =
      col_chunk_.__isset.meta_data && col_chunk_.meta_data.__isset.statistics;
  if (!has_statistics) return false;

  const parquet::Statistics& chunk_stats = col_chunk_.meta_data.statistics;

  if (stats_field == StatsField::MIN) {
    if (chunk_stats.__isset.min_value && CanUseStats()) {
      return ReadFromString(stats_field, chunk_stats.min_value, slot);
    }
    // Fall back to the old stats, based on the column type.
    if (chunk_stats.__isset.min && CanUseDeprecatedStats()) {
      return ReadFromString(stats_field, chunk_stats.min, slot);
    }
    return false;
  }

  if (stats_field == StatsField::MAX) {
    if (chunk_stats.__isset.max_value && CanUseStats()) {
      return ReadFromString(stats_field, chunk_stats.max_value, slot);
    }
    // Fall back to the old stats, based on the column type.
    if (chunk_stats.__isset.max && CanUseDeprecatedStats()) {
      return ReadFromString(stats_field, chunk_stats.max, slot);
    }
    return false;
  }

  DCHECK(false) << "Unsupported statistics field requested";
  return false;
}
// Decodes the plain-encoded statistics value 'encoded_value' into 'slot', dispatching
// on the column type 'col_type_.type'. 'stats_field' is only consulted for TIMESTAMP
// columns, where it is forwarded to DecodeTimestamp().
//
// Returns the result of the type-specific decoder, with these additional rejections:
//  - TINYINT/SMALLINT: false if the decoded 32-bit value does not fit the column type.
//  - FLOAT/DOUBLE: false if the decoded value is NaN.
//  - CHAR: always false.
//  - DECIMAL with an unexpected byte size, or any unhandled type: DCHECK, then false.
bool ColumnStatsReader::ReadFromString(StatsField stats_field,
    const string& encoded_value, void* slot) const {
  switch (col_type_.type) {
    case TYPE_BOOLEAN:
      return ColumnStats<bool>::DecodePlainValue(encoded_value, slot,
          parquet::Type::BOOLEAN);
    case TYPE_TINYINT: {
      // parquet::Statistics encodes INT_8 values using 4 bytes.
      int32_t col_stats;
      bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
          parquet::Type::INT32);
      // Reject values outside the int8_t range instead of silently truncating them.
      if (!ret || col_stats < std::numeric_limits<int8_t>::min() ||
          col_stats > std::numeric_limits<int8_t>::max()) {
        return false;
      }
      *static_cast<int8_t*>(slot) = static_cast<int8_t>(col_stats);
      return true;
    }
    case TYPE_SMALLINT: {
      // parquet::Statistics encodes INT_16 values using 4 bytes.
      int32_t col_stats;
      bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
          parquet::Type::INT32);
      // Reject values outside the int16_t range instead of silently truncating them.
      if (!ret || col_stats < std::numeric_limits<int16_t>::min() ||
          col_stats > std::numeric_limits<int16_t>::max()) {
        return false;
      }
      *static_cast<int16_t*>(slot) = static_cast<int16_t>(col_stats);
      return true;
    }
    case TYPE_INT:
      return ColumnStats<int32_t>::DecodePlainValue(encoded_value, slot, element_.type);
    case TYPE_BIGINT:
      return ColumnStats<int64_t>::DecodePlainValue(encoded_value, slot, element_.type);
    case TYPE_FLOAT:
      // IMPALA-6527, IMPALA-6538: ignore min/max stats if NaN
      return ColumnStats<float>::DecodePlainValue(encoded_value, slot, element_.type) &&
          !std::isnan(*static_cast<float*>(slot));
    case TYPE_DOUBLE:
      // IMPALA-6527, IMPALA-6538: ignore min/max stats if NaN
      return ColumnStats<double>::DecodePlainValue(encoded_value, slot, element_.type) &&
          !std::isnan(*static_cast<double*>(slot));
    case TYPE_TIMESTAMP:
      return DecodeTimestamp(encoded_value, stats_field,
          static_cast<TimestampValue*>(slot));
    case TYPE_STRING:
    case TYPE_VARCHAR:
      return ColumnStats<StringValue>::DecodePlainValue(encoded_value, slot,
          element_.type);
    case TYPE_CHAR:
      // We don't read statistics for CHAR columns, since CHAR support is broken in
      // Impala (IMPALA-1652).
      return false;
    case TYPE_DECIMAL:
      switch (col_type_.GetByteSize()) {
        case 4:
          return ColumnStats<Decimal4Value>::DecodePlainValue(encoded_value, slot,
              element_.type);
        case 8:
          return ColumnStats<Decimal8Value>::DecodePlainValue(encoded_value, slot,
              element_.type);
        case 16:
          return ColumnStats<Decimal16Value>::DecodePlainValue(encoded_value, slot,
              element_.type);
      }
      DCHECK(false) << "Unknown decimal byte size: " << col_type_.GetByteSize();
      // Return explicitly: whenever the DCHECK above does not abort (e.g. builds in
      // which DCHECK is compiled out), control would otherwise fall through into the
      // TYPE_DATE case and decode the decimal statistic as a DateValue.
      return false;
    case TYPE_DATE:
      return ColumnStats<DateValue>::DecodePlainValue(encoded_value, slot, element_.type);
    default:
      DCHECK(false) << col_type_.DebugString();
  }
  return false;
}
// Decodes a timestamp statistic from 'stat_value' into '*slot'.
//
// INT96 columns are decoded directly as a TimestampValue. INT64 columns are decoded as
// an int64_t and then converted through 'timestamp_decoder_'. Any other physical type
// hits a DCHECK and returns false.
//
// If the decoder reports that conversion is needed, the decoded value is converted to
// local time with the min- or max-specific routine, chosen by 'stats_field'. Returns
// true only if decoding succeeded and the resulting value has both date and time.
bool ColumnStatsReader::DecodeTimestamp(const std::string& stat_value,
    ColumnStatsReader::StatsField stats_field, TimestampValue* slot) const {
  switch (element_.type) {
    case parquet::Type::INT96: {
      const bool decoded =
          ColumnStats<TimestampValue>::DecodePlainValue(stat_value, slot, element_.type);
      if (!decoded) return false;
      break;
    }
    case parquet::Type::INT64: {
      int64_t raw_value;
      const bool decoded =
          ColumnStats<int64_t>::DecodePlainValue(stat_value, &raw_value, element_.type);
      if (!decoded) return false;
      *slot = timestamp_decoder_.Int64ToTimestampValue(raw_value);
      break;
    }
    default:
      DCHECK(false) << element_.name;
      return false;
  }

  // Only reached when a value was successfully decoded into '*slot'.
  if (timestamp_decoder_.NeedsConversion()) {
    const bool is_min = (stats_field == ColumnStatsReader::StatsField::MIN);
    if (is_min) {
      timestamp_decoder_.ConvertMinStatToLocalTime(slot);
    } else {
      timestamp_decoder_.ConvertMaxStatToLocalTime(slot);
    }
  }
  return slot->HasDateAndTime();
}
bool ColumnStatsReader::ReadNullCountStat(int64_t* null_count) const {
if (!(col_chunk_.__isset.meta_data && col_chunk_.meta_data.__isset.statistics)) {
return false;
}
const parquet::Statistics& stats = col_chunk_.meta_data.statistics;
if (stats.__isset.null_count) {
*null_count = stats.null_count;
return true;
}
return false;
}
// Makes 'value' point at 'buffer': unless 'value->ptr' already equals the buffer's
// start, the buffer is cleared, the 'value->len' bytes at 'value->ptr' are appended to
// it, and 'value->ptr' is redirected to the buffer. Returns the error from Append() if
// it fails (in which case 'value->ptr' is left unchanged), otherwise Status::OK().
Status ColumnStatsBase::CopyToBuffer(StringBuffer* buffer, StringValue* value) {
  const bool already_in_buffer = (value->ptr == buffer->buffer());
  if (!already_in_buffer) {
    buffer->Clear();
    RETURN_IF_ERROR(buffer->Append(value->ptr, value->len));
    value->ptr = buffer->buffer();
  }
  return Status::OK();
}
// Returns true if the 'min_value'/'max_value' statistics may be used for this column.
bool ColumnStatsReader::CanUseStats() const {
  // With a column order present, stats can be used if the column order is
  // TypeDefinedOrder (see parquet.thrift).
  if (col_order_ != nullptr) return col_order_->__isset.TYPE_ORDER;

  // If column order is not set, only statistics for numeric types can be trusted.
  if (col_type_.IsBooleanType()) return true;
  if (col_type_.IsIntegerType()) return true;
  return col_type_.IsFloatingPointType();
}
// Returns true if the old 'min'/'max' statistics may be used for this column: the
// column order must be either unset or TypeDefinedOrder, and the column type must be
// boolean, integer or floating point.
bool ColumnStatsReader::CanUseDeprecatedStats() const {
  // If column order is set to something other than TypeDefinedOrder, we shall not use
  // the stats (see parquet.thrift).
  const bool order_permits_stats =
      (col_order_ == nullptr) || col_order_->__isset.TYPE_ORDER;
  if (!order_permits_stats) return false;

  if (col_type_.IsBooleanType()) return true;
  if (col_type_.IsIntegerType()) return true;
  return col_type_.IsFloatingPointType();
}
} // end ns impala