blob: a1aafa7dbe3b195edecc80f0d5ae030d299f8269 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ORC_STATISTICS_IMPL_HH
#define ORC_STATISTICS_IMPL_HH
#include "orc/Common.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
#include "Timezone.hh"
#include "TypeImpl.hh"
namespace orc {
/**
 * StatContext contains fields required to compute statistics.
 *
 * Both members are const, so a context is fixed once constructed.
 */
struct StatContext {
  // NOTE(review): presumably tells the readers whether stored statistics can
  // be trusted as-is; the consumers are not visible here - confirm.
  const bool correctStats;
  // Non-owning pointer; nullptr when no writer timezone was supplied.
  const Timezone* const writerTimezone;

  // Default context: correctStats is false and there is no writer timezone.
  StatContext() : StatContext(false, nullptr) {}

  // Context with an explicit flag and an optional writer timezone.
  StatContext(bool cStat, const Timezone* const timezone = nullptr)
      : correctStats{cStat}, writerTimezone{timezone} {}
};
/**
 * Internal Statistics Implementation
 *
 * Plain value holder used by the typed column statistics classes. It keeps a
 * value count, a null flag, and optional minimum / maximum / sum /
 * total-length entries. Each optional entry is paired with a "has" flag that
 * records whether the entry is currently defined; the getters here do not
 * check those flags, callers are expected to.
 */
template <typename T>
class InternalStatisticsImpl {
 private:
  bool _hasNull;
  bool _hasMinimum;
  bool _hasMaximum;
  bool _hasSum;
  bool _hasTotalLength;
  uint64_t _totalLength;
  uint64_t _valueCount;
  T _minimum;
  T _maximum;
  T _sum;

 public:
  /**
   * Creates empty statistics: every flag is false and both counters are 0.
   *
   * _minimum, _maximum and _sum are value-initialized so that reading them
   * before the first set never touches an indeterminate value when T is an
   * arithmetic type.
   */
  InternalStatisticsImpl()
      : _hasNull(false),
        _hasMinimum(false),
        _hasMaximum(false),
        _hasSum(false),
        _hasTotalLength(false),
        _totalLength(0),
        _valueCount(0),
        _minimum(),
        _maximum(),
        _sum() {}

  ~InternalStatisticsImpl() {}

  // GET / SET _totalLength
  bool hasTotalLength() const {
    return _hasTotalLength;
  }

  void setHasTotalLength(bool hasTotalLength) {
    _hasTotalLength = hasTotalLength;
  }

  uint64_t getTotalLength() const {
    return _totalLength;
  }

  void setTotalLength(uint64_t totalLength) {
    _totalLength = totalLength;
  }

  // GET / SET _sum
  bool hasSum() const {
    return _hasSum;
  }

  void setHasSum(bool hasSum) {
    _hasSum = hasSum;
  }

  T getSum() const {
    return _sum;
  }

  void setSum(T sum) {
    _sum = sum;
  }

  // GET / SET _maximum
  bool hasMaximum() const {
    return _hasMaximum;
  }

  const T& getMaximum() const {
    return _maximum;
  }

  void setHasMaximum(bool hasMax) {
    _hasMaximum = hasMax;
  }

  void setMaximum(T max) {
    _maximum = max;
  }

  // GET / SET _minimum
  bool hasMinimum() const {
    return _hasMinimum;
  }

  void setHasMinimum(bool hasMin) {
    _hasMinimum = hasMin;
  }

  const T& getMinimum() const {
    return _minimum;
  }

  void setMinimum(T min) {
    _minimum = min;
  }

  // GET / SET _valueCount
  uint64_t getNumberOfValues() const {
    return _valueCount;
  }

  void setNumberOfValues(uint64_t numValues) {
    _valueCount = numValues;
  }

  // GET / SET _hasNull
  bool hasNull() const {
    return _hasNull;
  }

  void setHasNull(bool hasNull) {
    _hasNull = hasNull;
  }

  /**
   * Clears every flag and zeroes both counters. The stored minimum, maximum
   * and sum values are left as they are; only their flags are dropped.
   */
  void reset() {
    _hasNull = false;
    _hasMinimum = false;
    _hasMaximum = false;
    _hasSum = false;
    _hasTotalLength = false;
    _totalLength = 0;
    _valueCount = 0;
  }

  /**
   * Folds one value into the running minimum / maximum.
   *
   * While no minimum is recorded the value becomes both bounds. Afterwards
   * it replaces the minimum or the maximum when compare() says so.
   * NOTE(review): compare() is declared outside this block; it is used here
   * as a strict "first argument is less than second" test - confirm.
   */
  void updateMinMax(T value) {
    if (!_hasMinimum) {
      _hasMinimum = _hasMaximum = true;
      _minimum = _maximum = value;
    } else if (compare(value, _minimum)) {
      _minimum = value;
    } else if (compare(_maximum, value)) {
      _maximum = value;
    }
  }

  /**
   * Combines another set of statistics into this one: null flags are OR-ed,
   * value counts and total lengths are added, minimum / maximum are widened,
   * and the total length stays defined only if it is defined on both sides.
   */
  // sum is not merged here as we need to check overflow
  void merge(const InternalStatisticsImpl& other) {
    _hasNull = _hasNull || other._hasNull;
    _valueCount += other._valueCount;

    // Only the minimum flags are consulted; a side with a minimum is treated
    // as having a maximum as well.
    if (other._hasMinimum) {
      if (!_hasMinimum) {
        _hasMinimum = _hasMaximum = true;
        _minimum = other._minimum;
        _maximum = other._maximum;
      } else {
        // all template types should support operator<
        if (compare(_maximum, other._maximum)) {
          _maximum = other._maximum;
        }
        if (compare(other._minimum, _minimum)) {
          _minimum = other._minimum;
        }
      }
    }

    _hasTotalLength = _hasTotalLength && other._hasTotalLength;
    _totalLength += other._totalLength;
  }
};
// Concrete instantiations of InternalStatisticsImpl, one alias per kind of
// column statistics. The element type is the type of the stored
// minimum / maximum / sum values.
using InternalCharStatistics = InternalStatisticsImpl<char>;
using InternalBooleanStatistics = InternalStatisticsImpl<char>;
using InternalIntegerStatistics = InternalStatisticsImpl<int64_t>;
using InternalDateStatistics = InternalStatisticsImpl<int32_t>;
using InternalDoubleStatistics = InternalStatisticsImpl<double>;
using InternalDecimalStatistics = InternalStatisticsImpl<Decimal>;
using InternalStringStatistics = InternalStatisticsImpl<std::string>;
using InternalCollectionStatistics = InternalStatisticsImpl<uint64_t>;
/**
* Mutable column statistics for use by the writer.
*
* Abstract interface: every operation below is pure virtual. The column
* statistics implementation classes visible in this header derive from it
* in addition to their read-only statistics interface.
*/
class MutableColumnStatistics {
public:
// Declared here, defined out of line.
virtual ~MutableColumnStatistics();
// Adds `count` to the number of values (see the implementations below).
virtual void increase(uint64_t count) = 0;
// Overwrites the number of values.
virtual void setNumberOfValues(uint64_t value) = 0;
// Overwrites the null flag.
virtual void setHasNull(bool hasNull) = 0;
// Folds `other` into this object. The implementations in this header
// dynamic_cast `other` to their own concrete type, so both sides are
// expected to be of the same kind.
virtual void merge(const MutableColumnStatistics& other) = 0;
// Returns the object to its initial state.
virtual void reset() = 0;
// Writes the current statistics into the given protobuf message.
virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
};
/**
* ColumnStatistics Implementation
*/
class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics {
private:
InternalCharStatistics _stats;
public:
ColumnStatisticsImpl() {
reset();
}
ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~ColumnStatisticsImpl() override;
uint64_t getNumberOfValues() const override {
return _stats.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
_stats.setNumberOfValues(value);
}
void increase(uint64_t count) override {
_stats.setNumberOfValues(_stats.getNumberOfValues() + count);
}
bool hasNull() const override {
return _stats.hasNull();
}
void setHasNull(bool hasNull) override {
_stats.setHasNull(hasNull);
}
void merge(const MutableColumnStatistics& other) override {
_stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
}
void reset() override {
_stats.reset();
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
pbStats.set_hasnull(_stats.hasNull());
pbStats.set_numberofvalues(_stats.getNumberOfValues());
}
std::string toString() const override {
std::ostringstream buffer;
buffer << "Column has " << getNumberOfValues() << " values"
<< " and has null value: " << (hasNull() ? "yes" : "no") << std::endl;
return buffer.str();
}
};
/**
 * Statistics for binary columns: value count, null flag and the total
 * length accumulated through update().
 */
class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics {
 private:
  InternalCharStatistics _stats;

 public:
  // Starts from the cleared state, with a defined total length of 0.
  BinaryColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                             const StatContext& statContext);

  virtual ~BinaryColumnStatisticsImpl() override;

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  bool hasTotalLength() const override {
    return _stats.hasTotalLength();
  }

  // Throws ParseError when the total length is not defined.
  uint64_t getTotalLength() const override {
    if (hasTotalLength()) {
      return _stats.getTotalLength();
    } else {
      throw ParseError("Total length is not defined.");
    }
  }

  // Marks the total length as defined and overwrites it.
  void setTotalLength(uint64_t length) {
    _stats.setHasTotalLength(true);
    _stats.setTotalLength(length);
  }

  // Adds the length of one more value to the running total.
  void update(size_t length) {
    _stats.setTotalLength(_stats.getTotalLength() + length);
  }

  // `other` must be a BinaryColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The total length stays defined only if both sides define it.
  void merge(const MutableColumnStatistics& other) override {
    const BinaryColumnStatisticsImpl& binStats =
        dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
    _stats.merge(binStats._stats);
  }

  void reset() override {
    _stats.reset();
    setTotalLength(0);
  }

  /**
   * Serializes the null flag, the value count and the total length.
   *
   * The sum field is written only while the total length is defined and is
   * cleared otherwise, matching how StringColumnStatisticsImpl serializes
   * its total length. Writing it unconditionally would persist a partial
   * total after a merge with statistics that lack a total length.
   */
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
    if (_stats.hasTotalLength()) {
      binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
    } else {
      binStats->clear_sum();
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Binary" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasTotalLength()) {
      buffer << "Total length: " << getTotalLength() << std::endl;
    } else {
      buffer << "Total length: not defined" << std::endl;
    }
    return buffer.str();
  }
};
/**
 * Statistics for boolean columns: value count, null flag and the number of
 * true values. The false count is derived as value count minus true count.
 */
class BooleanColumnStatisticsImpl : public BooleanColumnStatistics,
                                    public MutableColumnStatistics {
 private:
  InternalBooleanStatistics _stats;
  bool _hasCount;
  uint64_t _trueCount;

 public:
  // Starts from the cleared state, with a defined true count of 0.
  BooleanColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                              const StatContext& statContext);

  virtual ~BooleanColumnStatisticsImpl() override;

  bool hasCount() const override {
    return _hasCount;
  }

  // Adds to the value count; this also marks the true count as defined.
  void increase(uint64_t count) override {
    const uint64_t before = _stats.getNumberOfValues();
    _stats.setNumberOfValues(before + count);
    _hasCount = true;
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when the counts are not defined.
  uint64_t getFalseCount() const override {
    if (!hasCount()) {
      throw ParseError("False count is not defined.");
    }
    return getNumberOfValues() - _trueCount;
  }

  // Throws ParseError when the counts are not defined.
  uint64_t getTrueCount() const override {
    if (!hasCount()) {
      throw ParseError("True count is not defined.");
    }
    return _trueCount;
  }

  // Marks the counts as defined and overwrites the true count.
  void setTrueCount(uint64_t trueCount) {
    _hasCount = true;
    _trueCount = trueCount;
  }

  // Records `repetitions` occurrences of `value`; only true values count.
  void update(bool value, size_t repetitions) {
    if (!value) {
      return;
    }
    _trueCount += repetitions;
  }

  // `other` must be a BooleanColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The counts stay defined only if both sides define them.
  void merge(const MutableColumnStatistics& other) override {
    const BooleanColumnStatisticsImpl& rhs =
        dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);
    _hasCount = _hasCount && rhs._hasCount;
    _trueCount += rhs._trueCount;
  }

  void reset() override {
    _stats.reset();
    setTrueCount(0);
  }

  // Serializes the null flag, the value count and, when defined, the true
  // count as a bucket count.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::BucketStatistics* buckets = pbStats.mutable_bucketstatistics();
    if (!_hasCount) {
      buckets->clear_count();
    } else {
      buckets->add_count(_trueCount);
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream out;
    out << "Data type: Boolean" << std::endl;
    out << "Values: " << getNumberOfValues() << std::endl;
    out << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (!hasCount()) {
      out << "(true: not defined; false: not defined)" << std::endl;
      out << "True and false counts are not defined" << std::endl;
    } else {
      out << "(true: " << getTrueCount() << "; false: " << getFalseCount() << ")" << std::endl;
    }
    return out.str();
  }
};
/**
 * Statistics for date columns: value count, null flag and the minimum and
 * maximum of the int32_t values passed to update().
 */
class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics {
 private:
  InternalDateStatistics _stats;

 public:
  // Starts from the cleared state.
  DateColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);

  virtual ~DateColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  void increase(uint64_t count) override {
    const uint64_t before = _stats.getNumberOfValues();
    _stats.setNumberOfValues(before + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  int32_t getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  int32_t getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Marks the minimum as defined and overwrites it.
  void setMinimum(int32_t minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Marks the maximum as defined and overwrites it.
  void setMaximum(int32_t maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Folds one value into the minimum / maximum.
  void update(int32_t value) {
    _stats.updateMinMax(value);
  }

  // `other` must be a DateColumnStatisticsImpl; dynamic_cast throws otherwise.
  void merge(const MutableColumnStatistics& other) override {
    const DateColumnStatisticsImpl& rhs = dynamic_cast<const DateColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);
  }

  void reset() override {
    _stats.reset();
  }

  // Serializes the null flag, the value count and the bounds. Both bounds
  // are written when a minimum is defined and both are cleared otherwise.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DateStatistics* dateStatistics = pbStats.mutable_datestatistics();
    if (!_stats.hasMinimum()) {
      dateStatistics->clear_minimum();
      dateStatistics->clear_maximum();
    } else {
      dateStatistics->set_maximum(_stats.getMaximum());
      dateStatistics->set_minimum(_stats.getMinimum());
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream out;
    out << "Data type: Date" << std::endl;
    out << "Values: " << getNumberOfValues() << std::endl;
    out << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (!hasMinimum()) {
      out << "Minimum: not defined" << std::endl;
    } else {
      out << "Minimum: " << getMinimum() << std::endl;
    }

    if (!hasMaximum()) {
      out << "Maximum: not defined" << std::endl;
    } else {
      out << "Maximum: " << getMaximum() << std::endl;
    }
    return out.str();
  }
};
/**
 * Statistics for decimal columns: value count, null flag, minimum, maximum
 * and a running sum. The sum becomes undefined once adding to it overflows.
 */
class DecimalColumnStatisticsImpl : public DecimalColumnStatistics,
                                    public MutableColumnStatistics {
 private:
  InternalDecimalStatistics _stats;

 public:
  // Starts from the cleared state, with a defined sum of Decimal().
  DecimalColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                              const StatContext& statContext);

  virtual ~DecimalColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  void increase(uint64_t count) override {
    const uint64_t before = _stats.getNumberOfValues();
    _stats.setNumberOfValues(before + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  Decimal getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  Decimal getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Marks the minimum as defined and overwrites it.
  void setMinimum(Decimal minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Marks the maximum as defined and overwrites it.
  void setMaximum(Decimal maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  Decimal getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Marks the sum as defined and overwrites it.
  void setSum(Decimal sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Folds one value into the bounds and, while a sum is defined, the sum.
  void update(const Decimal& value) {
    _stats.updateMinMax(value);
    if (_stats.hasSum()) {
      updateSum(value);
    }
  }

  // `other` must be a DecimalColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The sum stays defined only if both sides define it and the
  // addition does not overflow.
  void merge(const MutableColumnStatistics& other) override {
    const DecimalColumnStatisticsImpl& rhs =
        dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);

    const bool bothHaveSum = _stats.hasSum() && rhs.hasSum();
    _stats.setHasSum(bothHaveSum);
    if (_stats.hasSum()) {
      updateSum(rhs.getSum());
    }
  }

  void reset() override {
    _stats.reset();
    setSum(Decimal());
  }

  // Serializes the null flag, the value count, the bounds (both written when
  // a minimum is defined, both cleared otherwise) and the sum when defined.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
    if (!_stats.hasMinimum()) {
      decStats->clear_minimum();
      decStats->clear_maximum();
    } else {
      decStats->set_minimum(_stats.getMinimum().toString(true));
      decStats->set_maximum(_stats.getMaximum().toString(true));
    }

    if (!_stats.hasSum()) {
      decStats->clear_sum();
    } else {
      decStats->set_sum(_stats.getSum().toString(true));
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream out;
    out << "Data type: Decimal" << std::endl;
    out << "Values: " << getNumberOfValues() << std::endl;
    out << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (!hasMinimum()) {
      out << "Minimum: not defined" << std::endl;
    } else {
      out << "Minimum: " << getMinimum().toString() << std::endl;
    }

    if (!hasMaximum()) {
      out << "Maximum: not defined" << std::endl;
    } else {
      out << "Maximum: " << getMaximum().toString() << std::endl;
    }

    if (!hasSum()) {
      out << "Sum: not defined" << std::endl;
    } else {
      out << "Sum: " << getSum().toString() << std::endl;
    }
    return out.str();
  }

 private:
  // Adds `value` to the stored sum. Does nothing when no sum is defined.
  // The sum is marked undefined when rescaling reports an overflow or when
  // the addition of two same-signed operands flips the sign.
  void updateSum(Decimal value) {
    if (!_stats.hasSum()) {
      return;
    }

    Decimal total = _stats.getSum();
    bool overflow = false;

    // Rescale whichever operand has the smaller scale so both match.
    if (total.scale > value.scale) {
      value.value = scaleUpInt128ByPowerOfTen(value.value, total.scale - value.scale, overflow);
    } else if (total.scale < value.scale) {
      total.value = scaleUpInt128ByPowerOfTen(total.value, value.scale - total.scale, overflow);
      total.scale = value.scale;
    }

    if (overflow) {
      _stats.setHasSum(false);
      return;
    }

    bool wasPositive = total.value >= 0;
    total.value += value.value;
    // Operands of opposite sign cannot overflow, so only the same-sign case
    // is checked.
    if ((value.value >= 0) == wasPositive) {
      _stats.setHasSum((total.value >= 0) == wasPositive);
    }

    if (_stats.hasSum()) {
      _stats.setSum(total);
    }
  }
};
/**
 * Statistics for floating point columns: value count, null flag, minimum,
 * maximum and a running sum of the values passed to update().
 */
class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics {
 private:
  InternalDoubleStatistics _stats;

 public:
  // Starts from the cleared state, with a defined sum of 0.0.
  DoubleColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);

  virtual ~DoubleColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  void increase(uint64_t count) override {
    const uint64_t before = _stats.getNumberOfValues();
    _stats.setNumberOfValues(before + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  double getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  double getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Marks the minimum as defined and overwrites it.
  void setMinimum(double minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Marks the maximum as defined and overwrites it.
  void setMaximum(double maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  double getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Marks the sum as defined and overwrites it.
  void setSum(double sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Folds one value into the bounds and adds it to the stored sum. The sum
  // is accumulated regardless of whether it is currently flagged as defined.
  void update(double value) {
    _stats.updateMinMax(value);
    const double runningSum = _stats.getSum() + value;
    _stats.setSum(runningSum);
  }

  // `other` must be a DoubleColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The sum stays defined only if both sides define it.
  void merge(const MutableColumnStatistics& other) override {
    const DoubleColumnStatisticsImpl& rhs =
        dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);

    const bool bothHaveSum = _stats.hasSum() && rhs.hasSum();
    _stats.setHasSum(bothHaveSum);
    if (_stats.hasSum()) {
      _stats.setSum(_stats.getSum() + rhs.getSum());
    }
  }

  void reset() override {
    _stats.reset();
    setSum(0.0);
  }

  // Serializes the null flag, the value count, the bounds (both written when
  // a minimum is defined, both cleared otherwise) and the sum when defined.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
    if (!_stats.hasMinimum()) {
      doubleStats->clear_minimum();
      doubleStats->clear_maximum();
    } else {
      doubleStats->set_minimum(_stats.getMinimum());
      doubleStats->set_maximum(_stats.getMaximum());
    }

    if (!_stats.hasSum()) {
      doubleStats->clear_sum();
    } else {
      doubleStats->set_sum(_stats.getSum());
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream out;
    out << "Data type: Double" << std::endl;
    out << "Values: " << getNumberOfValues() << std::endl;
    out << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (!hasMinimum()) {
      out << "Minimum: not defined" << std::endl;
    } else {
      out << "Minimum: " << getMinimum() << std::endl;
    }

    if (!hasMaximum()) {
      out << "Maximum: not defined" << std::endl;
    } else {
      out << "Maximum: " << getMaximum() << std::endl;
    }

    if (!hasSum()) {
      out << "Sum: not defined" << std::endl;
    } else {
      out << "Sum: " << getSum() << std::endl;
    }
    return out.str();
  }
};
/**
 * Statistics for integer columns: value count, null flag, minimum, maximum
 * and a running int64_t sum that is dropped once it can no longer be
 * represented.
 */
class IntegerColumnStatisticsImpl : public IntegerColumnStatistics,
                                    public MutableColumnStatistics {
 private:
  InternalIntegerStatistics _stats;

 public:
  // Starts from the cleared state, with a defined sum of 0.
  IntegerColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);

  virtual ~IntegerColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  void increase(uint64_t count) override {
    const uint64_t before = _stats.getNumberOfValues();
    _stats.setNumberOfValues(before + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  int64_t getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  int64_t getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Marks the minimum as defined and overwrites it.
  void setMinimum(int64_t minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Marks the maximum as defined and overwrites it.
  void setMaximum(int64_t maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  int64_t getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Marks the sum as defined and overwrites it.
  void setSum(int64_t sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Folds `value` into the bounds and, while a sum is defined, adds
  // value * repetitions to it (the multiplication is applied only when
  // repetitions > 1).
  // NOTE(review): multiplyExact / addExact are declared elsewhere; they are
  // used here as "store the result through the pointer and return false on
  // overflow" - confirm.
  void update(int64_t value, int repetitions) {
    _stats.updateMinMax(value);
    if (!_stats.hasSum()) {
      return;
    }

    if (repetitions > 1) {
      _stats.setHasSum(multiplyExact(value, repetitions, &value));
      if (!_stats.hasSum()) {
        return;
      }
    }

    _stats.setHasSum(addExact(_stats.getSum(), value, &value));
    if (_stats.hasSum()) {
      _stats.setSum(value);
    }
  }

  // `other` must be an IntegerColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The sum stays defined only if both sides define it and the
  // addition succeeds.
  void merge(const MutableColumnStatistics& other) override {
    const IntegerColumnStatisticsImpl& rhs =
        dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);

    // update sum and check overflow
    _stats.setHasSum(_stats.hasSum() && rhs.hasSum());
    if (!_stats.hasSum()) {
      return;
    }
    int64_t total;
    _stats.setHasSum(addExact(_stats.getSum(), rhs.getSum(), &total));
    if (_stats.hasSum()) {
      _stats.setSum(total);
    }
  }

  void reset() override {
    _stats.reset();
    setSum(0);
  }

  // Serializes the null flag, the value count, the bounds (both written when
  // a minimum is defined, both cleared otherwise) and the sum when defined.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
    if (!_stats.hasMinimum()) {
      intStats->clear_minimum();
      intStats->clear_maximum();
    } else {
      intStats->set_minimum(_stats.getMinimum());
      intStats->set_maximum(_stats.getMaximum());
    }

    if (!_stats.hasSum()) {
      intStats->clear_sum();
    } else {
      intStats->set_sum(_stats.getSum());
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream out;
    out << "Data type: Integer" << std::endl;
    out << "Values: " << getNumberOfValues() << std::endl;
    out << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (!hasMinimum()) {
      out << "Minimum: not defined" << std::endl;
    } else {
      out << "Minimum: " << getMinimum() << std::endl;
    }

    if (!hasMaximum()) {
      out << "Maximum: not defined" << std::endl;
    } else {
      out << "Maximum: " << getMaximum() << std::endl;
    }

    if (!hasSum()) {
      out << "Sum: not defined" << std::endl;
    } else {
      out << "Sum: " << getSum() << std::endl;
    }
    return out.str();
  }
};
/**
 * Statistics for string columns: value count, null flag, the smallest and
 * largest value seen, and the total length of all values passed to update().
 */
class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics {
 private:
  InternalStringStatistics _stats;

 public:
  // Starts from the cleared state, with a defined total length of 0.
  StringColumnStatisticsImpl() {
    reset();
  }

  // Builds the statistics from a serialized message (defined out of line).
  StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                             const StatContext& statContext);

  virtual ~StringColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasTotalLength() const override {
    return _stats.hasTotalLength();
  }

  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  const std::string& getMinimum() const override {
    if (hasMinimum()) {
      return _stats.getMinimum();
    } else {
      throw ParseError("Minimum is not defined.");
    }
  }

  // Throws ParseError when no maximum is defined.
  const std::string& getMaximum() const override {
    if (hasMaximum()) {
      return _stats.getMaximum();
    } else {
      throw ParseError("Maximum is not defined.");
    }
  }

  // Marks the minimum as defined and overwrites it.
  void setMinimum(std::string minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Marks the maximum as defined and overwrites it.
  void setMaximum(std::string maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when the total length is not defined.
  uint64_t getTotalLength() const override {
    if (hasTotalLength()) {
      return _stats.getTotalLength();
    } else {
      throw ParseError("Total length is not defined.");
    }
  }

  // Marks the total length as defined and overwrites it.
  void setTotalLength(uint64_t length) {
    _stats.setHasTotalLength(true);
    _stats.setTotalLength(length);
  }

  /**
   * Records one value given as a pointer plus length.
   *
   * A non-null value is folded into the minimum / maximum; `length` is added
   * to the total length in every case, including when `value` is nullptr.
   *
   * The comparison uses std::string::compare with an explicit length, so all
   * `length` bytes take part in the ordering. The previous strncmp-based
   * comparison stopped at the first NUL byte and therefore mis-ordered
   * values containing embedded '\0' characters.
   */
  void update(const char* value, size_t length) {
    if (value != nullptr) {
      if (!_stats.hasMinimum()) {
        std::string tempStr(value, length);
        setMinimum(tempStr);
        setMaximum(tempStr);
      } else {
        // update min: the stored minimum is greater than the new value
        if (_stats.getMinimum().compare(0, std::string::npos, value, length) > 0) {
          setMinimum(std::string(value, length));
        }

        // update max: the stored maximum is less than the new value
        if (_stats.getMaximum().compare(0, std::string::npos, value, length) < 0) {
          setMaximum(std::string(value, length));
        }
      }
    }

    _stats.setTotalLength(_stats.getTotalLength() + length);
  }

  // Convenience overload that records a whole std::string.
  void update(std::string value) {
    update(value.c_str(), value.length());
  }

  // `other` must be a StringColumnStatisticsImpl; dynamic_cast throws
  // otherwise. The total length stays defined only if both sides define it.
  void merge(const MutableColumnStatistics& other) override {
    const StringColumnStatisticsImpl& strStats =
        dynamic_cast<const StringColumnStatisticsImpl&>(other);
    _stats.merge(strStats._stats);
  }

  void reset() override {
    _stats.reset();
    setTotalLength(0);
  }

  // Serializes the null flag, the value count, the bounds (both written when
  // a minimum is defined, both cleared otherwise) and the total length when
  // defined.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
    if (_stats.hasMinimum()) {
      strStats->set_minimum(_stats.getMinimum());
      strStats->set_maximum(_stats.getMaximum());
    } else {
      strStats->clear_minimum();
      strStats->clear_maximum();
    }
    if (_stats.hasTotalLength()) {
      strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
    } else {
      strStats->clear_sum();
    }
  }

  // Multi-line human readable summary.
  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: String" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasMinimum()) {
      buffer << "Minimum: " << getMinimum() << std::endl;
    } else {
      buffer << "Minimum is not defined" << std::endl;
    }

    if (hasMaximum()) {
      buffer << "Maximum: " << getMaximum() << std::endl;
    } else {
      buffer << "Maximum is not defined" << std::endl;
    }

    if (hasTotalLength()) {
      buffer << "Total length: " << getTotalLength() << std::endl;
    } else {
      buffer << "Total length is not defined" << std::endl;
    }
    return buffer.str();
  }
};
class TimestampColumnStatisticsImpl : public TimestampColumnStatistics,
public MutableColumnStatistics {
private:
InternalIntegerStatistics _stats;
bool _hasLowerBound;
bool _hasUpperBound;
int64_t _lowerBound;
int64_t _upperBound;
int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
static constexpr int32_t DEFAULT_MIN_NANOS = 0;
static constexpr int32_t DEFAULT_MAX_NANOS = 999999;
public:
TimestampColumnStatisticsImpl() {
reset();
}
TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~TimestampColumnStatisticsImpl() override;
bool hasMinimum() const override {
return _stats.hasMinimum();
}
bool hasMaximum() const override {
return _stats.hasMaximum();
}
uint64_t getNumberOfValues() const override {
return _stats.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
_stats.setNumberOfValues(value);
}
void increase(uint64_t count) override {
_stats.setNumberOfValues(_stats.getNumberOfValues() + count);
}
bool hasNull() const override {
return _stats.hasNull();
}
void setHasNull(bool hasNull) override {
_stats.setHasNull(hasNull);
}
int64_t getMinimum() const override {
if (hasMinimum()) {
return _stats.getMinimum();
} else {
throw ParseError("Minimum is not defined.");
}
}
int64_t getMaximum() const override {
if (hasMaximum()) {
return _stats.getMaximum();
} else {
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(int64_t minimum) {
_stats.setHasMinimum(true);
_stats.setMinimum(minimum);
}
void setMaximum(int64_t maximum) {
_stats.setHasMaximum(true);
_stats.setMaximum(maximum);
}
void update(int64_t value) {
_stats.updateMinMax(value);
}
void update(int64_t milli, int32_t nano) {
if (!_stats.hasMinimum()) {
_stats.setHasMinimum(true);
_stats.setHasMaximum(true);
_stats.setMinimum(milli);
_stats.setMaximum(milli);
_maximumNanos = _minimumNanos = nano;
} else {
if (milli <= _stats.getMinimum()) {
if (milli < _stats.getMinimum() || nano < _minimumNanos) {
_minimumNanos = nano;
}
_stats.setMinimum(milli);
}
if (milli >= _stats.getMaximum()) {
if (milli > _stats.getMaximum() || nano > _maximumNanos) {
_maximumNanos = nano;
}
_stats.setMaximum(milli);
}
}
}
void merge(const MutableColumnStatistics& other) override {
const TimestampColumnStatisticsImpl& tsStats =
dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
_stats.setHasNull(_stats.hasNull() || tsStats.hasNull());
_stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues());
if (tsStats.hasMinimum()) {
if (!_stats.hasMinimum()) {
_stats.setHasMinimum(true);
_stats.setHasMaximum(true);
_stats.setMinimum(tsStats.getMinimum());
_stats.setMaximum(tsStats.getMaximum());
_minimumNanos = tsStats.getMinimumNanos();
_maximumNanos = tsStats.getMaximumNanos();
} else {
if (tsStats.getMaximum() >= _stats.getMaximum()) {
if (tsStats.getMaximum() > _stats.getMaximum() ||
tsStats.getMaximumNanos() > _maximumNanos) {
_maximumNanos = tsStats.getMaximumNanos();
}
_stats.setMaximum(tsStats.getMaximum());
}
if (tsStats.getMinimum() <= _stats.getMinimum()) {
if (tsStats.getMinimum() < _stats.getMinimum() ||
tsStats.getMinimumNanos() < _minimumNanos) {
_minimumNanos = tsStats.getMinimumNanos();
}
_stats.setMinimum(tsStats.getMinimum());
}
}
}
}
void reset() override {
_stats.reset();
_minimumNanos = DEFAULT_MIN_NANOS;
_maximumNanos = DEFAULT_MAX_NANOS;
}
/**
 * Serialises these statistics into `pbStats`.
 *
 * The null flag and value count are always written. When a minimum is
 * present, the UTC minimum and maximum are written, and each nano member is
 * written (as value + 1) only if it differs from its DEFAULT_* constant.
 * Otherwise all four timestamp fields are cleared.
 * @param pbStats protobuf message to populate
 */
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  pbStats.set_hasnull(_stats.hasNull());
  pbStats.set_numberofvalues(_stats.getNumberOfValues());

  proto::TimestampStatistics* const pbTimestamp = pbStats.mutable_timestampstatistics();

  if (!_stats.hasMinimum()) {
    // No range recorded: make sure no stale bounds remain in the message.
    pbTimestamp->clear_minimumutc();
    pbTimestamp->clear_maximumutc();
    pbTimestamp->clear_minimumnanos();
    pbTimestamp->clear_maximumnanos();
    return;
  }

  pbTimestamp->set_minimumutc(_stats.getMinimum());
  pbTimestamp->set_maximumutc(_stats.getMaximum());

  // Nanos are stored off by one in the message (member value + 1).
  if (_minimumNanos != DEFAULT_MIN_NANOS) {
    pbTimestamp->set_minimumnanos(_minimumNanos + 1);
  }
  if (_maximumNanos != DEFAULT_MAX_NANOS) {
    pbTimestamp->set_maximumnanos(_maximumNanos + 1);
  }
}
/**
 * Renders the statistics as multi-line, human-readable text.
 *
 * Minimum, LowerBound, Maximum and UpperBound are treated as milliseconds:
 * the whole seconds (value / 1000) are formatted in UTC via gmtime_r as
 * "YYYY-MM-DD HH:MM:SS", followed by "." and value % 1000. A bound that is
 * not present is reported as "<Name> is not defined".
 *
 * NOTE(review): the fractional part is streamed as a plain integer, so it
 * is neither zero-padded nor sign-normalised.
 * @return the formatted description
 */
std::string toString() const override {
  std::ostringstream buffer;
  struct tm tmValue;
  char timeBuffer[20];
  time_t secs = 0;

  // Appends one "<label><date time>.<millis>" line; the scratch variables
  // above are shared between all invocations.
  auto appendMillis = [&](const char* label, int64_t millis) {
    secs = static_cast<time_t>(millis / 1000);
    gmtime_r(&secs, &tmValue);
    strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
    buffer << label << timeBuffer << "." << (millis % 1000) << std::endl;
  };

  buffer << "Data type: Timestamp" << std::endl
         << "Values: " << getNumberOfValues() << std::endl
         << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

  if (!hasMinimum()) {
    buffer << "Minimum is not defined" << std::endl;
  } else {
    appendMillis("Minimum: ", getMinimum());
  }

  if (!hasLowerBound()) {
    buffer << "LowerBound is not defined" << std::endl;
  } else {
    appendMillis("LowerBound: ", getLowerBound());
  }

  if (!hasMaximum()) {
    buffer << "Maximum is not defined" << std::endl;
  } else {
    appendMillis("Maximum: ", getMaximum());
  }

  if (!hasUpperBound()) {
    buffer << "UpperBound is not defined" << std::endl;
  } else {
    appendMillis("UpperBound: ", getUpperBound());
  }

  return buffer.str();
}
/**
 * Reports whether a lower bound is available.
 * @return the value of the _hasLowerBound flag
 */
bool hasLowerBound() const override {
return _hasLowerBound;
}
/**
 * Reports whether an upper bound is available.
 * @return the value of the _hasUpperBound flag
 */
bool hasUpperBound() const override {
return _hasUpperBound;
}
/**
 * Returns the stored lower bound.
 * @return the value of _lowerBound
 * @throws ParseError if hasLowerBound() is false
 */
int64_t getLowerBound() const override {
  if (!hasLowerBound()) {
    throw ParseError("LowerBound is not defined.");
  }
  return _lowerBound;
}
/**
 * Returns the stored upper bound.
 * @return the value of _upperBound
 * @throws ParseError if hasUpperBound() is false
 */
int64_t getUpperBound() const override {
  if (!hasUpperBound()) {
    throw ParseError("UpperBound is not defined.");
  }
  return _upperBound;
}
/**
 * Returns the nano member associated with the minimum.
 * @return the value of _minimumNanos
 * @throws ParseError if hasMinimum() is false
 */
int32_t getMinimumNanos() const override {
  if (!hasMinimum()) {
    throw ParseError("Minimum is not defined.");
  }
  return _minimumNanos;
}
/**
 * Returns the nano member associated with the maximum.
 * @return the value of _maximumNanos
 * @throws ParseError if hasMaximum() is false
 */
int32_t getMaximumNanos() const override {
  if (!hasMaximum()) {
    throw ParseError("Maximum is not defined.");
  }
  return _maximumNanos;
}
};
/**
 * Mutable statistics for collection (LIST | MAP) columns.
 *
 * Wraps an InternalCollectionStatistics and exposes its minimum, maximum
 * and sum as the minimum, maximum and total number of children. The sum is
 * flagged as undefined as soon as an unsigned overflow is detected.
 */
class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
                                       public MutableColumnStatistics {
 private:
  InternalCollectionStatistics _stats;

 public:
  /** Starts from the reset state (total children defined and equal to 0). */
  CollectionColumnStatisticsImpl() {
    reset();
  }
  CollectionColumnStatisticsImpl(const proto::ColumnStatistics& stats);
  virtual ~CollectionColumnStatisticsImpl() override;

  // ---- presence flags -----------------------------------------------------

  bool hasMinimumChildren() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximumChildren() const override {
    return _stats.hasMaximum();
  }

  bool hasTotalChildren() const override {
    return _stats.hasSum();
  }

  // ---- value count and null flag -----------------------------------------

  /** Adds `count` to the number of values. */
  void increase(uint64_t count) override {
    const uint64_t current = _stats.getNumberOfValues();
    _stats.setNumberOfValues(current + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // ---- getters; each throws ParseError when its value is not defined -----

  uint64_t getMinimumChildren() const override {
    if (!hasMinimumChildren()) {
      throw ParseError("MinimumChildren is not defined.");
    }
    return _stats.getMinimum();
  }

  uint64_t getMaximumChildren() const override {
    if (!hasMaximumChildren()) {
      throw ParseError("MaximumChildren is not defined.");
    }
    return _stats.getMaximum();
  }

  uint64_t getTotalChildren() const override {
    if (!hasTotalChildren()) {
      throw ParseError("TotalChildren is not defined.");
    }
    return _stats.getSum();
  }

  // ---- setters; each also marks the corresponding value as defined -------

  void setMinimumChildren(uint64_t minimum) override {
    _stats.setMinimum(minimum);
    _stats.setHasMinimum(true);
  }

  void setMaximumChildren(uint64_t maximum) override {
    _stats.setMaximum(maximum);
    _stats.setHasMaximum(true);
  }

  void setTotalChildren(uint64_t sum) override {
    _stats.setSum(sum);
    _stats.setHasSum(true);
  }

  void setHasTotalChildren(bool hasSum) override {
    _stats.setHasSum(hasSum);
  }

  /**
   * Merges another collection statistics object into this one.
   * @param other must be a CollectionColumnStatisticsImpl, otherwise the
   *        reference dynamic_cast throws std::bad_cast
   */
  void merge(const MutableColumnStatistics& other) override {
    const auto& rhs = dynamic_cast<const CollectionColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);

    // A defined sum means "no overflow so far"; both sides must agree.
    _stats.setHasSum(_stats.hasSum() && rhs.hasTotalChildren());
    if (!_stats.hasSum()) {
      return;
    }

    const uint64_t before = _stats.getSum();
    _stats.setSum(before + rhs.getTotalChildren());
    if (_stats.getSum() < before) {
      // Unsigned wrap-around: the total is no longer meaningful.
      _stats.setHasSum(false);
    }
  }

  /** Clears the underlying statistics, then defines the total as 0. */
  void reset() override {
    _stats.reset();
    setTotalChildren(0);
  }

  /**
   * Accounts for one collection with `value` children: updates the
   * minimum/maximum and, while the sum is still defined, adds to it.
   */
  void update(uint64_t value) {
    _stats.updateMinMax(value);
    if (!_stats.hasSum()) {
      return;
    }

    const uint64_t before = _stats.getSum();
    _stats.setSum(before + value);
    if (_stats.getSum() < before) {
      // Unsigned wrap-around: the total is no longer meaningful.
      _stats.setHasSum(false);
    }
  }

  /** Serialises these statistics into `pbStats`. */
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::CollectionStatistics* const pbCollection = pbStats.mutable_collectionstatistics();

    if (!_stats.hasMinimum()) {
      pbCollection->clear_minchildren();
      pbCollection->clear_maxchildren();
    } else {
      pbCollection->set_minchildren(_stats.getMinimum());
      pbCollection->set_maxchildren(_stats.getMaximum());
    }

    if (!_stats.hasSum()) {
      pbCollection->clear_totalchildren();
    } else {
      pbCollection->set_totalchildren(_stats.getSum());
    }
  }

  /** Renders the statistics as multi-line, human-readable text. */
  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Collection(LIST|MAP)" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (!hasMinimumChildren()) {
      buffer << "MinChildren is not defined" << std::endl;
    } else {
      buffer << "MinChildren: " << getMinimumChildren() << std::endl;
    }

    if (!hasMaximumChildren()) {
      buffer << "MaxChildren is not defined" << std::endl;
    } else {
      buffer << "MaxChildren: " << getMaximumChildren() << std::endl;
    }

    if (!hasTotalChildren()) {
      buffer << "TotalChildren is not defined" << std::endl;
    } else {
      buffer << "TotalChildren: " << getTotalChildren() << std::endl;
    }

    return buffer.str();
  }
};
/**
 * Converts protobuf column statistics into a ColumnStatistics object.
 * Only the declaration is visible here; the definition lives elsewhere.
 * @param s protobuf column statistics to convert
 * @param statContext context fields required to compute statistics
 * @return raw pointer to the resulting statistics
 *         (NOTE(review): presumably owned by the caller; confirm against
 *         the definition)
 */
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
/**
 * Statistics implementation holding one ColumnStatistics pointer per column.
 *
 * Instances are built from either a stripe's statistics or the file footer
 * (constructors defined out of line) and cannot be copied.
 * NOTE(review): colStats stores raw pointers; presumably the out-of-line
 * destructor releases them. Confirm against the definition.
 */
class StatisticsImpl : public Statistics {
 private:
  std::vector<ColumnStatistics*> colStats;

  // Non-copyable: deleted explicitly so that any accidental copy, including
  // one from inside the class, is rejected at compile time instead of
  // surfacing as a link error.
  StatisticsImpl(const StatisticsImpl&) = delete;
  StatisticsImpl& operator=(const StatisticsImpl&) = delete;

 public:
  StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext);
  StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);

  /**
   * Returns the statistics of one column.
   * @param columnId index of the column, in [0, getNumberOfColumns())
   * @return the stored statistics pointer for that column
   * @throws std::out_of_range if columnId is not a valid index
   */
  virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
    // at() turns an invalid id into an exception rather than an
    // out-of-bounds read.
    return colStats.at(columnId);
  }

  virtual ~StatisticsImpl() override;

  /** @return the number of columns for which statistics are stored */
  uint32_t getNumberOfColumns() const override {
    return static_cast<uint32_t>(colStats.size());
  }
};
class StripeStatisticsImpl : public StripeStatistics {
private:
std::unique_ptr<StatisticsImpl> columnStats;
std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats;
// DELIBERATELY NOT IMPLEMENTED
StripeStatisticsImpl(const StripeStatisticsImpl&);
StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
public:
StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
const StatContext& statContext);
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
return columnStats->getColumnStatistics(columnId);
}
uint32_t getNumberOfColumns() const override {
return columnStats->getNumberOfColumns();
}
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
uint32_t rowIndex) const override {
// check id indices are valid
return rowIndexStats[columnId][rowIndex].get();
}
virtual ~StripeStatisticsImpl() override;
uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
return static_cast<uint32_t>(rowIndexStats[columnId].size());
}
};
/**
 * Create ColumnStatistics for writers.
 * Only the declaration is visible here; the definition lives elsewhere.
 * @param type the type of the column to create statistics for
 * @return a MutableColumnStatistics instance owned by the returned
 *         unique_ptr
 */
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type);
} // namespace orc
#endif