blob: 633450fd544fefcfc101d97ec415c017599266ec [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ORC_STATISTICS_IMPL_HH
#define ORC_STATISTICS_IMPL_HH
#include "orc/Common.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
#include "Timezone.hh"
#include "TypeImpl.hh"
namespace orc {
/**
 * StatContext contains fields required to compute statistics.
 *
 * Both members are const, so a context cannot be modified after it has
 * been constructed.
 */
struct StatContext {
  // Flag copied from the constructor argument.
  // NOTE(review): its exact meaning is defined by the code that reads it - confirm there.
  const bool correctStats;

  // Pointer to the writer's timezone; nullptr when none was supplied.
  // This struct never deletes it.
  const Timezone* const writerTimezone;

  // Default context: correctStats is false and no writer timezone is set.
  StatContext() : StatContext(false) {}

  // Context with an explicit flag and an optional writer timezone.
  StatContext(bool cStat, const Timezone* const timezone = nullptr)
      : correctStats(cStat),
        writerTimezone(timezone) {}
};
/**
 * Internal Statistics Implementation
 *
 * Generic bookkeeping shared by the concrete column statistics classes:
 * a null flag, a value count, and optional minimum / maximum / sum /
 * total length values, each guarded by its own "has" flag.
 *
 * Ordering is delegated to a free function compare(a, b) that is not
 * declared in this class; it has to be visible wherever updateMinMax()
 * or merge() is instantiated.
 */
template <typename T>
class InternalStatisticsImpl {
private:
  bool _hasNull;
  bool _hasMinimum;
  bool _hasMaximum;
  bool _hasSum;
  bool _hasTotalLength;
  uint64_t _totalLength;
  uint64_t _valueCount;
  T _minimum;
  T _maximum;
  T _sum;

public:
  // All flags start false and both counters start at zero. The minimum,
  // maximum and sum are value-initialized so that reading them before
  // they were ever set yields T() instead of an indeterminate value.
  InternalStatisticsImpl()
      : _hasNull(false),
        _hasMinimum(false),
        _hasMaximum(false),
        _hasSum(false),
        _hasTotalLength(false),
        _totalLength(0),
        _valueCount(0),
        _minimum(),
        _maximum(),
        _sum() {}

  ~InternalStatisticsImpl() {}

  // GET / SET _totalLength
  bool hasTotalLength() const { return _hasTotalLength; }

  void setHasTotalLength(bool hasTotalLength) {
    _hasTotalLength = hasTotalLength;
  }

  uint64_t getTotalLength() const { return _totalLength; }

  void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }

  // GET / SET _sum
  bool hasSum() const { return _hasSum; }

  void setHasSum(bool hasSum) { _hasSum = hasSum; }

  T getSum() const { return _sum; }

  void setSum(T sum) { _sum = sum; }

  // GET / SET _maximum
  bool hasMaximum() const { return _hasMaximum; }

  const T & getMaximum() const { return _maximum; }

  void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }

  void setMaximum(T max) { _maximum = max; }

  // GET / SET _minimum
  bool hasMinimum() const { return _hasMinimum; }

  void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }

  const T & getMinimum() const { return _minimum; }

  void setMinimum(T min) { _minimum = min; }

  // GET / SET _valueCount
  uint64_t getNumberOfValues() const { return _valueCount; }

  void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }

  // GET / SET _hasNullValue
  bool hasNull() const { return _hasNull; }

  void setHasNull(bool hasNull) { _hasNull = hasNull; }

  // Clears every flag and zeroes both counters. The stored minimum,
  // maximum and sum values are left as they are; they are simply no
  // longer flagged as present.
  void reset() {
    _hasNull = false;
    _hasMinimum = false;
    _hasMaximum = false;
    _hasSum = false;
    _hasTotalLength = false;
    _totalLength = 0;
    _valueCount = 0;
  }

  // Folds one value into the minimum / maximum.
  // The first value (no minimum recorded yet) seeds both bounds. Later
  // values replace the minimum when compare(value, minimum) holds, and
  // otherwise the maximum when compare(maximum, value) holds.
  void updateMinMax(T value) {
    if (!_hasMinimum) {
      _hasMinimum = _hasMaximum = true;
      _minimum = _maximum = value;
    } else if (compare(value, _minimum)) {
      _minimum = value;
    } else if (compare(_maximum, value)) {
      _maximum = value;
    }
  }

  // Combines `other` into this object: ORs the null flags, adds the value
  // counts, widens the minimum / maximum range, and adds the total
  // lengths (the total length stays defined only if both sides had it).
  // sum is not merged here as we need to check overflow
  void merge(const InternalStatisticsImpl& other) {
    _hasNull = _hasNull || other._hasNull;
    _valueCount += other._valueCount;

    if (other._hasMinimum) {
      if (!_hasMinimum) {
        // nothing recorded on this side yet: adopt the other range
        _hasMinimum = _hasMaximum = true;
        _minimum = other._minimum;
        _maximum = other._maximum;
      } else {
        // all template types should support operator<
        if (compare(_maximum, other._maximum)) {
          _maximum = other._maximum;
        }
        if (compare(other._minimum, _minimum)) {
          _minimum = other._minimum;
        }
      }
    }

    _hasTotalLength = _hasTotalLength && other._hasTotalLength;
    _totalLength += other._totalLength;
  }
};
// Concrete instantiations of InternalStatisticsImpl used by the column
// statistics classes below.
using InternalCharStatistics    = InternalStatisticsImpl<char>;
using InternalBooleanStatistics = InternalStatisticsImpl<char>;
using InternalIntegerStatistics = InternalStatisticsImpl<int64_t>;
using InternalDateStatistics    = InternalStatisticsImpl<int32_t>;
using InternalDoubleStatistics  = InternalStatisticsImpl<double>;
using InternalDecimalStatistics = InternalStatisticsImpl<Decimal>;
using InternalStringStatistics  = InternalStatisticsImpl<std::string>;
/**
* Mutable column statistics for use by the writer.
*
* Pure interface: every operation is implemented by the concrete
* *ColumnStatisticsImpl classes declared in this header.
*/
class MutableColumnStatistics {
public:
virtual ~MutableColumnStatistics();
// Adds `count` to the number of values (see the implementations in this header).
virtual void increase(uint64_t count) = 0;
// Overwrites the number of values.
virtual void setNumberOfValues(uint64_t value) = 0;
// Records whether a null value has been seen.
virtual void setHasNull(bool hasNull) = 0;
// Folds `other` into this object. The implementations in this header
// dynamic_cast `other` to their own concrete type, so passing a different
// statistics kind makes that reference cast fail.
virtual void merge(const MutableColumnStatistics& other) = 0;
// Returns the statistics to their initial state.
virtual void reset() = 0;
// Writes the statistics into the protobuf message `pbStats`.
virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
};
/**
* ColumnStatistics Implementation
*/
class ColumnStatisticsImpl: public ColumnStatistics,
public MutableColumnStatistics {
private:
InternalCharStatistics _stats;
public:
ColumnStatisticsImpl() { reset(); }
ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
virtual ~ColumnStatisticsImpl() override;
uint64_t getNumberOfValues() const override {
return _stats.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
_stats.setNumberOfValues(value);
}
void increase(uint64_t count) override {
_stats.setNumberOfValues(_stats.getNumberOfValues() + count);
}
bool hasNull() const override {
return _stats.hasNull();
}
void setHasNull(bool hasNull) override {
_stats.setHasNull(hasNull);
}
void merge(const MutableColumnStatistics& other) override {
_stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
}
void reset() override {
_stats.reset();
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
pbStats.set_hasnull(_stats.hasNull());
pbStats.set_numberofvalues(_stats.getNumberOfValues());
}
std::string toString() const override {
std::ostringstream buffer;
buffer << "Column has " << getNumberOfValues() << " values"
<< " and has null value: " << (hasNull() ? "yes" : "no")
<< std::endl;
return buffer.str();
}
};
/**
 * Statistics for binary columns: value count, null flag and the total
 * length of all values.
 */
class BinaryColumnStatisticsImpl: public BinaryColumnStatistics,
                                  public MutableColumnStatistics {
private:
  InternalCharStatistics _stats;

public:
  // Starts from the reset state (total length defined and equal to 0).
  BinaryColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                             const StatContext& statContext);

  virtual ~BinaryColumnStatisticsImpl() override;

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  bool hasTotalLength() const override {
    return _stats.hasTotalLength();
  }

  // Throws ParseError when the total length is not defined.
  uint64_t getTotalLength() const override {
    if (!hasTotalLength()) {
      throw ParseError("Total length is not defined.");
    }
    return _stats.getTotalLength();
  }

  // Sets the total length and marks it as defined.
  void setTotalLength(uint64_t length) {
    _stats.setHasTotalLength(true);
    _stats.setTotalLength(length);
  }

  // Accounts for one more value of `length` bytes.
  void update(size_t length) {
    _stats.setTotalLength(_stats.getTotalLength() + length);
  }

  // `other` must be a BinaryColumnStatisticsImpl; the reference cast fails otherwise.
  void merge(const MutableColumnStatistics& other) override {
    const BinaryColumnStatisticsImpl& binStats =
        dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
    _stats.merge(binStats._stats);
  }

  void reset() override {
    _stats.reset();
    setTotalLength(0);
  }

  // Serializes the statistics. The sum is only written while the total
  // length is defined (a merge with statistics lacking it clears the
  // flag), matching what StringColumnStatisticsImpl does.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
    if (_stats.hasTotalLength()) {
      binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
    } else {
      binStats->clear_sum();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Binary" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasTotalLength()) {
      buffer << "Total length: " << getTotalLength() << std::endl;
    } else {
      buffer << "Total length: not defined" << std::endl;
    }
    return buffer.str();
  }
};
/**
 * Statistics for boolean columns: value count, null flag and the number
 * of true values (the false count is derived from the other two).
 */
class BooleanColumnStatisticsImpl: public BooleanColumnStatistics,
                                   public MutableColumnStatistics {
private:
  InternalBooleanStatistics _stats;
  bool _hasCount;
  uint64_t _trueCount;

public:
  // Starts from the reset state (true count defined and equal to 0).
  BooleanColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                              const StatContext& statContext);

  virtual ~BooleanColumnStatisticsImpl() override;

  bool hasCount() const override {
    return _hasCount;
  }

  // Adds `count` to the number of values and marks the counts as defined.
  void increase(uint64_t count) override {
    const uint64_t updated = _stats.getNumberOfValues() + count;
    _stats.setNumberOfValues(updated);
    _hasCount = true;
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Number of values minus the true count; throws ParseError when the
  // counts are not defined.
  uint64_t getFalseCount() const override {
    if (!hasCount()) {
      throw ParseError("False count is not defined.");
    }
    return getNumberOfValues() - _trueCount;
  }

  // Throws ParseError when the counts are not defined.
  uint64_t getTrueCount() const override {
    if (!hasCount()) {
      throw ParseError("True count is not defined.");
    }
    return _trueCount;
  }

  // Sets the true count and marks the counts as defined.
  void setTrueCount(uint64_t trueCount) {
    _hasCount = true;
    _trueCount = trueCount;
  }

  // Records `repetitions` occurrences of `value`; only true values are
  // counted here.
  void update(bool value, size_t repetitions) {
    if (!value) {
      return;
    }
    _trueCount += repetitions;
  }

  // `other` must be a BooleanColumnStatisticsImpl; the reference cast fails otherwise.
  void merge(const MutableColumnStatistics& other) override {
    const BooleanColumnStatisticsImpl& rhs =
        dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
    _stats.merge(rhs._stats);
    _hasCount = _hasCount && rhs._hasCount;
    _trueCount += rhs._trueCount;
  }

  void reset() override {
    _stats.reset();
    setTrueCount(0);
  }

  // Serializes the statistics; the true count is added as a bucket count
  // only while it is defined.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::BucketStatistics* buckets = pbStats.mutable_bucketstatistics();
    if (!_hasCount) {
      buckets->clear_count();
    } else {
      buckets->add_count(_trueCount);
    }
  }

  std::string toString() const override {
    std::ostringstream text;
    text << "Data type: Boolean" << std::endl
         << "Values: " << getNumberOfValues() << std::endl
         << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (!hasCount()) {
      text << "(true: not defined; false: not defined)" << std::endl;
      text << "True and false counts are not defined" << std::endl;
    } else {
      text << "(true: " << getTrueCount() << "; false: "
           << getFalseCount() << ")" << std::endl;
    }
    return text.str();
  }
};
/**
 * Statistics for date columns: value count, null flag and the minimum /
 * maximum value, stored as int32_t.
 */
class DateColumnStatisticsImpl: public DateColumnStatistics,
                                public MutableColumnStatistics {
private:
  InternalDateStatistics _stats;

public:
  // Starts from the reset state.
  DateColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                           const StatContext& statContext);

  virtual ~DateColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  int32_t getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  int32_t getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Sets the minimum and marks it as defined (the maximum is untouched).
  void setMinimum(int32_t minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Sets the maximum and marks it as defined (the minimum is untouched).
  void setMaximum(int32_t maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Folds one value into the minimum / maximum.
  void update(int32_t value) {
    _stats.updateMinMax(value);
  }

  // `other` must be a DateColumnStatisticsImpl; the reference cast fails otherwise.
  void merge(const MutableColumnStatistics& other) override {
    const DateColumnStatisticsImpl& dateStats =
        dynamic_cast<const DateColumnStatisticsImpl&>(other);
    _stats.merge(dateStats._stats);
  }

  void reset() override {
    _stats.reset();
  }

  // Serializes the statistics. Minimum and maximum are written
  // independently because setMinimum() / setMaximum() define them
  // independently; a bound that is not defined is cleared rather than
  // written with a meaningless value.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DateStatistics* dateStatistics =
        pbStats.mutable_datestatistics();
    if (_stats.hasMinimum()) {
      dateStatistics->set_minimum(_stats.getMinimum());
    } else {
      dateStatistics->clear_minimum();
    }
    if (_stats.hasMaximum()) {
      dateStatistics->set_maximum(_stats.getMaximum());
    } else {
      dateStatistics->clear_maximum();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Date" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasMinimum()) {
      buffer << "Minimum: " << getMinimum() << std::endl;
    } else {
      buffer << "Minimum: not defined" << std::endl;
    }
    if (hasMaximum()) {
      buffer << "Maximum: " << getMaximum() << std::endl;
    } else {
      buffer << "Maximum: not defined" << std::endl;
    }
    return buffer.str();
  }
};
/**
 * Statistics for decimal columns: value count, null flag, minimum,
 * maximum and a sum that becomes undefined once it overflows.
 */
class DecimalColumnStatisticsImpl: public DecimalColumnStatistics,
                                   public MutableColumnStatistics {
private:
  InternalDecimalStatistics _stats;

public:
  // Starts from the reset state (sum defined and equal to Decimal()).
  DecimalColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                              const StatContext& statContext);

  virtual ~DecimalColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  Decimal getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  Decimal getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Sets the minimum and marks it as defined (the maximum is untouched).
  void setMinimum(Decimal minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Sets the maximum and marks it as defined (the minimum is untouched).
  void setMaximum(Decimal maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  Decimal getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Sets the sum and marks it as defined.
  void setSum(Decimal sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Folds one value into the minimum / maximum and, while the sum is
  // still defined, into the sum.
  void update(const Decimal& value) {
    _stats.updateMinMax(value);
    if (_stats.hasSum()) {
      updateSum(value);
    }
  }

  // `other` must be a DecimalColumnStatisticsImpl; the reference cast
  // fails otherwise. The merged sum stays defined only if both sides had
  // one and adding them does not overflow.
  void merge(const MutableColumnStatistics& other) override {
    const DecimalColumnStatisticsImpl& decStats =
        dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
    _stats.merge(decStats._stats);

    _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
    if (_stats.hasSum()) {
      updateSum(decStats.getSum());
    }
  }

  void reset() override {
    _stats.reset();
    setSum(Decimal());
  }

  // Serializes the statistics. Minimum, maximum and sum are each written
  // only when defined and cleared otherwise; minimum and maximum are
  // handled independently because setMinimum() / setMaximum() define
  // them independently.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
    if (_stats.hasMinimum()) {
      decStats->set_minimum(_stats.getMinimum().toString());
    } else {
      decStats->clear_minimum();
    }
    if (_stats.hasMaximum()) {
      decStats->set_maximum(_stats.getMaximum().toString());
    } else {
      decStats->clear_maximum();
    }
    if (_stats.hasSum()) {
      decStats->set_sum(_stats.getSum().toString());
    } else {
      decStats->clear_sum();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Decimal" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasMinimum()) {
      buffer << "Minimum: " << getMinimum().toString() << std::endl;
    } else {
      buffer << "Minimum: not defined" << std::endl;
    }
    if (hasMaximum()) {
      buffer << "Maximum: " << getMaximum().toString() << std::endl;
    } else {
      buffer << "Maximum: not defined" << std::endl;
    }
    if (hasSum()) {
      buffer << "Sum: " << getSum().toString() << std::endl;
    } else {
      buffer << "Sum: not defined" << std::endl;
    }
    return buffer.str();
  }

private:
  // Adds `value` to the running sum. Whichever operand has the smaller
  // scale is scaled up to the larger one first. If scaling reports an
  // overflow, or the addition of two same-signed operands flips the sign,
  // the sum is marked undefined and the stored sum is left unchanged.
  void updateSum(Decimal value) {
    if (_stats.hasSum()) {
      bool overflow = false;
      Decimal sum = _stats.getSum();
      if (sum.scale > value.scale) {
        value.value = scaleUpInt128ByPowerOfTen(value.value,
                                                sum.scale - value.scale,
                                                overflow);
      } else if (sum.scale < value.scale) {
        sum.value = scaleUpInt128ByPowerOfTen(sum.value,
                                              value.scale - sum.scale,
                                              overflow);
        sum.scale = value.scale;
      }

      if (!overflow) {
        bool wasPositive = sum.value >= 0;
        sum.value += value.value;
        // only operands of the same sign can overflow
        if ((value.value >= 0) == wasPositive) {
          _stats.setHasSum((sum.value >= 0) == wasPositive);
        }
      } else {
        _stats.setHasSum(false);
      }

      if (_stats.hasSum()) {
        _stats.setSum(sum);
      }
    }
  }
};
/**
 * Statistics for floating point columns: value count, null flag,
 * minimum, maximum and sum.
 */
class DoubleColumnStatisticsImpl: public DoubleColumnStatistics,
                                  public MutableColumnStatistics {
private:
  InternalDoubleStatistics _stats;

public:
  // Starts from the reset state (sum defined and equal to 0.0).
  DoubleColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);

  virtual ~DoubleColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  double getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  double getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Sets the minimum and marks it as defined (the maximum is untouched).
  void setMinimum(double minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Sets the maximum and marks it as defined (the minimum is untouched).
  void setMaximum(double maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  double getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Sets the sum and marks it as defined.
  void setSum(double sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Folds one value into the minimum / maximum and adds it to the stored
  // sum (the "has sum" flag is not consulted or changed here).
  void update(double value) {
    _stats.updateMinMax(value);
    _stats.setSum(_stats.getSum() + value);
  }

  // `other` must be a DoubleColumnStatisticsImpl; the reference cast
  // fails otherwise. The merged sum stays defined only if both sides had one.
  void merge(const MutableColumnStatistics& other) override {
    const DoubleColumnStatisticsImpl& doubleStats =
        dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
    _stats.merge(doubleStats._stats);

    _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
    if (_stats.hasSum()) {
      _stats.setSum(_stats.getSum() + doubleStats.getSum());
    }
  }

  void reset() override {
    _stats.reset();
    setSum(0.0);
  }

  // Serializes the statistics. Minimum, maximum and sum are each written
  // only when defined and cleared otherwise; minimum and maximum are
  // handled independently because setMinimum() / setMaximum() define
  // them independently.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
    if (_stats.hasMinimum()) {
      doubleStats->set_minimum(_stats.getMinimum());
    } else {
      doubleStats->clear_minimum();
    }
    if (_stats.hasMaximum()) {
      doubleStats->set_maximum(_stats.getMaximum());
    } else {
      doubleStats->clear_maximum();
    }
    if (_stats.hasSum()) {
      doubleStats->set_sum(_stats.getSum());
    } else {
      doubleStats->clear_sum();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Double" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasMinimum()) {
      buffer << "Minimum: " << getMinimum() << std::endl;
    } else {
      buffer << "Minimum: not defined" << std::endl;
    }
    if (hasMaximum()) {
      buffer << "Maximum: " << getMaximum() << std::endl;
    } else {
      buffer << "Maximum: not defined" << std::endl;
    }
    if (hasSum()) {
      buffer << "Sum: " << getSum() << std::endl;
    } else {
      buffer << "Sum: not defined" << std::endl;
    }
    return buffer.str();
  }
};
/**
 * Statistics for integer columns: value count, null flag, minimum,
 * maximum and a sum that becomes undefined once it overflows int64_t.
 */
class IntegerColumnStatisticsImpl: public IntegerColumnStatistics,
                                   public MutableColumnStatistics {
private:
  InternalIntegerStatistics _stats;

public:
  // Starts from the reset state (sum defined and equal to 0).
  IntegerColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);

  virtual ~IntegerColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  bool hasSum() const override {
    return _stats.hasSum();
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  int64_t getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  int64_t getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Sets the minimum and marks it as defined (the maximum is untouched).
  void setMinimum(int64_t minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Sets the maximum and marks it as defined (the minimum is untouched).
  void setMaximum(int64_t maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Throws ParseError when no sum is defined.
  int64_t getSum() const override {
    if (!hasSum()) {
      throw ParseError("Sum is not defined.");
    }
    return _stats.getSum();
  }

  // Sets the sum and marks it as defined.
  void setSum(int64_t sum) {
    _stats.setHasSum(true);
    _stats.setSum(sum);
  }

  // Defined out of line.
  void update(int64_t value, int repetitions);

  // `other` must be an IntegerColumnStatisticsImpl; the reference cast
  // fails otherwise. The merged sum stays defined only if both sides had
  // one and adding them does not overflow.
  void merge(const MutableColumnStatistics& other) override {
    const IntegerColumnStatisticsImpl& intStats =
        dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
    _stats.merge(intStats._stats);

    // update sum and check overflow
    _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
    if (_stats.hasSum()) {
      const int64_t lhs = _stats.getSum();
      const int64_t rhs = intStats.getSum();
      // Signed overflow is undefined behavior, so the addition is done in
      // unsigned arithmetic (which wraps) and converted back; a sign flip
      // of same-signed operands then reveals the overflow.
      const int64_t total = static_cast<int64_t>(
          static_cast<uint64_t>(lhs) + static_cast<uint64_t>(rhs));
      _stats.setSum(total);
      if ((rhs >= 0) == (lhs >= 0)) {
        _stats.setHasSum((total >= 0) == (lhs >= 0));
      }
    }
  }

  void reset() override {
    _stats.reset();
    setSum(0);
  }

  // Serializes the statistics. Minimum, maximum and sum are each written
  // only when defined and cleared otherwise; minimum and maximum are
  // handled independently because setMinimum() / setMaximum() define
  // them independently.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
    if (_stats.hasMinimum()) {
      intStats->set_minimum(_stats.getMinimum());
    } else {
      intStats->clear_minimum();
    }
    if (_stats.hasMaximum()) {
      intStats->set_maximum(_stats.getMaximum());
    } else {
      intStats->clear_maximum();
    }
    if (_stats.hasSum()) {
      intStats->set_sum(_stats.getSum());
    } else {
      intStats->clear_sum();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Integer" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
    if (hasMinimum()) {
      buffer << "Minimum: " << getMinimum() << std::endl;
    } else {
      buffer << "Minimum: not defined" << std::endl;
    }
    if (hasMaximum()) {
      buffer << "Maximum: " << getMaximum() << std::endl;
    } else {
      buffer << "Maximum: not defined" << std::endl;
    }
    if (hasSum()) {
      buffer << "Sum: " << getSum() << std::endl;
    } else {
      buffer << "Sum: not defined" << std::endl;
    }
    return buffer.str();
  }
};
class StringColumnStatisticsImpl: public StringColumnStatistics,
public MutableColumnStatistics{
private:
InternalStringStatistics _stats;
public:
StringColumnStatisticsImpl() {
reset();
}
StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
const StatContext& statContext);
virtual ~StringColumnStatisticsImpl() override;
bool hasMinimum() const override {
return _stats.hasMinimum();
}
bool hasMaximum() const override {
return _stats.hasMaximum();
}
bool hasTotalLength() const override {
return _stats.hasTotalLength();
}
void increase(uint64_t count) override {
_stats.setNumberOfValues(_stats.getNumberOfValues() + count);
}
uint64_t getNumberOfValues() const override {
return _stats.getNumberOfValues();
}
void setNumberOfValues(uint64_t value) override {
_stats.setNumberOfValues(value);
}
bool hasNull() const override {
return _stats.hasNull();
}
void setHasNull(bool hasNull) override {
_stats.setHasNull(hasNull);
}
const std::string & getMinimum() const override {
if(hasMinimum()){
return _stats.getMinimum();
}else{
throw ParseError("Minimum is not defined.");
}
}
const std::string & getMaximum() const override {
if(hasMaximum()){
return _stats.getMaximum();
}else{
throw ParseError("Maximum is not defined.");
}
}
void setMinimum(std::string minimum) {
_stats.setHasMinimum(true);
_stats.setMinimum(minimum);
}
void setMaximum(std::string maximum) {
_stats.setHasMaximum(true);
_stats.setMaximum(maximum);
}
uint64_t getTotalLength() const override {
if(hasTotalLength()){
return _stats.getTotalLength();
}else{
throw ParseError("Total length is not defined.");
}
}
void setTotalLength(uint64_t length) {
_stats.setHasTotalLength(true);
_stats.setTotalLength(length);
}
void update(const char* value, size_t length) {
if (value != nullptr) {
if (!_stats.hasMinimum()) {
std::string tempStr(value, value + length);
setMinimum(tempStr);
setMaximum(tempStr);
} else {
// update min
int minCmp = strncmp(_stats.getMinimum().c_str(),
value,
std::min(_stats.getMinimum().length(), length));
if (minCmp > 0 ||
(minCmp == 0 && length < _stats.getMinimum().length())) {
setMinimum(std::string(value, value + length));
}
// update max
int maxCmp = strncmp(_stats.getMaximum().c_str(),
value,
std::min(_stats.getMaximum().length(), length));
if (maxCmp < 0 ||
(maxCmp == 0 && length > _stats.getMaximum().length())) {
setMaximum(std::string(value, value + length));
}
}
}
_stats.setTotalLength(_stats.getTotalLength() + length);
}
void update(std::string value) {
update(value.c_str(), value.length());
}
void merge(const MutableColumnStatistics& other) override {
const StringColumnStatisticsImpl& strStats =
dynamic_cast<const StringColumnStatisticsImpl&>(other);
_stats.merge(strStats._stats);
}
void reset() override {
_stats.reset();
setTotalLength(0);
}
void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
pbStats.set_hasnull(_stats.hasNull());
pbStats.set_numberofvalues(_stats.getNumberOfValues());
proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
if (_stats.hasMinimum()) {
strStats->set_minimum(_stats.getMinimum());
strStats->set_maximum(_stats.getMaximum());
} else {
strStats->clear_minimum();
strStats->clear_maximum();
}
if (_stats.hasTotalLength()) {
strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
} else {
strStats->clear_sum();
}
}
std::string toString() const override {
std::ostringstream buffer;
buffer << "Data type: String" << std::endl
<< "Values: " << getNumberOfValues() << std::endl
<< "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
if(hasMinimum()){
buffer << "Minimum: " << getMinimum() << std::endl;
}else{
buffer << "Minimum is not defined" << std::endl;
}
if(hasMaximum()){
buffer << "Maximum: " << getMaximum() << std::endl;
}else{
buffer << "Maximum is not defined" << std::endl;
}
if(hasTotalLength()){
buffer << "Total length: " << getTotalLength() << std::endl;
}else{
buffer << "Total length is not defined" << std::endl;
}
return buffer.str();
}
};
/**
 * Statistics for timestamp columns: value count, null flag, minimum and
 * maximum, plus an optional lower / upper bound. toString() interprets
 * all four values as milliseconds and prints them as UTC times.
 */
class TimestampColumnStatisticsImpl: public TimestampColumnStatistics,
                                     public MutableColumnStatistics {
private:
  InternalIntegerStatistics _stats;
  bool _hasLowerBound;
  bool _hasUpperBound;
  int64_t _lowerBound;
  int64_t _upperBound;

  // Appends "<label>: YYYY-MM-DD HH:MM:SS.<millis % 1000>" and a newline.
  // The buffer is larger than the 20 bytes a four-digit year needs so
  // that years beyond 9999 do not make strftime fail and leave the
  // buffer contents unspecified.
  static void appendTime(std::ostringstream& out,
                         const char* label,
                         int64_t millis) {
    struct tm tmValue;
    char timeBuffer[32];
    time_t secs = static_cast<time_t>(millis / 1000);
    gmtime_r(&secs, &tmValue);
    strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
    out << label << ": " << timeBuffer << "."
        << (millis % 1000) << std::endl;
  }

public:
  // Starts from the reset state: nothing defined, including the bounds.
  TimestampColumnStatisticsImpl() { reset(); }

  // Defined out of line.
  TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
                                const StatContext& statContext);

  virtual ~TimestampColumnStatisticsImpl() override;

  bool hasMinimum() const override {
    return _stats.hasMinimum();
  }

  bool hasMaximum() const override {
    return _stats.hasMaximum();
  }

  uint64_t getNumberOfValues() const override {
    return _stats.getNumberOfValues();
  }

  void setNumberOfValues(uint64_t value) override {
    _stats.setNumberOfValues(value);
  }

  // Adds `count` to the current number of values.
  void increase(uint64_t count) override {
    _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  }

  bool hasNull() const override {
    return _stats.hasNull();
  }

  void setHasNull(bool hasNull) override {
    _stats.setHasNull(hasNull);
  }

  // Throws ParseError when no minimum is defined.
  int64_t getMinimum() const override {
    if (!hasMinimum()) {
      throw ParseError("Minimum is not defined.");
    }
    return _stats.getMinimum();
  }

  // Throws ParseError when no maximum is defined.
  int64_t getMaximum() const override {
    if (!hasMaximum()) {
      throw ParseError("Maximum is not defined.");
    }
    return _stats.getMaximum();
  }

  // Sets the minimum and marks it as defined (the maximum is untouched).
  void setMinimum(int64_t minimum) {
    _stats.setHasMinimum(true);
    _stats.setMinimum(minimum);
  }

  // Sets the maximum and marks it as defined (the minimum is untouched).
  void setMaximum(int64_t maximum) {
    _stats.setHasMaximum(true);
    _stats.setMaximum(maximum);
  }

  // Folds one value into the minimum / maximum.
  void update(int64_t value) {
    _stats.updateMinMax(value);
  }

  // `other` must be a TimestampColumnStatisticsImpl; the reference cast
  // fails otherwise. The lower / upper bounds are not merged.
  void merge(const MutableColumnStatistics& other) override {
    const TimestampColumnStatisticsImpl& tsStats =
        dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
    _stats.merge(tsStats._stats);
  }

  // Clears the shared statistics and the bounds. The bounds have to be
  // cleared here as well: the default constructor relies on reset() for
  // initialization, and toString() reads the bound flags.
  void reset() override {
    _stats.reset();
    _hasLowerBound = false;
    _hasUpperBound = false;
    _lowerBound = 0;
    _upperBound = 0;
  }

  // Serializes the statistics. Minimum and maximum are written
  // independently because setMinimum() / setMaximum() define them
  // independently; a bound that is not defined is cleared.
  void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
    pbStats.set_hasnull(_stats.hasNull());
    pbStats.set_numberofvalues(_stats.getNumberOfValues());

    proto::TimestampStatistics* tsStats =
        pbStats.mutable_timestampstatistics();
    if (_stats.hasMinimum()) {
      tsStats->set_minimumutc(_stats.getMinimum());
    } else {
      tsStats->clear_minimumutc();
    }
    if (_stats.hasMaximum()) {
      tsStats->set_maximumutc(_stats.getMaximum());
    } else {
      tsStats->clear_maximumutc();
    }
  }

  std::string toString() const override {
    std::ostringstream buffer;
    buffer << "Data type: Timestamp" << std::endl
           << "Values: " << getNumberOfValues() << std::endl
           << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;

    if (hasMinimum()) {
      appendTime(buffer, "Minimum", getMinimum());
    } else {
      buffer << "Minimum is not defined" << std::endl;
    }

    if (hasLowerBound()) {
      appendTime(buffer, "LowerBound", getLowerBound());
    } else {
      buffer << "LowerBound is not defined" << std::endl;
    }

    if (hasMaximum()) {
      appendTime(buffer, "Maximum", getMaximum());
    } else {
      buffer << "Maximum is not defined" << std::endl;
    }

    if (hasUpperBound()) {
      appendTime(buffer, "UpperBound", getUpperBound());
    } else {
      buffer << "UpperBound is not defined" << std::endl;
    }

    return buffer.str();
  }

  bool hasLowerBound() const override {
    return _hasLowerBound;
  }

  bool hasUpperBound() const override {
    return _hasUpperBound;
  }

  // Throws ParseError when no lower bound is defined.
  int64_t getLowerBound() const override {
    if (!hasLowerBound()) {
      throw ParseError("LowerBound is not defined.");
    }
    return _lowerBound;
  }

  // Throws ParseError when no upper bound is defined.
  int64_t getUpperBound() const override {
    if (!hasUpperBound()) {
      throw ParseError("UpperBound is not defined.");
    }
    return _upperBound;
  }
};
// Declared here, defined elsewhere. Takes one serialized
// proto::ColumnStatistics message plus the StatContext and returns a
// ColumnStatistics object for it.
// NOTE(review): the result is a raw pointer; presumably the caller takes
// ownership (StatisticsImpl stores such pointers) - confirm in the definition.
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
/**
 * Statistics for a set of columns, indexed by column id.
 */
class StatisticsImpl: public Statistics {
private:
  // One entry per column.
  // NOTE(review): raw pointers; presumably released by the out-of-line
  // destructor - confirm there.
  std::vector<ColumnStatistics*> colStats;

  // Copying is disabled.
  StatisticsImpl(const StatisticsImpl&) = delete;
  StatisticsImpl& operator=(const StatisticsImpl&) = delete;

public:
  // Both constructors are defined out of line.
  StatisticsImpl(const proto::StripeStatistics& stripeStats,
                 const StatContext& statContext);

  StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);

  // Returns the statistics of column `columnId`.
  // Throws std::out_of_range for an invalid column id instead of reading
  // past the end of the vector.
  virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
                                                      ) const override {
    return colStats.at(columnId);
  }

  virtual ~StatisticsImpl() override;

  uint32_t getNumberOfColumns() const override {
    return static_cast<uint32_t>(colStats.size());
  }
};
class StripeStatisticsImpl: public StripeStatistics {
private:
std::unique_ptr<StatisticsImpl> columnStats;
std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > >
rowIndexStats;
// DELIBERATELY NOT IMPLEMENTED
StripeStatisticsImpl(const StripeStatisticsImpl&);
StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
public:
StripeStatisticsImpl(
const proto::StripeStatistics& stripeStats,
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
const StatContext& statContext);
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
) const override {
return columnStats->getColumnStatistics(columnId);
}
uint32_t getNumberOfColumns() const override {
return columnStats->getNumberOfColumns();
}
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
uint32_t rowIndex
) const override {
// check id indices are valid
return rowIndexStats[columnId][rowIndex].get();
}
virtual ~StripeStatisticsImpl() override;
uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
return static_cast<uint32_t>(rowIndexStats[columnId].size());
}
};
/**
* Create ColumnStatistics for writers.
* Declared here, defined elsewhere.
* @param type the type of the column
* @return a MutableColumnStatistics instance, owned by the returned unique_ptr
*/
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
const Type& type);
}// namespace
#endif