// blob: 414cd6e93e87c26930fda7a25985d658d80798e8 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.catalog;
import java.util.Set;
import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
import org.apache.impala.analysis.Expr;
import org.apache.impala.analysis.SlotRef;
import org.apache.impala.thrift.TColumnStats;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import com.google.common.math.LongMath;
/**
 * Statistics for a single column.
 *
 * Every statistic is either a non-negative value or -1, which marks it as unknown.
 * {@link #validate(Type)} enforces this and is invoked by the constructors, by
 * {@link #add(ColumnStats)} and by all {@code update()} overloads. The plain setters
 * do not validate.
 */
public class ColumnStats {
  // Set of the currently supported column stats column types. Immutable so that the
  // shared constant cannot be modified after class initialization.
  private static final Set<PrimitiveType> SUPPORTED_COL_TYPES = Sets.immutableEnumSet(
      PrimitiveType.BIGINT, PrimitiveType.BINARY, PrimitiveType.BOOLEAN,
      PrimitiveType.DATE, PrimitiveType.DOUBLE, PrimitiveType.FLOAT,
      PrimitiveType.INT, PrimitiveType.SMALLINT, PrimitiveType.CHAR,
      PrimitiveType.VARCHAR, PrimitiveType.STRING, PrimitiveType.TIMESTAMP,
      PrimitiveType.TINYINT, PrimitiveType.DECIMAL);

  /**
   * Keys identifying the individual statistics that can be set through
   * {@link #update(Type, StatsKey, Number)}. Each key carries the textual name that
   * {@link #fromString(String)} matches against and {@link #toString()} returns.
   */
  public enum StatsKey {
    NUM_DISTINCT_VALUES("numDVs"),
    NUM_NULLS("numNulls"),
    AVG_SIZE("avgSize"),
    MAX_SIZE("maxSize");

    private final String name_;

    private StatsKey(String name) { name_ = name; }

    /**
     * Returns the StatsKey whose name matches 'key'. The comparison is
     * case insensitive. Returns null if there is no matching StatsKey, which
     * includes the case where 'key' itself is null.
     */
    public static StatsKey fromString(String key) {
      for (StatsKey k: values()) {
        // Compare from the constant's side so that a null 'key' yields "no match"
        // instead of a NullPointerException.
        if (k.name_.equalsIgnoreCase(key)) return k;
      }
      return null;
    }

    @Override
    public String toString() { return name_; }
  }

  // in bytes: excludes serialization overhead.
  // -1 if unknown. Always has a valid value for fixed-length types.
  private double avgSize_;

  // in bytes; includes serialization overhead.
  // -1 if unknown. Always has a valid value for fixed-length types.
  // avgSerializedSize_ is valid iff avgSize_ is valid.
  private double avgSerializedSize_;

  // in bytes; -1 if unknown.
  private long maxSize_;

  // -1 if unknown.
  private long numDistinctValues_;

  // -1 if unknown.
  private long numNulls_;

  /**
   * Creates stats for a column of type 'colType' with every value initialized by
   * initColStats(), i.e. unknown except for the sizes of fixed-length types.
   */
  public ColumnStats(Type colType) {
    initColStats(colType);
    validate(colType);
  }

  /**
   * C'tor for clone(). Copies all stats from 'other' and validates them without
   * a column type, since none is available here.
   */
  private ColumnStats(ColumnStats other) {
    avgSize_ = other.avgSize_;
    avgSerializedSize_ = other.avgSerializedSize_;
    maxSize_ = other.maxSize_;
    numDistinctValues_ = other.numDistinctValues_;
    numNulls_ = other.numNulls_;
    validate(null);
  }

  /**
   * Initializes all column stats values as "unknown". For fixed-length type
   * (those which don't need additional storage besides the slot they occupy),
   * sets avgSize, avgSerializedSize and maxSize to their slot size.
   */
  private void initColStats(Type colType) {
    avgSize_ = -1;
    avgSerializedSize_ = -1;
    maxSize_ = -1;
    numDistinctValues_ = -1;
    numNulls_ = -1;
    if (colType.isFixedLengthType()) {
      avgSerializedSize_ = colType.getSlotSize();
      avgSize_ = colType.getSlotSize();
      maxSize_ = colType.getSlotSize();
    }
  }

  /**
   * Creates ColumnStats from the given expr. Sets numDistinctValues and, if the expr
   * is a SlotRef whose descriptor has stats, also numNulls. For such a SlotRef of a
   * non-fixed-length type the size stats are copied from the slot's stats as well.
   * 'expr' must be non-null and have a valid type.
   */
  public static ColumnStats fromExpr(Expr expr) {
    Preconditions.checkNotNull(expr);
    Preconditions.checkState(expr.getType().isValid(), expr);
    Type colType = expr.getType();
    ColumnStats stats = new ColumnStats(colType);
    stats.setNumDistinctValues(expr.getNumDistinctValues());
    SlotRef slotRef = expr.unwrapSlotRef(false);
    // Without an underlying slot (or stats on it) only the NDV can be provided.
    if (slotRef == null) return stats;
    ColumnStats slotStats = slotRef.getDesc().getStats();
    if (slotStats == null) return stats;
    stats.numNulls_ = slotStats.getNumNulls();
    // Fixed-length types already got their sizes from the constructor.
    if (!colType.isFixedLengthType()) {
      stats.avgSerializedSize_ = slotStats.getAvgSerializedSize();
      stats.avgSize_ = slotStats.getAvgSize();
      stats.maxSize_ = slotStats.getMaxSize();
    }
    stats.validate(colType);
    return stats;
  }

  /**
   * Adds other's numDistinctValues and numNulls to this ColumnStats.
   * If this or other's stats are invalid, sets the corresponding stat to invalid.
   * Returns this with the updated stats.
   * This method is used to aggregate stats for slots that originate from multiple
   * source slots, e.g., those produced by union queries.
   */
  public ColumnStats add(ColumnStats other) {
    if (numDistinctValues_ == -1 || other.numDistinctValues_ == -1) {
      numDistinctValues_ = -1;
    } else {
      numDistinctValues_ += other.numDistinctValues_;
    }
    if (numNulls_ == -1 || other.numNulls_ == -1) {
      numNulls_ = -1;
    } else {
      numNulls_ += other.numNulls_;
    }
    validate(null);
    return this;
  }

  public long getNumDistinctValues() { return numDistinctValues_; }

  // Sets the NDV without validating it.
  public void setNumDistinctValues(long numDistinctValues) {
    numDistinctValues_ = numDistinctValues;
  }

  // Sets the number of NULLs without validating it.
  public void setNumNulls(long numNulls) { numNulls_ = numNulls; }

  public double getAvgSerializedSize() { return avgSerializedSize_; }
  public double getAvgSize() { return avgSize_; }
  public long getMaxSize() { return maxSize_; }

  // True iff the number of NULLs is known and greater than zero.
  public boolean hasNulls() { return numNulls_ > 0; }
  public long getNumNulls() { return numNulls_; }

  // True iff getAvgSize() and getAvgSerializedSize() will return valid values.
  public boolean hasAvgSize() { return avgSize_ >= 0; }
  public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; }

  // True iff at least one of numNulls and numDistinctValues is not the unknown marker.
  public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; }

  /**
   * Updates the stats with the given ColumnStatisticsData. If the ColumnStatisticsData
   * is not compatible with the given colType, all stats are initialized based on
   * initColStats().
   * Returns false if the ColumnStatisticsData data was incompatible with the given
   * column type, otherwise returns true.
   * Requires that 'colType' is a supported column type, see isSupportedColType().
   */
  public boolean update(Type colType, ColumnStatisticsData statsData) {
    Preconditions.checkState(isSupportedColType(colType));
    initColStats(colType);
    boolean isCompatible = false;
    switch (colType.getPrimitiveType()) {
      case BOOLEAN:
        isCompatible = statsData.isSetBooleanStats();
        if (isCompatible) {
          BooleanColumnStatsData boolStats = statsData.getBooleanStats();
          numNulls_ = boolStats.getNumNulls();
          // If we have numNulls, we can infer NDV from that.
          if (numNulls_ > 0) {
            numDistinctValues_ = 3;
          } else if (numNulls_ == 0) {
            numDistinctValues_ = 2;
          } else {
            numDistinctValues_ = -1;
          }
        }
        break;
      case TINYINT:
      case SMALLINT:
      case INT:
      case BIGINT:
      case TIMESTAMP: // Hive and Impala use LongColumnStatsData for timestamps.
        isCompatible = statsData.isSetLongStats();
        if (isCompatible) {
          LongColumnStatsData longStats = statsData.getLongStats();
          numDistinctValues_ = longStats.getNumDVs();
          numNulls_ = longStats.getNumNulls();
        }
        break;
      case DATE:
        isCompatible = statsData.isSetDateStats();
        if (isCompatible) {
          DateColumnStatsData dateStats = statsData.getDateStats();
          numDistinctValues_ = dateStats.getNumDVs();
          numNulls_ = dateStats.getNumNulls();
        }
        break;
      case FLOAT:
      case DOUBLE:
        isCompatible = statsData.isSetDoubleStats();
        if (isCompatible) {
          DoubleColumnStatsData doubleStats = statsData.getDoubleStats();
          numDistinctValues_ = doubleStats.getNumDVs();
          numNulls_ = doubleStats.getNumNulls();
        }
        break;
      case CHAR:
        // Ignore CHAR length stats, since it is fixed length internally.
        isCompatible = statsData.isSetStringStats();
        if (isCompatible) {
          StringColumnStatsData stringStats = statsData.getStringStats();
          numDistinctValues_ = stringStats.getNumDVs();
          numNulls_ = stringStats.getNumNulls();
        }
        break;
      case VARCHAR:
      case STRING:
        isCompatible = statsData.isSetStringStats();
        if (isCompatible) {
          StringColumnStatsData stringStats = statsData.getStringStats();
          numDistinctValues_ = stringStats.getNumDVs();
          numNulls_ = stringStats.getNumNulls();
          maxSize_ = stringStats.getMaxColLen();
          avgSize_ = Double.valueOf(stringStats.getAvgColLen()).floatValue();
          // Keep avgSerializedSize_ valid iff avgSize_ is valid.
          if (avgSize_ >= 0) {
            avgSerializedSize_ = avgSize_ + PrimitiveType.STRING.getSlotSize();
          } else {
            avgSerializedSize_ = -1;
          }
        }
        break;
      case BINARY:
        // The compatibility check must look at the same union field that is read
        // below; checking the string stats here would reject real binary stats and
        // would call getBinaryStats() on data that only carries string stats.
        isCompatible = statsData.isSetBinaryStats();
        if (isCompatible) {
          BinaryColumnStatsData binaryStats = statsData.getBinaryStats();
          numNulls_ = binaryStats.getNumNulls();
          maxSize_ = binaryStats.getMaxColLen();
          avgSize_ = Double.valueOf(binaryStats.getAvgColLen()).floatValue();
          // Keep avgSerializedSize_ valid iff avgSize_ is valid, as for strings.
          // Otherwise an unknown average length would fail validate() below.
          if (avgSize_ >= 0) {
            avgSerializedSize_ = avgSize_ + PrimitiveType.BINARY.getSlotSize();
          } else {
            avgSerializedSize_ = -1;
          }
        }
        break;
      case DECIMAL:
        isCompatible = statsData.isSetDecimalStats();
        if (isCompatible) {
          DecimalColumnStatsData decimalStats = statsData.getDecimalStats();
          numNulls_ = decimalStats.getNumNulls();
          numDistinctValues_ = decimalStats.getNumDVs();
        }
        break;
      default:
        Preconditions.checkState(false,
            "Unexpected column type: " + colType.toString());
        break;
    }
    validate(colType);
    return isCompatible;
  }

  /**
   * Convert the statistics back into an HMS-compatible ColumnStatisticsData object.
   * This is essentially the inverse of {@link #update(Type, ColumnStatisticsData)}
   * above.
   *
   * The NDV taken from 'colStats' is capped at 'capNdv' if 'capNdv' is non-negative,
   * and additionally at the number of values representable by the column type for
   * TINYINT, SMALLINT, INT, DATE and DECIMAL.
   *
   * Returns null if statistics for the specified type are not supported. Note that
   * this currently includes BINARY, which has no case below.
   */
  public static ColumnStatisticsData createHiveColStatsData(
      long capNdv, TColumnStats colStats, Type colType) {
    ColumnStatisticsData colStatsData = new ColumnStatisticsData();
    long ndv = colStats.getNum_distinct_values();
    // Cap NDV at row count if available.
    if (capNdv >= 0) ndv = Math.min(ndv, capNdv);

    long numNulls = colStats.getNum_nulls();
    switch(colType.getPrimitiveType()) {
      case BOOLEAN:
        // TODO(IMPALA-8205): actually compute the count of true/false
        // values.
        colStatsData.setBooleanStats(new BooleanColumnStatsData(
            /*numTrues=*/-1, /*numFalse=*/-1, numNulls));
        break;
      case TINYINT:
        ndv = Math.min(ndv, LongMath.pow(2, Byte.SIZE));
        colStatsData.setLongStats(new LongColumnStatsData(numNulls, ndv));
        break;
      case SMALLINT:
        ndv = Math.min(ndv, LongMath.pow(2, Short.SIZE));
        colStatsData.setLongStats(new LongColumnStatsData(numNulls, ndv));
        break;
      case INT:
        ndv = Math.min(ndv, LongMath.pow(2, Integer.SIZE));
        colStatsData.setLongStats(new LongColumnStatsData(numNulls, ndv));
        break;
      case DATE:
        // Number of distinct dates in the 0001-01-01..9999-12-31 inclusive range is
        // 3652059.
        ndv = Math.min(ndv, 3652059);
        colStatsData.setDateStats(new DateColumnStatsData(numNulls, ndv));
        break;
      case BIGINT:
      case TIMESTAMP: // Hive and Impala use LongColumnStatsData for timestamps.
        colStatsData.setLongStats(new LongColumnStatsData(numNulls, ndv));
        break;
      case FLOAT:
      case DOUBLE:
        colStatsData.setDoubleStats(new DoubleColumnStatsData(numNulls, ndv));
        break;
      case CHAR:
      case VARCHAR:
      case STRING:
        long maxStrLen = colStats.getMax_size();
        double avgStrLen = colStats.getAvg_size();
        colStatsData.setStringStats(
            new StringColumnStatsData(maxStrLen, avgStrLen, numNulls, ndv));
        break;
      case DECIMAL:
        // A decimal with the given precision has at most 10^precision distinct values.
        double decMaxNdv = Math.pow(10, colType.getPrecision());
        ndv = (long) Math.min(ndv, decMaxNdv);
        colStatsData.setDecimalStats(new DecimalColumnStatsData(numNulls, ndv));
        break;
      default:
        return null;
    }
    return colStatsData;
  }

  /**
   * Converts these stats into an HMS-compatible ColumnStatisticsData for 'colType'
   * without capping the NDV at a row count. Returns null under the same conditions
   * as createHiveColStatsData().
   */
  public ColumnStatisticsData toHmsCompatibleThrift(Type colType) {
    return createHiveColStatsData(-1, toThrift(), colType);
  }

  /**
   * Sets the member corresponding to the given stats key to 'value'.
   * Requires that the given value is of a type appropriate for the
   * member being set (Float for AVG_SIZE, Long for all other keys) and that it is
   * either non-negative or -1. AVG_SIZE and MAX_SIZE may only be set for types that
   * are not fixed length. Throws if any of that is not the case.
   */
  public void update(Type colType, StatsKey key, Number value) {
    Preconditions.checkNotNull(key);
    Preconditions.checkNotNull(value);
    if (key == StatsKey.AVG_SIZE) {
      Preconditions.checkArgument(value instanceof Float);
      Float floatValue = (Float) value;
      Preconditions.checkArgument(floatValue >= 0 || floatValue == -1, floatValue);
    } else {
      Preconditions.checkArgument(value instanceof Long);
      Long longValue = (Long) value;
      Preconditions.checkArgument(longValue >= 0 || longValue == -1, longValue);
    }

    switch (key) {
      case NUM_DISTINCT_VALUES: {
        numDistinctValues_ = (Long) value;
        break;
      }
      case NUM_NULLS: {
        numNulls_ = (Long) value;
        break;
      }
      case AVG_SIZE: {
        Preconditions.checkArgument(!colType.isFixedLengthType(), colType);
        avgSize_ = (Float) value;
        // Ensure avgSerializedSize_ stays in sync with avgSize_.
        if (avgSize_ >= 0) {
          avgSerializedSize_ = colType.getSlotSize() + avgSize_;
        } else {
          avgSerializedSize_ = -1;
        }
        break;
      }
      case MAX_SIZE: {
        Preconditions.checkArgument(!colType.isFixedLengthType(), colType);
        maxSize_ = (Long) value;
        break;
      }
      default: Preconditions.checkState(false);
    }
    validate(colType);
  }

  /**
   * Returns true if the given type is a scalar type whose PrimitiveType supports
   * column stats updates.
   */
  public static boolean isSupportedColType(Type colType) {
    if (!colType.isScalarType()) return false;
    ScalarType scalarType = (ScalarType) colType;
    return SUPPORTED_COL_TYPES.contains(scalarType.getPrimitiveType());
  }

  /**
   * Resets the stats via initColStats() and then overwrites them with the values
   * from the given TColumnStats. The average sizes are only taken from 'stats' for
   * types that are not fixed length and only if the given average size is valid.
   */
  public void update(Type colType, TColumnStats stats) {
    initColStats(colType);
    if (!colType.isFixedLengthType() && stats.getAvg_size() >= 0) {
      // Update size estimates based on average size. Fixed length types already include
      // size estimates.
      avgSize_ = Double.valueOf(stats.getAvg_size()).floatValue();
      avgSerializedSize_ = colType.getSlotSize() + avgSize_;
    }
    maxSize_ = stats.getMax_size();
    numDistinctValues_ = stats.getNum_distinct_values();
    numNulls_ = stats.getNum_nulls();
    validate(colType);
  }

  /**
   * Returns a new TColumnStats holding avgSize, maxSize, numDistinctValues and
   * numNulls. avgSerializedSize is not part of the result.
   */
  public TColumnStats toThrift() {
    TColumnStats colStats = new TColumnStats();
    colStats.setAvg_size(avgSize_);
    colStats.setMax_size(maxSize_);
    colStats.setNum_distinct_values(numDistinctValues_);
    colStats.setNum_nulls(numNulls_);
    return colStats;
  }

  /**
   * Check that the stats obey expected invariants.
   * 'colType' is optional, but should be passed in if it is available in the caller.
   * If a fixed-length 'colType' is given, additionally checks that all size stats are
   * equal to the type's slot size.
   */
  public void validate(Type colType) {
    // avgSize_ and avgSerializedSize_ must be set together.
    Preconditions.checkState((avgSize_ >= 0) == (avgSerializedSize_ >= 0), this);

    // Values must be either valid or -1.
    Preconditions.checkState(avgSize_ == -1 || avgSize_ >= 0, this);
    Preconditions.checkState(avgSerializedSize_ == -1 || avgSerializedSize_ >= 0, this);
    Preconditions.checkState(maxSize_ == -1 || maxSize_ >= 0, this);
    Preconditions.checkState(numDistinctValues_ == -1 || numDistinctValues_ >= 0, this);
    Preconditions.checkState(numNulls_ == -1 || numNulls_ >= 0, this);
    if (colType != null && colType.isFixedLengthType()) {
      Preconditions.checkState(avgSize_ == colType.getSlotSize(), this);
      Preconditions.checkState(avgSerializedSize_ == colType.getSlotSize(), this);
      Preconditions.checkState(maxSize_ == colType.getSlotSize(), this);
    }
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this.getClass())
        .add("avgSize_", avgSize_)
        .add("avgSerializedSize_", avgSerializedSize_)
        .add("maxSize_", maxSize_)
        .add("numDistinct_", numDistinctValues_)
        .add("numNulls_", numNulls_)
        .toString();
  }

  /**
   * Returns a copy of these stats created through the private copy constructor.
   */
  @Override
  public ColumnStats clone() { return new ColumnStats(this); }
}