IMPALA-6660: Change -0/+0 floating point to compare as equal in hash table
Currently -0.0/+0.0 values are hashed to different values due to
their different binary representation, while -0.0==+0.0 is true in
C++. This caused them to be distinct values in hash maps despite
being treated as equal in comparisons.
This commit fixes the hashing of -0.0/+0.0, thus changing the
behaviour of hash joins and aggregations (since aggregations
follow the behaviour of the join). That way, the canonical form for
-0/+0 is changed to +0.
Tests:
- Added e2e tests for aggregation (group by and distinct) and
join queries with -0.0 and +0.0 present.
Change-Id: I6bb1a817c81c452d041238c19cb6c9f602a5d565
Reviewed-on: http://gerrit.cloudera.org:8080/14588
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index 4bf4ccf..c702315 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -482,10 +482,18 @@
value_ = builder_->CreateInsertValue(value_, v, 0, name_);
}
+llvm::Value* CodegenAnyVal::ConvertToPositiveZero(llvm::Value* val) {
+ // Replaces negative zero with positive, leaves everything else unchanged.
+ llvm::Value* is_negative_zero = builder_->CreateFCmpOEQ(
+ val, llvm::ConstantFP::getNegativeZero(val->getType()), "cmp_zero");
+ return builder_->CreateSelect(is_negative_zero,
+ llvm::ConstantFP::get(val->getType(), 0.0), val);
+}
+
void CodegenAnyVal::ConvertToCanonicalForm() {
// Convert the value to a bit pattern that is unambiguous.
// Specifically, for floating point type values, NaN values are converted to
- // the same bit pattern.
+ // the same bit pattern, and -0 is converted to +0.
switch(type_.type) {
case TYPE_FLOAT:
case TYPE_DOUBLE: {
@@ -497,7 +505,8 @@
canonical_val = llvm::ConstantFP::getNaN(codegen_->double_type());
}
llvm::Value* is_nan = builder_->CreateFCmpUNO(raw, raw, "cmp_nan");
- SetVal(builder_->CreateSelect(is_nan, canonical_val, raw));
+
+ SetVal(builder_->CreateSelect(is_nan, canonical_val, ConvertToPositiveZero(raw)));
break;
}
default:
@@ -774,6 +783,7 @@
local_val, "local_val_is_nan");
llvm::Value* val_is_nan = builder_->CreateFCmpUNO(val, val, "val_is_nan");
llvm::Value* both_nan = builder_->CreateAnd(local_is_nan, val_is_nan);
+
return builder_->CreateOr(cmp_raw, both_nan, "cmp_raw_with_nan");
}
case TYPE_STRING:
diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index bc271fe..91b5594 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h
@@ -236,6 +236,7 @@
/// Therefore all NaN values need to be converted into a consistent
/// form where all bits are the same. This method will do that -
/// ensure that all NaN values have the same bit pattern.
+ /// Similarly, -0 == +0 is handled here.
///
/// Generically speaking, a canonical form of a value ensures that
/// all ambiguity is removed from a value's bit settings -- if there
@@ -244,6 +245,10 @@
/// float and double values.)
void ConvertToCanonicalForm();
+ /// Replaces negative floating point zero with positive zero,
+ /// leaves everything else unchanged.
+ llvm::Value* ConvertToPositiveZero(llvm::Value* val);
+
/// Returns the i1 result of this == other. this and other must be non-null.
llvm::Value* Eq(CodegenAnyVal* other);
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index 6ea5fdf..5bd2b91 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -197,9 +197,7 @@
}
const ColumnType& expr_type = build_exprs_[i]->type();
DCHECK_LE(expr_type.GetSlotSize(), sizeof(NULL_VALUE));
- if (RawValue::IsNaN(val, expr_type)) {
- val = RawValue::CanonicalNaNValue(expr_type);
- }
+ val = RawValue::CanonicalValue(val, expr_type);
RawValue::Write(val, loc, expr_type, NULL);
}
return has_null;
@@ -1165,8 +1163,7 @@
llvm::Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(
NULL, expr_values_null, codegen->GetI32Constant(i), "null_byte_loc");
llvm::Value* null_byte = builder.CreateLoad(llvm_null_byte_loc);
- row_is_null =
- builder.CreateICmpNE(null_byte, codegen->GetI8Constant(0));
+ row_is_null = builder.CreateICmpNE(null_byte, codegen->GetI8Constant(0));
}
if (inclusive_equality) result.ConvertToCanonicalForm();
diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index 82eb879..27c5f53 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc
@@ -29,9 +29,11 @@
namespace impala {
-const int RawValue::ASCII_PRECISION = 16; // print 16 digits for double/float
-const double RawValue::CANONICAL_DOUBLE_NAN = nan("");
-const float RawValue::CANONICAL_FLOAT_NAN = nanf("");
+const int RawValue::ASCII_PRECISION;
+constexpr double RawValue::CANONICAL_DOUBLE_NAN;
+constexpr float RawValue::CANONICAL_FLOAT_NAN;
+constexpr double RawValue::CANONICAL_DOUBLE_ZERO;
+constexpr float RawValue::CANONICAL_FLOAT_ZERO;
void RawValue::PrintValueAsBytes(const void* value, const ColumnType& type,
stringstream* stream) {
diff --git a/be/src/runtime/raw-value.h b/be/src/runtime/raw-value.h
index 7bf52a5..f94d122 100644
--- a/be/src/runtime/raw-value.h
+++ b/be/src/runtime/raw-value.h
@@ -34,14 +34,17 @@
/// Useful utility functions for runtime values (which are passed around as void*).
class RawValue {
public:
- /// Ascii output precision for double/float
- static const int ASCII_PRECISION;
+ /// Ascii output precision for double/float, print 16 digits.
+ static const int ASCII_PRECISION = 16;
/// Single NaN values to ensure all NaN values can be assigned one bit pattern
/// that will always compare and hash the same way. Allows for all NaN values
/// to be put into the same "group by" bucket.
- static const double CANONICAL_DOUBLE_NAN;
- static const float CANONICAL_FLOAT_NAN;
+ static constexpr double CANONICAL_DOUBLE_NAN = std::numeric_limits<double>::quiet_NaN();
+ static constexpr float CANONICAL_FLOAT_NAN = std::numeric_limits<float>::quiet_NaN();
+ /// The canonical zero values when comparing negative and positive zeros.
+ static constexpr double CANONICAL_DOUBLE_ZERO = 0.0;
+ static constexpr float CANONICAL_FLOAT_ZERO = 0.0f;
/// Convert 'value' into ascii and write to 'stream'. NULL turns into "NULL". 'scale'
/// determines how many digits after the decimal are printed for floating point numbers,
@@ -107,9 +110,19 @@
/// Returns true if val/type correspond to a NaN floating point value.
static inline bool IsNaN(const void* val, const ColumnType& type);
+ /// Returns true if val/type correspond to a +0/-0 floating point value.
+ static inline bool IsFloatingZero(const void* val, const ColumnType& type);
+
+ /// Returns the canonical form of the given value. Currently this means a unified NaN
+ /// value in case of NaN and +0 in case of +0/-0.
+ static inline const void* CanonicalValue(const void* val, const ColumnType& type);
+
/// Returns a canonical NaN value for a floating point type
/// (which will always have the same bit-pattern to maintain consistency in hashing).
static inline const void* CanonicalNaNValue(const ColumnType& type);
+
+ // Returns positive zero for floating point types.
+ static inline const void* PositiveFloatingZero(const ColumnType& type);
};
}
diff --git a/be/src/runtime/raw-value.inline.h b/be/src/runtime/raw-value.inline.h
index 116f21e..a9fc3b3 100644
--- a/be/src/runtime/raw-value.inline.h
+++ b/be/src/runtime/raw-value.inline.h
@@ -51,6 +51,23 @@
}
}
+inline bool RawValue::IsFloatingZero(const void* val, const ColumnType& type) {
+ switch(type.type) {
+ case TYPE_FLOAT:
+ return *reinterpret_cast<const float*>(val) == CANONICAL_FLOAT_ZERO;
+ case TYPE_DOUBLE:
+ return *reinterpret_cast<const double*>(val) == CANONICAL_DOUBLE_ZERO;
+ default:
+ return false;
+ }
+}
+
+inline const void* RawValue::CanonicalValue(const void* val, const ColumnType& type) {
+ if (RawValue::IsNaN(val, type)) return RawValue::CanonicalNaNValue(type);
+ if (RawValue::IsFloatingZero(val, type)) return RawValue::PositiveFloatingZero(type);
+ return val;
+}
+
inline const void* RawValue::CanonicalNaNValue(const ColumnType& type) {
switch(type.type) {
case TYPE_FLOAT:
@@ -63,6 +80,18 @@
}
}
+inline const void* RawValue::PositiveFloatingZero(const ColumnType& type) {
+ switch(type.type) {
+ case TYPE_FLOAT:
+ return &CANONICAL_FLOAT_ZERO;
+ case TYPE_DOUBLE:
+ return &CANONICAL_DOUBLE_ZERO;
+ default:
+ DCHECK(false);
+ return nullptr;
+ }
+}
+
inline bool RawValue::Eq(const void* v1, const void* v2, const ColumnType& type) {
const StringValue* string_value1;
const StringValue* string_value2;
diff --git a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index 315c3ec..f0f3112 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
@@ -1402,6 +1402,25 @@
bigint, float, float
====
---- QUERY
+# IMPALA-6660: GROUP BY of -0/+0 values aggregates zeros as one grouping
+select x, count(*)
+from (values(cast("-0" as float) x), (cast("0" as float))) v
+group by x
+---- RESULTS
+0,2
+---- TYPES
+float, bigint
+====
+---- QUERY
+# IMPALA-6660: -0/+0 values are not distinct
+select distinct *
+from (values(cast("-0" as float)), (cast("0" as float))) v;
+---- RESULTS
+0
+---- TYPES
+float
+====
+---- QUERY
# IMPALA-8140: Test that group by with limit does not crash on Asan
select count(*) from tpch_parquet.orders o group by o.o_clerk limit 10
# We don't validate the results since the order is not deterministic.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/joins.test b/testdata/workloads/functional-query/queries/QueryTest/joins.test
index 4865b27..a5267aa 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/joins.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/joins.test
@@ -831,3 +831,14 @@
---- TYPES
FLOAT
====
+---- QUERY
+# IMPALA-6660: Test that +0 equals -0 when joining
+select * from
+ (select cast("-0" as float) c1) v1,
+ (select cast("+0" as float) c2) v2
+where v1.c1 = v2.c2;
+---- RESULTS
+-0, 0
+---- TYPES
+FLOAT,FLOAT
+====