IMPALA-6660: Change -0/+0 floating point to compare as equal in hash table Currently -0.0/+0.0 values are hashed to different values due to their different binary representation, while -0.0==+0.0 is true in C++. This caused them to be distinct values in hash maps despite being treated as equal in comparisons. This commit fixes the hashing of -0.0/+0.0, thus changing the behaviour of hash joins and aggregations (since aggregations follow the behaviour of the join). That way, the canonical form for -0/+0 is changed to +0. Tests: - Added e2e tests for aggregation (group by and distinct) and join queries with -0.0 and +0.0 present. Change-Id: I6bb1a817c81c452d041238c19cb6c9f602a5d565 Reviewed-on: http://gerrit.cloudera.org:8080/14588 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>

commit: bf031a2142db67d2c8741059d2a3ee551f02e4a4 [log] [tgz]
author: norbert.luksa <norbert.luksa@cloudera.com> Wed Oct 30 16:30:25 2019 +0100
committer: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tue Nov 26 19:14:24 2019 +0000
tree: 0e65710bc1c346dc466381a757c3057993351e59
parent: 7878203b4a1d0aecde431f1c0d578e6b80d7ead6 [diff]
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index 4bf4ccf..c702315 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc

@@ -482,10 +482,18 @@
   value_ = builder_->CreateInsertValue(value_, v, 0, name_);
 }
 
+llvm::Value* CodegenAnyVal::ConvertToPositiveZero(llvm::Value* val) {
+  // Replaces negative zero with positive, leaves everything else unchanged.
+  llvm::Value* is_negative_zero = builder_->CreateFCmpOEQ(
+      val, llvm::ConstantFP::getNegativeZero(val->getType()), "cmp_zero");
+  return builder_->CreateSelect(is_negative_zero,
+                llvm::ConstantFP::get(val->getType(), 0.0), val);
+}
+
 void CodegenAnyVal::ConvertToCanonicalForm() {
   // Convert the value to a bit pattern that is unambiguous.
   // Specifically, for floating point type values, NaN values are converted to
-  // the same bit pattern.
+  // the same bit pattern, and -0 is converted to +0.
   switch(type_.type) {
     case TYPE_FLOAT:
     case TYPE_DOUBLE: {
@@ -497,7 +505,8 @@
         canonical_val = llvm::ConstantFP::getNaN(codegen_->double_type());
       }
       llvm::Value* is_nan = builder_->CreateFCmpUNO(raw, raw, "cmp_nan");
-      SetVal(builder_->CreateSelect(is_nan, canonical_val, raw));
+
+      SetVal(builder_->CreateSelect(is_nan, canonical_val, ConvertToPositiveZero(raw)));
       break;
     }
     default:
@@ -774,6 +783,7 @@
           local_val, "local_val_is_nan");
       llvm::Value* val_is_nan = builder_->CreateFCmpUNO(val, val, "val_is_nan");
       llvm::Value* both_nan = builder_->CreateAnd(local_is_nan, val_is_nan);
+
       return builder_->CreateOr(cmp_raw, both_nan, "cmp_raw_with_nan");
     }
     case TYPE_STRING:

diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index bc271fe..91b5594 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h

@@ -236,6 +236,7 @@
   /// Therefore all NaN values need to be converted into a consistent
   /// form where all bits are the same.  This method will do that -
   /// ensure that all NaN values have the same bit pattern.
+  /// Similarly, -0 == +0 is handled here.
   ///
   /// Generically speaking, a canonical form of a value ensures that
   /// all ambiguity is removed from a value's bit settings -- if there
@@ -244,6 +245,10 @@
   /// float and double values.)
   void ConvertToCanonicalForm();
 
+  /// Replaces negative floating point zero with positive zero,
+  /// leaves everything else unchanged.
+  llvm::Value* ConvertToPositiveZero(llvm::Value* val);
+
   /// Returns the i1 result of this == other. this and other must be non-null.
   llvm::Value* Eq(CodegenAnyVal* other);
 

diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index 6ea5fdf..5bd2b91 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc

@@ -197,9 +197,7 @@
     }
     const ColumnType& expr_type = build_exprs_[i]->type();
     DCHECK_LE(expr_type.GetSlotSize(), sizeof(NULL_VALUE));
-    if (RawValue::IsNaN(val, expr_type)) {
-      val = RawValue::CanonicalNaNValue(expr_type);
-    }
+    val = RawValue::CanonicalValue(val, expr_type);
     RawValue::Write(val, loc, expr_type, NULL);
   }
   return has_null;
@@ -1165,8 +1163,7 @@
       llvm::Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(
           NULL, expr_values_null, codegen->GetI32Constant(i), "null_byte_loc");
       llvm::Value* null_byte = builder.CreateLoad(llvm_null_byte_loc);
-      row_is_null =
-          builder.CreateICmpNE(null_byte, codegen->GetI8Constant(0));
+      row_is_null = builder.CreateICmpNE(null_byte, codegen->GetI8Constant(0));
     }
     if (inclusive_equality) result.ConvertToCanonicalForm();
 

diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index 82eb879..27c5f53 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc

@@ -29,9 +29,11 @@
 
 namespace impala {
 
-const int RawValue::ASCII_PRECISION = 16; // print 16 digits for double/float
-const double RawValue::CANONICAL_DOUBLE_NAN = nan("");
-const float RawValue::CANONICAL_FLOAT_NAN = nanf("");
+const int RawValue::ASCII_PRECISION;
+constexpr double RawValue::CANONICAL_DOUBLE_NAN;
+constexpr float RawValue::CANONICAL_FLOAT_NAN;
+constexpr double RawValue::CANONICAL_DOUBLE_ZERO;
+constexpr float RawValue::CANONICAL_FLOAT_ZERO;
 
 void RawValue::PrintValueAsBytes(const void* value, const ColumnType& type,
                                  stringstream* stream) {

diff --git a/be/src/runtime/raw-value.h b/be/src/runtime/raw-value.h
index 7bf52a5..f94d122 100644
--- a/be/src/runtime/raw-value.h
+++ b/be/src/runtime/raw-value.h

@@ -34,14 +34,17 @@
 /// Useful utility functions for runtime values (which are passed around as void*).
 class RawValue {
  public:
-  /// Ascii output precision for double/float
-  static const int ASCII_PRECISION;
+  /// Ascii output precision for double/float, print 16 digits.
+  static const int ASCII_PRECISION = 16;
 
   /// Single NaN values to ensure all NaN values can be assigned one bit pattern
   /// that will always compare and hash the same way.  Allows for all NaN values
   /// to be put into the same "group by" bucket.
-  static const double CANONICAL_DOUBLE_NAN;
-  static const float CANONICAL_FLOAT_NAN;
+  static constexpr double CANONICAL_DOUBLE_NAN = std::numeric_limits<double>::quiet_NaN();
+  static constexpr float CANONICAL_FLOAT_NAN = std::numeric_limits<float>::quiet_NaN();
+  /// The canonical zero values when comparing negative and positive zeros.
+  static constexpr double CANONICAL_DOUBLE_ZERO = 0.0;
+  static constexpr float CANONICAL_FLOAT_ZERO = 0.0f;
 
   /// Convert 'value' into ascii and write to 'stream'. NULL turns into "NULL". 'scale'
   /// determines how many digits after the decimal are printed for floating point numbers,
@@ -107,9 +110,19 @@
   /// Returns true if val/type correspond to a NaN floating point value.
   static inline bool IsNaN(const void* val, const ColumnType& type);
 
+  /// Returns true if val/type correspond to a +0/-0 floating point value.
+  static inline bool IsFloatingZero(const void* val, const ColumnType& type);
+
+  /// Returns the canonical form of the given value. Currently this means a unified NaN
+  /// value in case of NaN and +0 in case of +0/-0.
+  static inline const void* CanonicalValue(const void* val, const ColumnType& type);
+
   /// Returns a canonical NaN value for a floating point type
   /// (which will always have the same bit-pattern to maintain consistency in hashing).
   static inline const void* CanonicalNaNValue(const ColumnType& type);
+
+  // Returns positive zero for floating point types.
+  static inline const void* PositiveFloatingZero(const ColumnType& type);
 };
 
 }

diff --git a/be/src/runtime/raw-value.inline.h b/be/src/runtime/raw-value.inline.h
index 116f21e..a9fc3b3 100644
--- a/be/src/runtime/raw-value.inline.h
+++ b/be/src/runtime/raw-value.inline.h

@@ -51,6 +51,23 @@
   }
 }
 
+inline bool RawValue::IsFloatingZero(const void* val, const ColumnType& type) {
+  switch(type.type) {
+  case TYPE_FLOAT:
+    return *reinterpret_cast<const float*>(val) == CANONICAL_FLOAT_ZERO;
+  case TYPE_DOUBLE:
+    return *reinterpret_cast<const double*>(val) == CANONICAL_DOUBLE_ZERO;
+  default:
+    return false;
+  }
+}
+
+inline const void* RawValue::CanonicalValue(const void* val, const ColumnType& type) {
+  if (RawValue::IsNaN(val, type)) return RawValue::CanonicalNaNValue(type);
+  if (RawValue::IsFloatingZero(val, type)) return RawValue::PositiveFloatingZero(type);
+  return val;
+}
+
 inline const void* RawValue::CanonicalNaNValue(const ColumnType& type) {
   switch(type.type) {
   case TYPE_FLOAT:
@@ -63,6 +80,18 @@
   }
 }
 
+inline const void* RawValue::PositiveFloatingZero(const ColumnType& type) {
+  switch(type.type) {
+  case TYPE_FLOAT:
+    return &CANONICAL_FLOAT_ZERO;
+  case TYPE_DOUBLE:
+    return &CANONICAL_DOUBLE_ZERO;
+  default:
+    DCHECK(false);
+    return nullptr;
+  }
+}
+
 inline bool RawValue::Eq(const void* v1, const void* v2, const ColumnType& type) {
   const StringValue* string_value1;
   const StringValue* string_value2;

diff --git a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index 315c3ec..f0f3112 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test

@@ -1402,6 +1402,25 @@
 bigint, float, float
 ====
 ---- QUERY
+# IMPALA-6660: GROUP BY of -0/+0 values aggregates zeros as one grouping
+select x, count(*)
+from (values(cast("-0" as float) x), (cast("0" as float))) v
+group by x
+---- RESULTS
+0,2
+---- TYPES
+float, bigint
+====
+---- QUERY
+# IMPALA-6660: -0/+0 values are not distinct
+select distinct *
+from (values(cast("-0" as float)), (cast("0" as float))) v;
+---- RESULTS
+0
+---- TYPES
+float
+====
+---- QUERY
 # IMPALA-8140: Test that group by with limit does not crash on Asan
 select count(*) from tpch_parquet.orders o group by o.o_clerk limit 10
 # We don't validate the results since the order is not deterministic.

diff --git a/testdata/workloads/functional-query/queries/QueryTest/joins.test b/testdata/workloads/functional-query/queries/QueryTest/joins.test
index 4865b27..a5267aa 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/joins.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/joins.test

@@ -831,3 +831,14 @@
 ---- TYPES
 FLOAT
 ====
+---- QUERY
+# IMPALA-6660: Test that +0 equals -0 when joining
+select * from
+  (select cast("-0" as float) c1) v1,
+  (select cast("+0" as float) c2) v2
+where v1.c1 = v2.c2;
+---- RESULTS
+-0, 0
+---- TYPES
+FLOAT,FLOAT
+====
commit	bf031a2142db67d2c8741059d2a3ee551f02e4a4	[log] [tgz]
author	norbert.luksa <norbert.luksa@cloudera.com>	Wed Oct 30 16:30:25 2019 +0100
committer	Impala Public Jenkins <impala-public-jenkins@cloudera.com>	Tue Nov 26 19:14:24 2019 +0000
tree	0e65710bc1c346dc466381a757c3057993351e59
parent	7878203b4a1d0aecde431f1c0d578e6b80d7ead6 [diff]