PARQUET-1108: Fix Int96 comparators
Int96 must be treated as a hardware intrinsic type. Therefore, the comparator for little-endian must start from the end.
Author: Deepak Majeti <deepak.majeti@hpe.com>
Closes #399 from majetideepak/PARQUET-1108 and squashes the following commits:
352522a [Deepak Majeti] PARQUET-1108: Fix Int96 comparators
diff --git a/src/parquet/util/comparison-test.cc b/src/parquet/util/comparison-test.cc
index dc915bf..c05f934 100644
--- a/src/parquet/util/comparison-test.cc
+++ b/src/parquet/util/comparison-test.cc
@@ -137,7 +137,7 @@
TEST(Comparison, SignedInt96) {
parquet::Int96 a{{1, 41, 14}}, b{{1, 41, 42}};
parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, 14}};
- parquet::Int96 aaa{{static_cast<uint32_t>(-1), 41, 14}}, bbb{{1, 41, 42}};
+ parquet::Int96 aaa{{1, 41, static_cast<uint32_t>(-14)}}, bbb{{1, 41, 42}};
NodePtr node = PrimitiveNode::Make("SignedInt96", Repetition::REQUIRED, Type::INT96);
ColumnDescriptor descr(node, 0, 0);
@@ -151,7 +151,8 @@
TEST(Comparison, UnsignedInt96) {
parquet::Int96 a{{1, 41, 14}}, b{{1, static_cast<uint32_t>(-41), 42}};
- parquet::Int96 aa{{1, 41, 14}}, bb{{static_cast<uint32_t>(-1), 41, 14}};
+ parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast<uint32_t>(-14)}};
+ parquet::Int96 aaa, bbb;
NodePtr node = PrimitiveNode::Make("UnsignedInt96", Repetition::REQUIRED, Type::INT96);
ColumnDescriptor descr(node, 0, 0);
@@ -160,6 +161,31 @@
ASSERT_TRUE(uless(a, b));
ASSERT_TRUE(uless(aa, bb));
+
+ // INT96 Timestamp
+ aaa.value[2] = 2451545; // 2000-01-01
+ bbb.value[2] = 2451546; // 2000-01-02
+ // 12 hours + 34 minutes + 56 seconds.
+ reinterpret_cast<uint64_t*>(&aaa.value[0])[0] = 45296000000000;
+ // 12 hours + 34 minutes + 50 seconds.
+ reinterpret_cast<uint64_t*>(&bbb.value[0])[0] = 45290000000000;
+ ASSERT_TRUE(uless(aaa, bbb));
+
+ aaa.value[2] = 2451545; // 2000-01-01
+ bbb.value[2] = 2451545; // 2000-01-01
+ // 11 hours + 34 minutes + 56 seconds.
+ reinterpret_cast<uint64_t*>(&aaa.value[0])[0] = 41696000000000;
+ // 12 hours + 34 minutes + 50 seconds.
+ reinterpret_cast<uint64_t*>(&bbb.value[0])[0] = 45290000000000;
+ ASSERT_TRUE(uless(aaa, bbb));
+
+ aaa.value[2] = 2451545; // 2000-01-01
+ bbb.value[2] = 2451545; // 2000-01-01
+ // 12 hours + 34 minutes + 55 seconds.
+ reinterpret_cast<uint64_t*>(&aaa.value[0])[0] = 45295000000000;
+ // 12 hours + 34 minutes + 56 seconds.
+ reinterpret_cast<uint64_t*>(&bbb.value[0])[0] = 45296000000000;
+ ASSERT_TRUE(uless(aaa, bbb));
}
TEST(Comparison, SignedInt64) {
diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h
index 803f0da..12be7ba 100644
--- a/src/parquet/util/comparison.h
+++ b/src/parquet/util/comparison.h
@@ -48,9 +48,16 @@
CompareDefault() {}
virtual ~CompareDefault() {}
virtual bool operator()(const Int96& a, const Int96& b) {
- const int32_t* aptr = reinterpret_cast<const int32_t*>(&a.value[0]);
- const int32_t* bptr = reinterpret_cast<const int32_t*>(&b.value[0]);
- return std::lexicographical_compare(aptr, aptr + 3, bptr, bptr + 3);
+ // Only the MSB bit is by Signed comparison
+ // For little-endian, this is the last bit of Int96 type
+ const int32_t amsb = static_cast<const int32_t>(a.value[2]);
+ const int32_t bmsb = static_cast<const int32_t>(b.value[2]);
+ if (amsb != bmsb) {
+ return (amsb < bmsb);
+ } else if (a.value[1] != b.value[1]) {
+ return (a.value[1] < b.value[1]);
+ }
+ return (a.value[0] < b.value[0]);
}
};
@@ -132,9 +139,12 @@
public:
virtual ~CompareUnsignedInt96() {}
bool operator()(const Int96& a, const Int96& b) override {
- const uint32_t* aptr = reinterpret_cast<const uint32_t*>(&a.value[0]);
- const uint32_t* bptr = reinterpret_cast<const uint32_t*>(&b.value[0]);
- return std::lexicographical_compare(aptr, aptr + 3, bptr, bptr + 3);
+ if (a.value[2] != b.value[2]) {
+ return (a.value[2] < b.value[2]);
+ } else if (a.value[1] != b.value[1]) {
+ return (a.value[1] < b.value[1]);
+ }
+ return (a.value[0] < b.value[0]);
}
};