blob: 116e305e59e6c3c0b35dbd0959ffa20311449fed [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/hashing.h"
#include "arrow/util/logging.h"
namespace arrow {
namespace internal {
template <typename Integer>
static std::unordered_set<Integer> MakeDistinctIntegers(int32_t n_values) {
std::default_random_engine gen(42);
std::uniform_int_distribution<Integer> values_dist(0,
std::numeric_limits<Integer>::max());
std::unordered_set<Integer> values;
values.reserve(n_values);
while (values.size() < static_cast<uint32_t>(n_values)) {
values.insert(static_cast<Integer>(values_dist(gen)));
}
return values;
}
template <typename Integer>
static std::unordered_set<Integer> MakeSequentialIntegers(int32_t n_values) {
std::unordered_set<Integer> values;
values.reserve(n_values);
for (int32_t i = 0; i < n_values; ++i) {
values.insert(static_cast<Integer>(i));
}
DCHECK_EQ(values.size(), static_cast<uint32_t>(n_values));
return values;
}
static std::unordered_set<std::string> MakeDistinctStrings(int32_t n_values) {
std::unordered_set<std::string> values;
values.reserve(n_values);
// Generate strings between 0 and 24 bytes, with ASCII characters
std::default_random_engine gen(42);
std::uniform_int_distribution<int32_t> length_dist(0, 24);
std::uniform_int_distribution<uint32_t> char_dist('0', 'z');
while (values.size() < static_cast<uint32_t>(n_values)) {
auto length = length_dist(gen);
std::string s(length, 'X');
for (int32_t i = 0; i < length; ++i) {
s[i] = static_cast<uint8_t>(char_dist(gen));
}
values.insert(std::move(s));
}
return values;
}
template <typename T>
static void CheckScalarHashQuality(const std::unordered_set<T>& distinct_values) {
std::unordered_set<hash_t> hashes;
for (const auto v : distinct_values) {
hashes.insert(ScalarHelper<T, 0>::ComputeHash(v));
hashes.insert(ScalarHelper<T, 1>::ComputeHash(v));
}
ASSERT_GE(static_cast<double>(hashes.size()),
0.96 * static_cast<double>(2 * distinct_values.size()));
}
TEST(HashingQuality, Int64) {
#ifdef ARROW_VALGRIND
const int32_t n_values = 500;
#else
const int32_t n_values = 10000;
#endif
{
const auto values = MakeDistinctIntegers<int64_t>(n_values);
CheckScalarHashQuality<int64_t>(values);
}
{
const auto values = MakeSequentialIntegers<int64_t>(n_values);
CheckScalarHashQuality<int64_t>(values);
}
}
TEST(HashingQuality, Strings) {
#ifdef ARROW_VALGRIND
const int32_t n_values = 500;
#else
const int32_t n_values = 10000;
#endif
const auto values = MakeDistinctStrings(n_values);
std::unordered_set<hash_t> hashes;
for (const auto& v : values) {
hashes.insert(ComputeStringHash<0>(v.data(), static_cast<int64_t>(v.size())));
hashes.insert(ComputeStringHash<1>(v.data(), static_cast<int64_t>(v.size())));
}
ASSERT_GE(static_cast<double>(hashes.size()),
0.96 * static_cast<double>(2 * values.size()));
}
TEST(HashingBounds, Strings) {
std::vector<size_t> sizes({1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21});
for (const auto s : sizes) {
std::string str;
for (size_t i = 0; i < s; i++) {
str.push_back(static_cast<char>(i));
}
hash_t h = ComputeStringHash<1>(str.c_str(), str.size());
int different = 0;
for (char i = 0; i < 120; i++) {
str[str.size() - 1] = i;
if (ComputeStringHash<1>(str.c_str(), str.size()) != h) {
different++;
}
}
ASSERT_GE(different, 118);
}
}
template <typename MemoTable, typename Value>
void AssertGet(MemoTable& table, const Value& v, int32_t expected) {
ASSERT_EQ(table.Get(v), expected);
}
template <typename MemoTable, typename Value>
void AssertGetOrInsert(MemoTable& table, const Value& v, int32_t expected) {
int32_t memo_index;
ASSERT_OK(table.GetOrInsert(v, &memo_index));
ASSERT_EQ(memo_index, expected);
}
template <typename MemoTable>
void AssertGetNull(MemoTable& table, int32_t expected) {
ASSERT_EQ(table.GetNull(), expected);
}
template <typename MemoTable>
void AssertGetOrInsertNull(MemoTable& table, int32_t expected) {
ASSERT_EQ(table.GetOrInsertNull(), expected);
}
TEST(ScalarMemoTable, Int64) {
const int64_t A = 1234, B = 0, C = -98765321, D = 12345678901234LL, E = -1, F = 1,
G = 9223372036854775807LL, H = -9223372036854775807LL - 1;
ScalarMemoTable<int64_t> table(default_memory_pool(), 0);
ASSERT_EQ(table.size(), 0);
AssertGet(table, A, kKeyNotFound);
AssertGetNull(table, kKeyNotFound);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, kKeyNotFound);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsertNull(table, 5);
AssertGet(table, A, 0);
AssertGetOrInsert(table, A, 0);
AssertGet(table, E, 4);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsert(table, F, 6);
AssertGetOrInsert(table, G, 7);
AssertGetOrInsert(table, H, 8);
AssertGetOrInsert(table, G, 7);
AssertGetOrInsert(table, F, 6);
AssertGetOrInsertNull(table, 5);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, A, 0);
const int64_t size = 9;
ASSERT_EQ(table.size(), size);
{
std::vector<int64_t> values(size);
table.CopyValues(values.data());
EXPECT_THAT(values, testing::ElementsAre(A, B, C, D, E, 0, F, G, H));
}
{
const int32_t start_offset = 3;
std::vector<int64_t> values(size - start_offset);
table.CopyValues(start_offset, values.data());
EXPECT_THAT(values, testing::ElementsAre(D, E, 0, F, G, H));
}
}
TEST(ScalarMemoTable, UInt16) {
const uint16_t A = 1236, B = 0, C = 65535, D = 32767, E = 1;
ScalarMemoTable<uint16_t> table(default_memory_pool(), 0);
ASSERT_EQ(table.size(), 0);
AssertGet(table, A, kKeyNotFound);
AssertGetNull(table, kKeyNotFound);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, kKeyNotFound);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
{
EXPECT_EQ(table.size(), 4);
std::vector<uint16_t> values(table.size());
table.CopyValues(values.data());
EXPECT_THAT(values, testing::ElementsAre(A, B, C, D));
}
AssertGetOrInsertNull(table, 4);
AssertGetOrInsert(table, E, 5);
AssertGet(table, A, 0);
AssertGetOrInsert(table, A, 0);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetNull(table, 4);
AssertGet(table, E, 5);
AssertGetOrInsert(table, E, 5);
ASSERT_EQ(table.size(), 6);
std::vector<uint16_t> values(table.size());
table.CopyValues(values.data());
EXPECT_THAT(values, testing::ElementsAre(A, B, C, D, 0, E));
}
TEST(SmallScalarMemoTable, Int8) {
const int8_t A = 1, B = 0, C = -1, D = -128, E = 127;
SmallScalarMemoTable<int8_t> table(default_memory_pool(), 0);
AssertGet(table, A, kKeyNotFound);
AssertGetNull(table, kKeyNotFound);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, kKeyNotFound);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsertNull(table, 5);
AssertGet(table, A, 0);
AssertGetOrInsert(table, A, 0);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGet(table, E, 4);
AssertGetOrInsert(table, E, 4);
AssertGetNull(table, 5);
AssertGetOrInsertNull(table, 5);
ASSERT_EQ(table.size(), 6);
std::vector<int8_t> values(table.size());
table.CopyValues(values.data());
EXPECT_THAT(values, testing::ElementsAre(A, B, C, D, E, 0));
}
TEST(SmallScalarMemoTable, Bool) {
SmallScalarMemoTable<bool> table(default_memory_pool(), 0);
ASSERT_EQ(table.size(), 0);
AssertGet(table, true, kKeyNotFound);
AssertGetOrInsert(table, true, 0);
AssertGetOrInsertNull(table, 1);
AssertGetOrInsert(table, false, 2);
AssertGet(table, true, 0);
AssertGetOrInsert(table, true, 0);
AssertGetNull(table, 1);
AssertGetOrInsertNull(table, 1);
AssertGet(table, false, 2);
AssertGetOrInsert(table, false, 2);
ASSERT_EQ(table.size(), 3);
EXPECT_THAT(table.values(), testing::ElementsAre(true, 0, false));
// NOTE std::vector<bool> doesn't have a data() method
}
TEST(ScalarMemoTable, Float64) {
const double A = 0.0, B = 1.5, C = -0.0, D = std::numeric_limits<double>::infinity(),
E = -D, F = std::nan("");
ScalarMemoTable<double> table(default_memory_pool(), 0);
ASSERT_EQ(table.size(), 0);
AssertGet(table, A, kKeyNotFound);
AssertGetNull(table, kKeyNotFound);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, kKeyNotFound);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsert(table, F, 5);
AssertGet(table, A, 0);
AssertGetOrInsert(table, A, 0);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGet(table, E, 4);
AssertGetOrInsert(table, E, 4);
AssertGet(table, F, 5);
AssertGetOrInsert(table, F, 5);
ASSERT_EQ(table.size(), 6);
std::vector<double> expected({A, B, C, D, E, F});
std::vector<double> values(table.size());
table.CopyValues(values.data());
for (uint32_t i = 0; i < expected.size(); ++i) {
auto u = expected[i];
auto v = values[i];
if (std::isnan(u)) {
ASSERT_TRUE(std::isnan(v));
} else {
ASSERT_EQ(u, v);
}
}
}
TEST(ScalarMemoTable, StressInt64) {
std::default_random_engine gen(42);
std::uniform_int_distribution<int64_t> value_dist(-50, 50);
#ifdef ARROW_VALGRIND
const int32_t n_repeats = 500;
#else
const int32_t n_repeats = 10000;
#endif
ScalarMemoTable<int64_t> table(default_memory_pool(), 0);
std::unordered_map<int64_t, int32_t> map;
for (int32_t i = 0; i < n_repeats; ++i) {
int64_t value = value_dist(gen);
int32_t expected, actual;
auto it = map.find(value);
if (it == map.end()) {
expected = static_cast<int32_t>(map.size());
map[value] = expected;
} else {
expected = it->second;
}
ASSERT_OK(table.GetOrInsert(value, &actual));
ASSERT_EQ(actual, expected);
}
ASSERT_EQ(table.size(), map.size());
}
TEST(BinaryMemoTable, Basics) {
std::string A = "", B = "a", C = "foo", D = "bar", E, F;
E += '\0';
F += '\0';
F += "trailing";
BinaryMemoTable<BinaryBuilder> table(default_memory_pool(), 0);
ASSERT_EQ(table.size(), 0);
AssertGet(table, A, kKeyNotFound);
AssertGetNull(table, kKeyNotFound);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, kKeyNotFound);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, E, 4);
AssertGetOrInsert(table, F, 5);
AssertGetOrInsertNull(table, 6);
AssertGet(table, A, 0);
AssertGetOrInsert(table, A, 0);
AssertGet(table, B, 1);
AssertGetOrInsert(table, B, 1);
AssertGetOrInsert(table, C, 2);
AssertGetOrInsert(table, D, 3);
AssertGetOrInsert(table, E, 4);
AssertGet(table, F, 5);
AssertGetOrInsert(table, F, 5);
AssertGetNull(table, 6);
AssertGetOrInsertNull(table, 6);
ASSERT_EQ(table.size(), 7);
ASSERT_EQ(table.values_size(), 17);
const int32_t size = table.size();
{
std::vector<int8_t> offsets(size + 1);
table.CopyOffsets(offsets.data());
EXPECT_THAT(offsets, testing::ElementsAre(0, 0, 1, 4, 7, 8, 17, 17));
std::string expected_values;
expected_values += "afoobar";
expected_values += '\0';
expected_values += '\0';
expected_values += "trailing";
std::string values(17, 'X');
table.CopyValues(reinterpret_cast<uint8_t*>(&values[0]));
ASSERT_EQ(values, expected_values);
}
{
const int32_t start_offset = 4;
std::vector<int8_t> offsets(size + 1 - start_offset);
table.CopyOffsets(start_offset, offsets.data());
EXPECT_THAT(offsets, testing::ElementsAre(0, 1, 10, 10));
std::string expected_values;
expected_values += '\0';
expected_values += '\0';
expected_values += "trailing";
std::string values(10, 'X');
table.CopyValues(4 /* start offset */, reinterpret_cast<uint8_t*>(&values[0]));
ASSERT_EQ(values, expected_values);
}
{
const int32_t start_offset = 1;
std::vector<std::string> actual;
table.VisitValues(start_offset, [&](const util::string_view& v) {
actual.emplace_back(v.data(), v.length());
});
EXPECT_THAT(actual, testing::ElementsAre(B, C, D, E, F, ""));
}
}
TEST(BinaryMemoTable, Stress) {
#ifdef ARROW_VALGRIND
const int32_t n_values = 20;
const int32_t n_repeats = 20;
#else
const int32_t n_values = 100;
const int32_t n_repeats = 100;
#endif
const auto values = MakeDistinctStrings(n_values);
BinaryMemoTable<BinaryBuilder> table(default_memory_pool(), 0);
std::unordered_map<std::string, int32_t> map;
for (int32_t i = 0; i < n_repeats; ++i) {
for (const auto& value : values) {
int32_t expected, actual;
auto it = map.find(value);
if (it == map.end()) {
expected = static_cast<int32_t>(map.size());
map[value] = expected;
} else {
expected = it->second;
}
ASSERT_OK(table.GetOrInsert(value, &actual));
ASSERT_EQ(actual, expected);
}
}
ASSERT_EQ(table.size(), map.size());
}
TEST(BinaryMemoTable, Empty) {
BinaryMemoTable<BinaryBuilder> table(default_memory_pool());
ASSERT_EQ(table.size(), 0);
BinaryMemoTable<BinaryBuilder>::builder_offset_type offsets[1];
table.CopyOffsets(0, offsets);
EXPECT_EQ(offsets[0], 0);
}
} // namespace internal
} // namespace arrow