blob: a3fa9314e604f0dab15e1f82cc24ad7b8899504d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <locale>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include <gtest/gtest.h>
#include "arrow/array.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/buffer.h"
#include "arrow/chunked_array.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_common.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/compute/api.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/ipc/json_simple.h"
namespace arrow {
using internal::checked_cast;
namespace compute {
// ----------------------------------------------------------------------
// Dictionary tests
template <typename T>
void CheckUnique(const std::shared_ptr<T>& input,
const std::shared_ptr<Array>& expected) {
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, Unique(input));
ASSERT_OK(result->ValidateFull());
// TODO: We probably shouldn't rely on array ordering.
ASSERT_ARRAYS_EQUAL(*expected, *result);
}
// Convenience overload: builds the input and the expected arrays from plain
// value/validity vectors, then delegates to the array-based CheckUnique().
template <typename Type, typename T>
void CheckUnique(const std::shared_ptr<DataType>& type, const std::vector<T>& in_values,
                 const std::vector<bool>& in_is_valid, const std::vector<T>& out_values,
                 const std::vector<bool>& out_is_valid) {
  std::shared_ptr<Array> actual_input =
      _MakeArray<Type, T>(type, in_values, in_is_valid);
  std::shared_ptr<Array> expected_uniques =
      _MakeArray<Type, T>(type, out_values, out_is_valid);

  CheckUnique(actual_input, expected_uniques);
}
// Check that ValueCounts() accepts a 0-length array with null buffers.
//
// The input ArrayData is built by hand with two unset (null) buffer pointers
// and length 0; the result must validate and contain empty "values" and
// "counts" fields.
void CheckValueCountsNull(const std::shared_ptr<DataType>& type) {
  std::vector<std::shared_ptr<Buffer>> data_buffers(2);
  Datum input;
  input.value =
      ArrayData::Make(type, 0 /* length */, std::move(data_buffers), 0 /* null_count */);

  std::shared_ptr<Array> ex_values = ArrayFromJSON(type, "[]");
  std::shared_ptr<Array> ex_counts = ArrayFromJSON(int64(), "[]");

  ASSERT_OK_AND_ASSIGN(auto result_struct, ValueCounts(input));
  ASSERT_OK(result_struct->ValidateFull());

  // Look both fields up once and make sure they exist before dereferencing
  // them, so that a missing field fails the test instead of crashing it.
  const auto values_field = result_struct->GetFieldByName(kValuesFieldName);
  const auto counts_field = result_struct->GetFieldByName(kCountsFieldName);
  ASSERT_NE(values_field, nullptr);
  ASSERT_NE(counts_field, nullptr);

  // TODO: We probably shouldn't rely on value ordering.
  ASSERT_ARRAYS_EQUAL(*ex_values, *values_field);
  ASSERT_ARRAYS_EQUAL(*ex_counts, *counts_field);
}
// Runs ValueCounts() on `input`, fully validates the result and compares its
// two fields against `expected_values` and `expected_counts` (order matters).
template <typename T>
void CheckValueCounts(const std::shared_ptr<T>& input,
                      const std::shared_ptr<Array>& expected_values,
                      const std::shared_ptr<Array>& expected_counts) {
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, ValueCounts(input));
  ASSERT_OK(result->ValidateFull());

  auto result_struct = std::dynamic_pointer_cast<StructArray>(result);
  // The cast yields nullptr if the result is not a StructArray; fail cleanly
  // rather than dereferencing a null pointer below.
  ASSERT_NE(result_struct, nullptr);
  ASSERT_EQ(result_struct->num_fields(), 2);

  // TODO: We probably shouldn't rely on value ordering.
  ASSERT_ARRAYS_EQUAL(*expected_values, *result_struct->field(kValuesFieldIndex));
  ASSERT_ARRAYS_EQUAL(*expected_counts, *result_struct->field(kCountsFieldIndex));
}
// Convenience overload: builds the input, expected values and expected counts
// from plain vectors, then delegates to the array-based CheckValueCounts().
// The counts array gets one "valid" flag per entry of `out_is_valid`.
template <typename Type, typename T>
void CheckValueCounts(const std::shared_ptr<DataType>& type,
                      const std::vector<T>& in_values,
                      const std::vector<bool>& in_is_valid,
                      const std::vector<T>& out_values,
                      const std::vector<bool>& out_is_valid,
                      const std::vector<int64_t>& out_counts) {
  std::shared_ptr<Array> actual_input =
      _MakeArray<Type, T>(type, in_values, in_is_valid);
  std::shared_ptr<Array> expected_values =
      _MakeArray<Type, T>(type, out_values, out_is_valid);

  // Counts are never null: mark every slot as valid.
  std::vector<bool> counts_validity(out_is_valid.size(), true);
  std::shared_ptr<Array> expected_counts =
      _MakeArray<Int64Type, int64_t>(int64(), out_counts, counts_validity);

  CheckValueCounts(actual_input, expected_values, expected_counts);
}
void CheckDictEncode(const std::shared_ptr<Array>& input,
const std::shared_ptr<Array>& expected_values,
const std::shared_ptr<Array>& expected_indices) {
auto type = dictionary(expected_indices->type(), expected_values->type());
DictionaryArray expected(type, expected_indices, expected_values);
ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input));
std::shared_ptr<Array> result = MakeArray(datum_out.array());
ASSERT_OK(result->ValidateFull());
ASSERT_ARRAYS_EQUAL(expected, *result);
}
// Convenience overload: builds the input, the expected dictionary and the
// expected int32 indices from plain vectors, then delegates to the array-based
// CheckDictEncode(). The expected indices reuse the validity of the input.
template <typename Type, typename T>
void CheckDictEncode(const std::shared_ptr<DataType>& type,
                     const std::vector<T>& in_values,
                     const std::vector<bool>& in_is_valid,
                     const std::vector<T>& out_values,
                     const std::vector<bool>& out_is_valid,
                     const std::vector<int32_t>& out_indices) {
  std::shared_ptr<Array> actual_input =
      _MakeArray<Type, T>(type, in_values, in_is_valid);
  std::shared_ptr<Array> expected_dictionary =
      _MakeArray<Type, T>(type, out_values, out_is_valid);
  std::shared_ptr<Array> expected_indices =
      _MakeArray<Int32Type, int32_t>(int32(), out_indices, in_is_valid);

  CheckDictEncode(actual_input, expected_dictionary, expected_indices);
}
// Empty fixture used by the TEST_F hash kernel tests in this file; it is also
// the base class of TestHashKernelBinaryTypes.
class TestHashKernel : public ::testing::Test {};
// Empty typed fixture; `Type` is the Arrow type under test, supplied by the
// type list passed to TYPED_TEST_SUITE.
template <typename Type>
class TestHashKernelPrimitive : public ::testing::Test {};
// Arrow types for which the TestHashKernelPrimitive typed tests are
// instantiated: integers, floating point and the two date types.
using PrimitiveDictionaries =
    ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type, UInt32Type,
                     Int64Type, UInt64Type, FloatType, DoubleType, Date32Type,
                     Date64Type>;

TYPED_TEST_SUITE(TestHashKernelPrimitive, PrimitiveDictionaries);
// Unique() on primitive arrays: nulls are reported as one null entry, in
// first-appearance order with the other values.
TYPED_TEST(TestHashKernelPrimitive, Unique) {
  using CType = typename TypeParam::c_type;
  auto value_type = TypeTraits<TypeParam>::type_singleton();

  {
    // Null in the second slot; the value stored under a null is a placeholder.
    const std::vector<CType> in_values = {2, 1, 2, 1};
    const std::vector<bool> in_valid = {true, false, true, true};
    const std::vector<CType> out_values = {2, 0, 1};
    const std::vector<bool> out_valid = {1, 0, 1};
    CheckUnique<TypeParam, CType>(value_type, in_values, in_valid, out_values,
                                  out_valid);
  }
  {
    // Input starts with two nulls.
    const std::vector<CType> in_values = {2, 1, 3, 1};
    const std::vector<bool> in_valid = {false, false, true, true};
    const std::vector<CType> out_values = {0, 3, 1};
    const std::vector<bool> out_valid = {0, 1, 1};
    CheckUnique<TypeParam, CType>(value_type, in_values, in_valid, out_values,
                                  out_valid);
  }

  // Sliced
  auto sliced_input = ArrayFromJSON(value_type, "[1, 2, null, 3, 2, null]")->Slice(1, 4);
  auto sliced_expected = ArrayFromJSON(value_type, "[2, null, 3]");
  CheckUnique(sliced_input, sliced_expected);
}
// ValueCounts() on primitive arrays: populated input, empty input, an input
// with null buffers, and a sliced input.
TYPED_TEST(TestHashKernelPrimitive, ValueCounts) {
  using CType = typename TypeParam::c_type;
  auto value_type = TypeTraits<TypeParam>::type_singleton();

  {
    // Two nulls (slots 1 and 6) are counted together as one null entry.
    const std::vector<CType> in_values = {2, 1, 2, 1, 2, 3, 4};
    const std::vector<bool> in_valid = {true, false, true, true, true, true, false};
    const std::vector<CType> out_values = {2, 0, 1, 3};
    const std::vector<bool> out_valid = {1, 0, 1, 1};
    const std::vector<int64_t> out_counts = {3, 2, 1, 1};
    CheckValueCounts<TypeParam, CType>(value_type, in_values, in_valid, out_values,
                                       out_valid, out_counts);
  }
  {
    // Completely empty input.
    const std::vector<CType> no_values;
    const std::vector<bool> no_validity;
    const std::vector<int64_t> no_counts;
    CheckValueCounts<TypeParam, CType>(value_type, no_values, no_validity, no_values,
                                       no_validity, no_counts);
  }
  CheckValueCountsNull(value_type);

  // Sliced
  auto sliced_input = ArrayFromJSON(value_type, "[1, 2, null, 3, 2, null]")->Slice(1, 4);
  auto sliced_values = ArrayFromJSON(value_type, "[2, null, 3]");
  auto sliced_counts = ArrayFromJSON(int64(), "[2, 1, 1]");
  CheckValueCounts(sliced_input, sliced_values, sliced_counts);
}
// DictionaryEncode() on primitive arrays, both plain and sliced.
TYPED_TEST(TestHashKernelPrimitive, DictEncode) {
  using CType = typename TypeParam::c_type;
  auto value_type = TypeTraits<TypeParam>::type_singleton();

  {
    // Slot 1 is null: it stays null in the indices (validity is shared with
    // the input) and does not contribute a dictionary entry.
    const std::vector<CType> in_values = {2, 1, 2, 1, 2, 3};
    const std::vector<bool> in_valid = {true, false, true, true, true, true};
    const std::vector<CType> dict_values = {2, 1, 3};
    const std::vector<bool> dict_valid = {1, 1, 1};
    const std::vector<int32_t> indices = {0, 0, 0, 1, 0, 2};
    CheckDictEncode<TypeParam, CType>(value_type, in_values, in_valid, dict_values,
                                      dict_valid, indices);
  }

  // Sliced
  auto sliced_input =
      ArrayFromJSON(value_type, "[2, 1, null, 4, 3, 1, 42]")->Slice(1, 5);
  auto sliced_dictionary = ArrayFromJSON(value_type, "[1, 4, 3]");
  auto sliced_indices = ArrayFromJSON(int32(), "[0, null, 1, 2, 0]");
  CheckDictEncode(sliced_input, sliced_dictionary, sliced_indices);
}
// Dictionary-encoding a chunked array without any chunk must yield a chunked
// array without chunks whose type is dictionary(int32, value type).
TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) {
  auto value_type = TypeTraits<TypeParam>::type_singleton();
  auto empty_input = std::make_shared<ChunkedArray>(ArrayVector{}, value_type);

  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(empty_input));
  ASSERT_EQ(encoded.kind(), Datum::CHUNKED_ARRAY);

  const ChunkedArray empty_encoded(ArrayVector{}, dictionary(int32(), value_type));
  AssertChunkedEqual(*encoded.chunked_array(), empty_encoded);
}
// Feeds many distinct values (each repeated kRepeats times) through Unique,
// ValueCounts and DictionaryEncode.
TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
  using T = typename TypeParam::c_type;

  // Number of distinct values: 2^(half the bit width of T), capped at
  // INT16_MAX. Every value in [0, kTotalValues) is then exactly representable
  // in T (16 values for 8-bit types, 256 for 16-bit types, INT16_MAX beyond).
  //
  // The previous expression `1UL << sizeof(T) / 2` parsed as
  // `1UL << (sizeof(T) / 2)` and therefore produced at most 16 values; it also
  // relied on `unsigned long`, which is only 32 bits wide on some platforms.
  constexpr int kHalfBitWidth = static_cast<int>(sizeof(T) * 8 / 2);
  const int64_t kTotalValues =
      std::min<int64_t>(INT16_MAX, int64_t{1} << kHalfBitWidth);
  const int64_t kRepeats = 5;

  std::vector<T> values;
  std::vector<T> uniques;
  std::vector<int32_t> indices;
  std::vector<int64_t> counts;
  values.reserve(static_cast<size_t>(kTotalValues * kRepeats));
  indices.reserve(static_cast<size_t>(kTotalValues * kRepeats));
  uniques.reserve(static_cast<size_t>(kTotalValues));
  counts.reserve(static_cast<size_t>(kTotalValues));

  for (int64_t i = 0; i < kTotalValues * kRepeats; i++) {
    const auto val = static_cast<T>(i % kTotalValues);
    values.push_back(val);

    // The first pass over the value range defines the expected uniques, in
    // first-appearance order, each seen kRepeats times overall.
    if (i < kTotalValues) {
      uniques.push_back(val);
      counts.push_back(kRepeats);
    }
    indices.push_back(static_cast<int32_t>(i % kTotalValues));
  }

  auto type = TypeTraits<TypeParam>::type_singleton();
  CheckUnique<TypeParam, T>(type, values, {}, uniques, {});
  CheckValueCounts<TypeParam, T>(type, values, {}, uniques, {}, counts);
  CheckDictEncode<TypeParam, T>(type, values, {}, uniques, {}, indices);
}
// Unique() on time32, time64 and timestamp arrays, all fed the same data.
TEST_F(TestHashKernel, UniqueTimeTimestamp) {
  // Shared validity: the second input slot is null, which becomes the second
  // unique entry.
  const std::vector<bool> in_valid = {true, false, true, true};
  const std::vector<bool> out_valid = {1, 0, 1};

  // 32-bit storage.
  const std::vector<int32_t> in32 = {2, 1, 2, 1};
  const std::vector<int32_t> out32 = {2, 0, 1};
  CheckUnique<Time32Type, int32_t>(time32(TimeUnit::SECOND), in32, in_valid, out32,
                                   out_valid);

  // 64-bit storage.
  const std::vector<int64_t> in64 = {2, 1, 2, 1};
  const std::vector<int64_t> out64 = {2, 0, 1};
  CheckUnique<Time64Type, int64_t>(time64(TimeUnit::NANO), in64, in_valid, out64,
                                   out_valid);
  CheckUnique<TimestampType, int64_t>(timestamp(TimeUnit::NANO), in64, in_valid, out64,
                                      out_valid);
}
// ValueCounts() on time32, time64 and timestamp arrays, all fed the same data.
TEST_F(TestHashKernel, ValueCountsTimeTimestamp) {
  // Shared validity and expected counts: value 2 twice, one null, value 1 once.
  const std::vector<bool> in_valid = {true, false, true, true};
  const std::vector<bool> out_valid = {1, 0, 1};
  const std::vector<int64_t> out_counts = {2, 1, 1};

  // 32-bit storage.
  const std::vector<int32_t> in32 = {2, 1, 2, 1};
  const std::vector<int32_t> out32 = {2, 0, 1};
  CheckValueCounts<Time32Type, int32_t>(time32(TimeUnit::SECOND), in32, in_valid, out32,
                                        out_valid, out_counts);

  // 64-bit storage.
  const std::vector<int64_t> in64 = {2, 1, 2, 1};
  const std::vector<int64_t> out64 = {2, 0, 1};
  CheckValueCounts<Time64Type, int64_t>(time64(TimeUnit::NANO), in64, in_valid, out64,
                                        out_valid, out_counts);
  CheckValueCounts<TimestampType, int64_t>(timestamp(TimeUnit::NANO), in64, in_valid,
                                           out64, out_valid, out_counts);
}
// Unique() on boolean arrays, with and without nulls, plus a sliced input.
TEST_F(TestHashKernel, UniqueBoolean) {
  // One table row per scenario; an empty validity vector means "no nulls".
  struct Case {
    std::vector<bool> in_values;
    std::vector<bool> in_valid;
    std::vector<bool> out_values;
    std::vector<bool> out_valid;
  };
  const std::vector<Case> cases = {
      // With a null in the second slot
      {{true, true, false, true}, {true, false, true, true}, {true, false, false},
       {true, false, true}},
      {{false, true, false, true}, {true, false, true, true}, {false, false, true},
       {true, false, true}},
      // No nulls
      {{true, true, false, true}, {}, {true, false}, {}},
      {{false, true, false, true}, {}, {false, true}, {}},
  };
  for (const Case& c : cases) {
    CheckUnique<BooleanType, bool>(boolean(), c.in_values, c.in_valid, c.out_values,
                                   c.out_valid);
  }

  // Sliced
  auto sliced_input = ArrayFromJSON(boolean(), "[null, true, true, false]")->Slice(1, 2);
  auto sliced_expected = ArrayFromJSON(boolean(), "[true]");
  CheckUnique(sliced_input, sliced_expected);
}
// ValueCounts() on boolean arrays, with and without nulls, plus a sliced input.
TEST_F(TestHashKernel, ValueCountsBoolean) {
  // One table row per scenario; an empty validity vector means "no nulls".
  struct Case {
    std::vector<bool> in_values;
    std::vector<bool> in_valid;
    std::vector<bool> out_values;
    std::vector<bool> out_valid;
    std::vector<int64_t> out_counts;
  };
  const std::vector<Case> cases = {
      // With a null in the second slot
      {{true, true, false, true}, {true, false, true, true}, {true, false, false},
       {true, false, true}, {2, 1, 1}},
      {{false, true, false, true}, {true, false, true, true}, {false, false, true},
       {true, false, true}, {2, 1, 1}},
      // No nulls
      {{true, true, false, true}, {}, {true, false}, {}, {3, 1}},
      {{false, true, false, true}, {}, {false, true}, {}, {2, 2}},
  };
  for (const Case& c : cases) {
    CheckValueCounts<BooleanType, bool>(boolean(), c.in_values, c.in_valid,
                                        c.out_values, c.out_valid, c.out_counts);
  }

  // Sliced
  auto sliced_input =
      ArrayFromJSON(boolean(), "[true, false, false, null]")->Slice(1, 2);
  auto sliced_values = ArrayFromJSON(boolean(), "[false]");
  auto sliced_counts = ArrayFromJSON(int64(), "[2]");
  CheckValueCounts(sliced_input, sliced_values, sliced_counts);
}
// ValueCounts() on a null-typed array: three nulls collapse into one entry
// with a count of 3.
TEST_F(TestHashKernel, ValueCountsNull) {
  auto all_nulls = ArrayFromJSON(null(), "[null, null, null]");
  auto expected_values = ArrayFromJSON(null(), "[null]");
  auto expected_counts = ArrayFromJSON(int64(), "[3]");

  CheckValueCounts(all_nulls, expected_values, expected_counts);
}
// DictionaryEncode() on boolean arrays, with and without nulls, plus a sliced
// input.
TEST_F(TestHashKernel, DictEncodeBoolean) {
  // One table row per scenario. The expected dictionary never carries a
  // validity vector; the expected indices reuse the input validity.
  struct Case {
    std::vector<bool> in_values;
    std::vector<bool> in_valid;
    std::vector<bool> dict_values;
    std::vector<int32_t> indices;
  };
  const std::vector<Case> cases = {
      // With a null in the second slot
      {{true, true, false, true, false}, {true, false, true, true, true}, {true, false},
       {0, 0, 1, 0, 1}},
      {{false, true, false, true, false}, {true, false, true, true, true},
       {false, true}, {0, 0, 0, 1, 0}},
      // No nulls
      {{true, true, false, true, false}, {}, {true, false}, {0, 0, 1, 0, 1}},
      {{false, true, false, true, false}, {}, {false, true}, {0, 1, 0, 1, 0}},
  };
  const std::vector<bool> no_dict_validity;
  for (const Case& c : cases) {
    CheckDictEncode<BooleanType, bool>(boolean(), c.in_values, c.in_valid,
                                       c.dict_values, no_dict_validity, c.indices);
  }

  // Sliced
  auto sliced_input =
      ArrayFromJSON(boolean(), "[false, true, null, true, false]")->Slice(1, 3);
  auto sliced_dictionary = ArrayFromJSON(boolean(), "[true]");
  auto sliced_indices = ArrayFromJSON(int32(), "[0, null, 0]");
  CheckDictEncode(sliced_input, sliced_dictionary, sliced_indices);
}
// Typed fixture for the binary-like Arrow types. The Check*P helpers forward
// std::string based test data to the generic Check* functions, supplying the
// Arrow type of the current instantiation.
template <typename ArrowType>
class TestHashKernelBinaryTypes : public TestHashKernel {
 protected:
  // Arrow data type singleton for the type under test.
  std::shared_ptr<DataType> type() { return TypeTraits<ArrowType>::type_singleton(); }

  // Forwards to CheckUnique() for the type under test.
  void CheckUniqueP(const std::vector<std::string>& in_values,
                    const std::vector<bool>& in_is_valid,
                    const std::vector<std::string>& out_values,
                    const std::vector<bool>& out_is_valid) {
    CheckUnique<ArrowType, std::string>(this->type(), in_values, in_is_valid,
                                        out_values, out_is_valid);
  }

  // Forwards to CheckValueCounts() for the type under test.
  void CheckValueCountsP(const std::vector<std::string>& in_values,
                         const std::vector<bool>& in_is_valid,
                         const std::vector<std::string>& out_values,
                         const std::vector<bool>& out_is_valid,
                         const std::vector<int64_t>& out_counts) {
    CheckValueCounts<ArrowType, std::string>(this->type(), in_values, in_is_valid,
                                             out_values, out_is_valid, out_counts);
  }

  // Forwards to CheckDictEncode() for the type under test.
  void CheckDictEncodeP(const std::vector<std::string>& in_values,
                        const std::vector<bool>& in_is_valid,
                        const std::vector<std::string>& out_values,
                        const std::vector<bool>& out_is_valid,
                        const std::vector<int32_t>& out_indices) {
    CheckDictEncode<ArrowType, std::string>(this->type(), in_values, in_is_valid,
                                            out_values, out_is_valid, out_indices);
  }
};

TYPED_TEST_SUITE(TestHashKernelBinaryTypes, BinaryTypes);
// Dictionary-encoding a chunked array without any chunk must yield a chunked
// array without chunks whose type is dictionary(int32, value type).
TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) {
  auto value_type = this->type();
  auto empty_input = std::make_shared<ChunkedArray>(ArrayVector{}, value_type);

  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(empty_input));
  ASSERT_EQ(encoded.kind(), Datum::CHUNKED_ARRAY);

  const ChunkedArray empty_encoded(ArrayVector{}, dictionary(int32(), value_type));
  AssertChunkedEqual(*encoded.chunked_array(), empty_encoded);
}
// Dictionary-encoding two single-element chunks: both output chunks are
// expected to carry the same two-entry dictionary.
TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) {
  auto value_type = this->type();
  auto encoded_type = dictionary(int32(), value_type);

  // Input: chunk 0 holds "a", chunk 1 holds "b".
  ArrayVector input_chunks = {
      ArrayFromJSON(value_type, R"(["a"])"),
      ArrayFromJSON(value_type, R"(["b"])"),
  };
  auto input = std::make_shared<ChunkedArray>(input_chunks, value_type);
  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(input));

  // Expected: one dictionary for both chunks, indices 0 and 1 respectively.
  auto shared_dictionary = ArrayFromJSON(value_type, R"(["a", "b"])");
  auto expected_chunk_0 = std::make_shared<DictionaryArray>(
      encoded_type, ArrayFromJSON(int32(), "[0]"), shared_dictionary);
  auto expected_chunk_1 = std::make_shared<DictionaryArray>(
      encoded_type, ArrayFromJSON(int32(), "[1]"), shared_dictionary);

  ASSERT_EQ(encoded.kind(), Datum::CHUNKED_ARRAY);
  AssertChunkedEqual(*encoded.chunked_array(),
                     ChunkedArray({expected_chunk_0, expected_chunk_1}, encoded_type));
}
// Unique() on binary-like arrays, both plain and sliced.
TYPED_TEST(TestHashKernelBinaryTypes, Unique) {
  {
    // The second slot is null; its stored value is an empty placeholder.
    const std::vector<std::string> in_values = {"test", "", "test2", "test"};
    const std::vector<bool> in_valid = {true, false, true, true};
    const std::vector<std::string> out_values = {"test", "", "test2"};
    const std::vector<bool> out_valid = {1, 0, 1};
    this->CheckUniqueP(in_values, in_valid, out_values, out_valid);
  }

  // Sliced
  auto value_type = this->type();
  auto sliced_input =
      ArrayFromJSON(value_type, R"(["ab", null, "cd", "ef", "cd", "gh"])")->Slice(1, 4);
  auto sliced_expected = ArrayFromJSON(value_type, R"([null, "cd", "ef"])");
  CheckUnique(sliced_input, sliced_expected);
}
// ValueCounts() on binary-like arrays, both plain and sliced.
TYPED_TEST(TestHashKernelBinaryTypes, ValueCounts) {
  {
    // "test" appears twice, the null once and "test2" once.
    const std::vector<std::string> in_values = {"test", "", "test2", "test"};
    const std::vector<bool> in_valid = {true, false, true, true};
    const std::vector<std::string> out_values = {"test", "", "test2"};
    const std::vector<bool> out_valid = {1, 0, 1};
    const std::vector<int64_t> out_counts = {2, 1, 1};
    this->CheckValueCountsP(in_values, in_valid, out_values, out_valid, out_counts);
  }

  // Sliced
  auto value_type = this->type();
  auto sliced_input =
      ArrayFromJSON(value_type, R"(["ab", null, "cd", "ab", "cd", "ef"])")->Slice(1, 4);
  auto sliced_values = ArrayFromJSON(value_type, R"([null, "cd", "ab"])");
  auto sliced_counts = ArrayFromJSON(int64(), "[1, 2, 1]");
  CheckValueCounts(sliced_input, sliced_values, sliced_counts);
}
// DictionaryEncode() on binary-like arrays, both plain and sliced.
TYPED_TEST(TestHashKernelBinaryTypes, DictEncode) {
  {
    // The null in slot 1 does not add a dictionary entry; the expected indices
    // reuse the input validity, so that slot is null there too.
    const std::vector<std::string> in_values = {"test", "", "test2", "test", "baz"};
    const std::vector<bool> in_valid = {true, false, true, true, true};
    const std::vector<std::string> dict_values = {"test", "test2", "baz"};
    const std::vector<bool> dict_valid;
    const std::vector<int32_t> indices = {0, 0, 1, 0, 2};
    this->CheckDictEncodeP(in_values, in_valid, dict_values, dict_valid, indices);
  }

  // Sliced
  auto value_type = this->type();
  auto sliced_input =
      ArrayFromJSON(value_type, R"(["ab", null, "cd", "ab", "cd", "ef"])")->Slice(1, 4);
  auto sliced_dictionary = ArrayFromJSON(value_type, R"(["cd", "ab"])");
  auto sliced_indices = ArrayFromJSON(int32(), "[null, 0, 1, 0]");
  CheckDictEncode(sliced_input, sliced_dictionary, sliced_indices);
}
// Runs Unique, ValueCounts and DictionaryEncode over kTotalValues distinct
// strings ("test0", "test1", ...), with the whole sequence repeated kRepeats
// times.
TYPED_TEST(TestHashKernelBinaryTypes, BinaryResizeTable) {
  const int32_t kTotalValues = 10000;
#if !defined(ARROW_VALGRIND)
  const int32_t kRepeats = 10;
#else
  // Mitigate Valgrind's slowness
  const int32_t kRepeats = 3;
#endif

  // Distinct values, in the order they first appear in the input.
  std::vector<std::string> uniques;
  std::vector<int64_t> counts;
  char buf[20] = "test";
  for (int32_t index = 0; index < kTotalValues; ++index) {
    // Write the decimal index right after the "test" prefix.
    ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0);
    uniques.emplace_back(buf);
    counts.push_back(kRepeats);
  }

  // The input is the unique sequence repeated kRepeats times; the expected
  // dictionary index of each element is its position within one repetition.
  std::vector<std::string> values;
  std::vector<int32_t> indices;
  for (int32_t repeat = 0; repeat < kRepeats; ++repeat) {
    values.insert(values.end(), uniques.begin(), uniques.end());
    for (int32_t index = 0; index < kTotalValues; ++index) {
      indices.push_back(index);
    }
  }

  this->CheckUniqueP(values, {}, uniques, {});
  this->CheckValueCountsP(values, {}, uniques, {}, counts);
  this->CheckDictEncodeP(values, {}, uniques, {}, indices);
}
// Unique() on fixed_size_binary(3) arrays, both plain and sliced.
TEST_F(TestHashKernel, UniqueFixedSizeBinary) {
  auto fsb_type = fixed_size_binary(3);

  {
    // The second slot is null; its stored value is an empty placeholder.
    const std::vector<std::string> in_values = {"aaa", "", "bbb", "aaa"};
    const std::vector<bool> in_valid = {true, false, true, true};
    const std::vector<std::string> out_values = {"aaa", "", "bbb"};
    const std::vector<bool> out_valid = {1, 0, 1};
    CheckUnique<FixedSizeBinaryType, std::string>(fsb_type, in_values, in_valid,
                                                  out_values, out_valid);
  }

  // Sliced
  auto sliced_input =
      ArrayFromJSON(fsb_type, R"(["aaa", null, "bbb", "bbb", "ccc", "ddd"])")
          ->Slice(1, 4);
  auto sliced_expected = ArrayFromJSON(fsb_type, R"([null, "bbb", "ccc"])");
  CheckUnique(sliced_input, sliced_expected);
}
// ValueCounts() on a fixed_size_binary(3) array and on a slice of it.
TEST_F(TestHashKernel, ValueCountsFixedSizeBinary) {
  auto fsb_type = fixed_size_binary(3);
  auto full_input =
      ArrayFromJSON(fsb_type, R"(["aaa", null, "bbb", "bbb", "ccc", null])");

  // Whole array: both nulls are counted under a single null entry.
  auto full_values = ArrayFromJSON(fsb_type, R"(["aaa", null, "bbb", "ccc"])");
  auto full_counts = ArrayFromJSON(int64(), "[1, 2, 2, 1]");
  CheckValueCounts(full_input, full_values, full_counts);

  // Sliced
  auto sliced_input = full_input->Slice(1, 4);
  auto sliced_values = ArrayFromJSON(fsb_type, R"([null, "bbb", "ccc"])");
  auto sliced_counts = ArrayFromJSON(int64(), "[1, 2, 1]");
  CheckValueCounts(sliced_input, sliced_values, sliced_counts);
}
// DictionaryEncode() on fixed_size_binary(3) arrays, both plain and sliced.
TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) {
  auto fsb_type = fixed_size_binary(3);

  {
    // The null in slot 1 does not add a dictionary entry; the expected indices
    // reuse the input validity, so that slot is null there too.
    const std::vector<std::string> in_values = {"bbb", "", "bbb", "aaa", "ccc"};
    const std::vector<bool> in_valid = {true, false, true, true, true};
    const std::vector<std::string> dict_values = {"bbb", "aaa", "ccc"};
    const std::vector<bool> dict_valid;
    const std::vector<int32_t> indices = {0, 0, 0, 1, 2};
    CheckDictEncode<FixedSizeBinaryType, std::string>(fsb_type, in_values, in_valid,
                                                      dict_values, dict_valid, indices);
  }

  // Sliced
  auto sliced_input =
      ArrayFromJSON(fsb_type, R"(["aaa", null, "bbb", "bbb", "ccc", "ddd"])")
          ->Slice(1, 4);
  auto sliced_dictionary = ArrayFromJSON(fsb_type, R"(["bbb", "ccc"])");
  auto sliced_indices = ArrayFromJSON(int32(), "[null, 0, 0, 1]");
  CheckDictEncode(sliced_input, sliced_dictionary, sliced_indices);
}
// Runs Unique and DictionaryEncode over kTotalValues distinct 6-byte values,
// with the whole sequence repeated kRepeats times.
TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) {
  const int32_t kTotalValues = 10000;
#if !defined(ARROW_VALGRIND)
  const int32_t kRepeats = 10;
#else
  // Mitigate Valgrind's slowness
  const int32_t kRepeats = 3;
#endif

  // Distinct values: "test" followed by two bytes encoding the index in
  // base 128 (the bytes may be NUL, hence the explicit length of 6).
  std::vector<std::string> uniques;
  char buf[7] = "test..";
  for (int32_t index = 0; index < kTotalValues; ++index) {
    buf[4] = static_cast<char>(index / 128);
    buf[5] = static_cast<char>(index % 128);
    uniques.emplace_back(buf, 6);
  }

  // The input is the unique sequence repeated kRepeats times; the expected
  // dictionary index of each element is its position within one repetition.
  std::vector<std::string> values;
  std::vector<int32_t> indices;
  for (int32_t repeat = 0; repeat < kRepeats; ++repeat) {
    values.insert(values.end(), uniques.begin(), uniques.end());
    for (int32_t index = 0; index < kTotalValues; ++index) {
      indices.push_back(index);
    }
  }

  auto fsb_type = fixed_size_binary(6);
  CheckUnique<FixedSizeBinaryType, std::string>(fsb_type, values, {}, uniques, {});
  CheckDictEncode<FixedSizeBinaryType, std::string>(fsb_type, values, {}, uniques, {},
                                                    indices);
}
// Unique() on a decimal(2, 0) array whose second slot is null.
TEST_F(TestHashKernel, UniqueDecimal) {
  const std::vector<Decimal128> in_values{12, 12, 11, 12};
  const std::vector<bool> in_valid = {true, false, true, true};

  // The null becomes the second unique entry; 0 is its placeholder value.
  const std::vector<Decimal128> out_values{12, 0, 11};
  const std::vector<bool> out_valid = {1, 0, 1};

  CheckUnique<Decimal128Type, Decimal128>(decimal(2, 0), in_values, in_valid,
                                          out_values, out_valid);
}
// Unique() on null-typed arrays: a two-element input yields a single entry,
// an empty input yields an empty result.
TEST_F(TestHashKernel, UniqueNull) {
  {
    const std::vector<std::nullptr_t> in_values = {nullptr, nullptr};
    const std::vector<bool> in_valid = {false, true};
    const std::vector<std::nullptr_t> out_values = {nullptr};
    const std::vector<bool> out_valid = {false};
    CheckUnique<NullType, std::nullptr_t>(null(), in_values, in_valid, out_values,
                                          out_valid);
  }
  {
    const std::vector<std::nullptr_t> no_values;
    const std::vector<bool> no_validity;
    CheckUnique<NullType, std::nullptr_t>(null(), no_values, no_validity, no_values,
                                          no_validity);
  }
}
// ValueCounts() on a decimal(2, 0) array whose second slot is null.
TEST_F(TestHashKernel, ValueCountsDecimal) {
  const std::vector<Decimal128> in_values{12, 12, 11, 12};
  const std::vector<bool> in_valid = {true, false, true, true};

  // 12 is seen twice (slot 1 is null), the null once and 11 once.
  const std::vector<Decimal128> out_values{12, 0, 11};
  const std::vector<bool> out_valid = {1, 0, 1};
  const std::vector<int64_t> out_counts = {2, 1, 1};

  CheckValueCounts<Decimal128Type, Decimal128>(decimal(2, 0), in_values, in_valid,
                                               out_values, out_valid, out_counts);
}
// DictionaryEncode() on a decimal(2, 0) array whose second slot is null.
TEST_F(TestHashKernel, DictEncodeDecimal) {
  const std::vector<Decimal128> in_values{12, 12, 11, 12, 13};
  const std::vector<bool> in_valid = {true, false, true, true, true};

  // The dictionary has no validity vector; the expected indices reuse the
  // input validity, so slot 1 is null there.
  const std::vector<Decimal128> dict_values{12, 11, 13};
  const std::vector<bool> dict_valid;
  const std::vector<int32_t> indices = {0, 0, 1, 0, 2};

  CheckDictEncode<Decimal128Type, Decimal128>(decimal(2, 0), in_values, in_valid,
                                              dict_values, dict_valid, indices);
}
// Unique() and ValueCounts() on dictionary-encoded input, for every signed
// index type. Covers: a single array, a chunked array sharing one dictionary,
// chunks with different dictionaries, and the various ways a null can be
// represented (null dictionary entry, null index, or both at once).
TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) {
  for (auto index_ty : {int8(), int16(), int32(), int64()}) {
    auto indices = ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, 1, 3, 0, 1]");
    auto dict = ArrayFromJSON(int64(), "[10, 20, 30, 40]");
    auto dict_ty = dictionary(index_ty, int64());
    auto ex_indices = ArrayFromJSON(index_ty, "[3, 0, 1]");

    auto input = std::make_shared<DictionaryArray>(dict_ty, indices, dict);
    auto ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices, dict);
    CheckUnique(input, ex_uniques);

    auto ex_counts = ArrayFromJSON(int64(), "[3, 5, 4]");
    CheckValueCounts(input, ex_uniques, ex_counts);

    // Check chunked array
    auto chunked = *ChunkedArray::Make({input->Slice(0, 2), input->Slice(2)});
    CheckUnique(chunked, ex_uniques);
    CheckValueCounts(chunked, ex_uniques, ex_counts);

    // Different chunk dictionaries
    auto input_2 = DictArrayFromJSON(dict_ty, "[1, null, 2, 3]", "[30, 40, 50, 60]");
    auto ex_uniques_2 =
        DictArrayFromJSON(dict_ty, "[3, 0, 1, null, 4, 5]", "[10, 20, 30, 40, 50, 60]");
    auto ex_counts_2 = ArrayFromJSON(int64(), "[4, 5, 4, 1, 1, 1]");
    auto different_dictionaries = *ChunkedArray::Make({input, input_2}, dict_ty);

    CheckUnique(different_dictionaries, ex_uniques_2);
    CheckValueCounts(different_dictionaries, ex_uniques_2, ex_counts_2);

    // Dictionary with encoded nulls
    auto dict_with_null = ArrayFromJSON(int64(), "[10, null, 30, 40]");
    input = std::make_shared<DictionaryArray>(dict_ty, indices, dict_with_null);
    ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices, dict_with_null);
    CheckUnique(input, ex_uniques);
    CheckValueCounts(input, ex_uniques, ex_counts);

    // Dictionary with masked nulls
    auto indices_with_null =
        ArrayFromJSON(index_ty, "[3, 0, 0, 0, null, null, 3, 0, null, 3, 0, null]");
    auto ex_indices_with_null = ArrayFromJSON(index_ty, "[3, 0, null]");
    ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices_with_null, dict);
    input = std::make_shared<DictionaryArray>(dict_ty, indices_with_null, dict);
    CheckUnique(input, ex_uniques);
    CheckValueCounts(input, ex_uniques, ex_counts);

    // Masked nulls over a dictionary that contains a null entry which no index
    // refers to (this is the combination the test has always exercised).
    ex_uniques =
        std::make_shared<DictionaryArray>(dict_ty, ex_indices_with_null, dict_with_null);
    input = std::make_shared<DictionaryArray>(dict_ty, indices_with_null, dict_with_null);
    CheckUnique(input, ex_uniques);
    CheckValueCounts(input, ex_uniques, ex_counts);

    // Dictionary with encoded AND masked nulls: index 1 points at the null
    // dictionary entry while two further slots are null in the indices.
    // As in the "encoded nulls" case above, an index that refers to a null
    // dictionary entry is kept as a regular index; the masked nulls are
    // reported separately as one null index.
    auto some_indices_with_null =
        ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, null, 3, 0, null]");
    auto ex_some_indices_with_null = ArrayFromJSON(index_ty, "[3, 0, 1, null]");
    auto ex_some_counts = ArrayFromJSON(int64(), "[3, 5, 2, 2]");
    ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_some_indices_with_null,
                                                   dict_with_null);
    input =
        std::make_shared<DictionaryArray>(dict_ty, some_indices_with_null, dict_with_null);
    CheckUnique(input, ex_uniques);
    CheckValueCounts(input, ex_uniques, ex_some_counts);
  }
}
/* TODO(ARROW-4124): Determine if we want to do something that is reproducible with
* floats.
TEST_F(TestHashKernel, ValueCountsFloat) {
// No nulls
CheckValueCounts<FloatType, float>(float32(), {1.0f, 0.0f, -0.0f,
std::nan("1"), std::nan("2") },
{}, {0.0f, 1.0f, std::nan("1")}, {}, {});
CheckValueCounts<DoubleType, double>(float64(), {1.0f, 0.0f, -0.0f,
std::nan("1"), std::nan("2") },
{}, {0.0f, 1.0f, std::nan("1")}, {}, {});
}
*/
// Unique, ValueCounts and DictionaryEncode invoked on a two-chunk string
// ChunkedArray.
TEST_F(TestHashKernel, ChunkedArrayInvoke) {
  auto value_type = utf8();

  // Input: two chunks with overlapping values.
  const std::vector<std::string> chunk_0_values = {"foo", "bar", "foo"};
  const std::vector<std::string> chunk_1_values = {"bar", "baz", "quuux", "foo"};
  ArrayVector chunks = {
      _MakeArray<StringType, std::string>(value_type, chunk_0_values, {}),
      _MakeArray<StringType, std::string>(value_type, chunk_1_values, {}),
  };
  auto input = std::make_shared<ChunkedArray>(chunks);

  // Expected distinct values (first-appearance order) and their counts.
  const std::vector<std::string> distinct_values = {"foo", "bar", "baz", "quuux"};
  auto expected_dict =
      _MakeArray<StringType, std::string>(value_type, distinct_values, {});
  auto expected_counts = _MakeArray<Int64Type, int64_t>(int64(), {3, 2, 1, 1}, {});

  // Unique
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> uniques, Unique(input));
  ASSERT_ARRAYS_EQUAL(*expected_dict, *uniques);

  // Unique counts
  ASSERT_OK_AND_ASSIGN(auto value_counts, ValueCounts(input));
  ASSERT_ARRAYS_EQUAL(*expected_dict, *value_counts->field(0));
  ASSERT_ARRAYS_EQUAL(*expected_counts, *value_counts->field(1));

  // Dictionary encode: one output chunk per input chunk, all sharing the
  // expected dictionary.
  auto encoded_type = dictionary(int32(), value_type);
  auto indices_0 = _MakeArray<Int32Type, int32_t>(int32(), {0, 1, 0}, {});
  auto indices_1 = _MakeArray<Int32Type, int32_t>(int32(), {1, 2, 3, 0}, {});
  ArrayVector expected_chunks = {
      std::make_shared<DictionaryArray>(encoded_type, indices_0, expected_dict),
      std::make_shared<DictionaryArray>(encoded_type, indices_1, expected_dict)};
  auto expected_encoded = std::make_shared<ChunkedArray>(expected_chunks);

  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(input));
  ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded.kind());
  AssertChunkedEqual(*expected_encoded, *encoded.chunked_array());
}
// Dictionary-encoding an empty string array must produce a DictionaryArray
// that passes full validation.
TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) {
  // ARROW-7008
  auto empty_strings = ArrayFromJSON(utf8(), "[]");

  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(empty_strings));
  std::shared_ptr<Array> encoded_array = encoded.make_array();

  const auto& as_dictionary = checked_cast<const DictionaryArray&>(*encoded_array);
  ASSERT_OK(as_dictionary.ValidateFull());
}
// DictionaryEncode() null handling: without options nulls stay in the indices;
// with null_encoding_behavior = ENCODE the null becomes a dictionary entry.
TEST_F(TestHashKernel, NullEncodingSchemes) {
  auto input = ArrayFromJSON(uint8(), "[1, 1, null, 2, null]");
  auto encoded_type = dictionary(int32(), uint8());

  {
    // Masking should put null in the indices array
    std::shared_ptr<Array> expected_masked = std::make_shared<DictionaryArray>(
        encoded_type, ArrayFromJSON(int32(), "[0, 0, null, 1, null]"),
        ArrayFromJSON(uint8(), "[1, 2]"));

    ASSERT_OK_AND_ASSIGN(Datum masked, DictionaryEncode(input));
    std::shared_ptr<Array> actual_masked = masked.make_array();
    AssertArraysEqual(*expected_masked, *actual_masked);
  }

  {
    // Encoding should put null in the dictionary
    std::shared_ptr<Array> expected_encoded = std::make_shared<DictionaryArray>(
        encoded_type, ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]"),
        ArrayFromJSON(uint8(), "[1, null, 2]"));

    auto encode_options = DictionaryEncodeOptions::Defaults();
    encode_options.null_encoding_behavior = DictionaryEncodeOptions::ENCODE;

    ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(input, encode_options));
    std::shared_ptr<Array> actual_encoded = encoded.make_array();
    AssertArraysEqual(*expected_encoded, *actual_encoded);
  }
}
// Unique, ValueCounts and DictionaryEncode on a string ChunkedArray that has
// no chunks at all: each must return an empty result of the right type.
TEST_F(TestHashKernel, ChunkedArrayZeroChunk) {
  // ARROW-6857
  auto no_chunks = std::make_shared<ChunkedArray>(ArrayVector{}, utf8());
  const auto value_type = no_chunks->type();

  // Unique: empty array of the value type.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> uniques, Unique(no_chunks));
  auto expected_uniques = ArrayFromJSON(value_type, "[]");
  AssertArraysEqual(*expected_uniques, *uniques);

  // ValueCounts: empty struct array with the values and counts fields.
  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> value_counts, ValueCounts(no_chunks));
  auto counts_type = struct_(
      {field(kValuesFieldName, value_type), field(kCountsFieldName, int64())});
  auto expected_counts = ArrayFromJSON(counts_type, "[]");
  AssertArraysEqual(*expected_counts, *value_counts);

  // DictionaryEncode: chunked array without chunks, of dictionary type.
  ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_chunks));
  auto encoded_type = dictionary(int32(), value_type);
  ASSERT_EQ(encoded.kind(), Datum::CHUNKED_ARRAY);
  auto expected_encoded = std::make_shared<ChunkedArray>(ArrayVector{}, encoded_type);
  AssertChunkedEqual(*expected_encoded, *encoded.chunked_array());
}
} // namespace compute
} // namespace arrow