blob: 5c8bf98e19627a375c3997b150a0f743930efb1b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iosfwd>
#include <locale>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <gtest/gtest.h>
#include "arrow/array/array_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/chunked_array.h"
#include "arrow/compute/api.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_compat.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
namespace arrow {
namespace compute {
// ----------------------------------------------------------------------
// IsIn tests
void CheckIsIn(const std::shared_ptr<DataType>& type, const std::string& input_json,
const std::string& value_set_json, const std::string& expected_json,
bool skip_nulls = false) {
auto input = ArrayFromJSON(type, input_json);
auto value_set = ArrayFromJSON(type, value_set_json);
auto expected = ArrayFromJSON(boolean(), expected_json);
ASSERT_OK_AND_ASSIGN(Datum actual_datum,
IsIn(input, SetLookupOptions(value_set, skip_nulls)));
std::shared_ptr<Array> actual = actual_datum.make_array();
ASSERT_OK(actual->ValidateFull());
AssertArraysEqual(*expected, *actual, /*verbose=*/true);
}
void CheckIsInChunked(const std::shared_ptr<ChunkedArray>& input,
const std::shared_ptr<ChunkedArray>& value_set,
const std::shared_ptr<ChunkedArray>& expected,
bool skip_nulls = false) {
ASSERT_OK_AND_ASSIGN(Datum actual_datum,
IsIn(input, SetLookupOptions(value_set, skip_nulls)));
auto actual = actual_datum.chunked_array();
ASSERT_OK(actual->ValidateFull());
AssertChunkedEqual(*expected, *actual);
}
void CheckIsInDictionary(const std::shared_ptr<DataType>& type,
const std::shared_ptr<DataType>& index_type,
const std::string& input_dictionary_json,
const std::string& input_index_json,
const std::string& value_set_json,
const std::string& expected_json, bool skip_nulls = false) {
auto dict_type = dictionary(index_type, type);
auto indices = ArrayFromJSON(index_type, input_index_json);
auto dict = ArrayFromJSON(type, input_dictionary_json);
ASSERT_OK_AND_ASSIGN(auto input, DictionaryArray::FromArrays(dict_type, indices, dict));
auto value_set = ArrayFromJSON(type, value_set_json);
auto expected = ArrayFromJSON(boolean(), expected_json);
ASSERT_OK_AND_ASSIGN(Datum actual_datum,
IsIn(input, SetLookupOptions(value_set, skip_nulls)));
std::shared_ptr<Array> actual = actual_datum.make_array();
ASSERT_OK(actual->ValidateFull());
AssertArraysEqual(*expected, *actual, /*verbose=*/true);
}
class TestIsInKernel : public ::testing::Test {};
TEST_F(TestIsInKernel, CallBinary) {
auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]");
auto value_set = ArrayFromJSON(int8(), "[2, 3, 5, 7]");
ASSERT_RAISES(Invalid, CallFunction("is_in", {input, value_set}));
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("is_in_meta_binary", {input, value_set}));
auto expected = ArrayFromJSON(boolean(), ("[false, false, true, true, false,"
"true, false, true, false]"));
AssertArraysEqual(*expected, *out.make_array());
}
TEST_F(TestIsInKernel, ImplicitlyCastValueSet) {
auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]");
SetLookupOptions opts{ArrayFromJSON(int32(), "[2, 3, 5, 7]")};
ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("is_in", {input}, &opts));
auto expected = ArrayFromJSON(boolean(), ("[false, false, true, true, false,"
"true, false, true, false]"));
AssertArraysEqual(*expected, *out.make_array());
// fails; value_set cannot be cast to int8
opts = SetLookupOptions{ArrayFromJSON(float32(), "[2.5, 3.1, 5.0]")};
ASSERT_RAISES(Invalid, CallFunction("is_in", {input}, &opts));
}
template <typename Type>
class TestIsInKernelPrimitive : public ::testing::Test {};
template <typename Type>
class TestIsInKernelBinary : public ::testing::Test {};
using PrimitiveTypes = ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type,
Int32Type, UInt32Type, Int64Type, UInt64Type,
FloatType, DoubleType, Date32Type, Date64Type>;
TYPED_TEST_SUITE(TestIsInKernelPrimitive, PrimitiveTypes);
TYPED_TEST(TestIsInKernelPrimitive, IsIn) {
auto type = TypeTraits<TypeParam>::type_singleton();
// No Nulls
CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]");
// Nulls in left array
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]",
/*skip_nulls=*/true);
// Nulls in right array
CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]",
/*skip_nulls=*/true);
// Nulls in both the arrays
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]",
"[false, true, true, false, true]", /*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]",
"[true, true, true, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]",
"[false, true, true, false, true]", /*skip_nulls=*/true);
// Empty Arrays
CheckIsIn(type, "[]", "[]", "[]");
}
TEST_F(TestIsInKernel, NullType) {
auto type = null();
CheckIsIn(type, "[null, null, null]", "[null]", "[true, true, true]");
CheckIsIn(type, "[null, null, null]", "[]", "[false, false, false]");
CheckIsIn(type, "[]", "[]", "[]");
CheckIsIn(type, "[null, null]", "[null]", "[false, false]", /*skip_nulls=*/true);
CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, "[null, null, null]", "[null, null]", "[true, true, true]");
CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", /*skip_nulls=*/true);
}
TEST_F(TestIsInKernel, TimeTimestamp) {
for (const auto& type :
{time32(TimeUnit::SECOND), time64(TimeUnit::NANO), timestamp(TimeUnit::MICRO)}) {
CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]",
"[true, true, false, true, true]", /*skip_nulls=*/false);
CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]",
"[true, false, false, true, true]", /*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]",
"[true, true, false, true, true]", /*skip_nulls=*/false);
CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]",
"[true, false, false, true, true]", /*skip_nulls=*/true);
}
}
TEST_F(TestIsInKernel, Boolean) {
auto type = boolean();
CheckIsIn(type, "[true, false, null, true, false]", "[false]",
"[false, true, false, false, true]", /*skip_nulls=*/false);
CheckIsIn(type, "[true, false, null, true, false]", "[false]",
"[false, true, false, false, true]", /*skip_nulls=*/true);
CheckIsIn(type, "[true, false, null, true, false]", "[false, null]",
"[false, true, true, false, true]", /*skip_nulls=*/false);
CheckIsIn(type, "[true, false, null, true, false]", "[false, null]",
"[false, true, false, false, true]", /*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]",
"[false, true, true, false, true]", /*skip_nulls=*/false);
CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]",
"[false, true, false, false, true]", /*skip_nulls=*/true);
}
TYPED_TEST_SUITE(TestIsInKernelBinary, BinaryTypes);
TYPED_TEST(TestIsInKernelBinary, Binary) {
auto type = TypeTraits<TypeParam>::type_singleton();
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])",
"[true, true, false, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])",
"[true, true, false, false, true]",
/*skip_nulls=*/true);
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])",
"[true, true, false, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])",
"[true, true, false, false, true]",
/*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])",
R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "", "cc", null, ""])",
R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]",
/*skip_nulls=*/true);
}
TEST_F(TestIsInKernel, FixedSizeBinary) {
auto type = fixed_size_binary(3);
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])",
"[true, true, false, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])",
"[true, true, false, false, true]",
/*skip_nulls=*/true);
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])",
"[true, true, false, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])",
"[true, true, false, false, true]",
/*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])",
R"(["aaa", null, "aaa", "bbb", "bbb", null])",
"[true, true, false, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])",
R"(["aaa", null, "aaa", "bbb", "bbb", null])",
"[true, true, false, false, true]",
/*skip_nulls=*/true);
}
TEST_F(TestIsInKernel, Decimal) {
auto type = decimal(3, 1);
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])",
"[true, false, true, false, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])",
"[true, false, true, false, true]",
/*skip_nulls=*/true);
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])",
R"(["12.3", "78.9", null])", "[true, false, true, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])",
R"(["12.3", "78.9", null])", "[true, false, true, false, true]",
/*skip_nulls=*/true);
// Duplicates in right array
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])",
R"([null, "12.3", "12.3", "78.9", "78.9", null])",
"[true, false, true, true, true]",
/*skip_nulls=*/false);
CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])",
R"([null, "12.3", "12.3", "78.9", "78.9", null])",
"[true, false, true, false, true]",
/*skip_nulls=*/true);
}
TEST_F(TestIsInKernel, DictionaryArray) {
for (auto index_ty : all_dictionary_index_types()) {
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/R"(["A", "B", "C"])",
/*expected_json=*/"[true, true, false, true]",
/*skip_nulls=*/false);
CheckIsInDictionary(/*type=*/float32(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/"[4.1, 42, -1.0]",
/*expected_json=*/"[true, true, false, true]",
/*skip_nulls=*/false);
// With nulls and skip_nulls=false
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[true, false, true, true, true]",
/*skip_nulls=*/false);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[true, false, true, true, true]",
/*skip_nulls=*/false);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A"])",
/*expected_json=*/"[false, false, false, true, false]",
/*skip_nulls=*/false);
// With nulls and skip_nulls=true
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[true, false, false, true, true]",
/*skip_nulls=*/true);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[false, false, false, true, false]",
/*skip_nulls=*/true);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A"])",
/*expected_json=*/"[false, false, false, true, false]",
/*skip_nulls=*/true);
// With duplicates in value_set
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/R"(["A", "A", "B", "A", "B", "C"])",
/*expected_json=*/"[true, true, false, true]",
/*skip_nulls=*/false);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])",
/*expected_json=*/"[true, false, true, true, true]",
/*skip_nulls=*/false);
CheckIsInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])",
/*expected_json=*/"[true, false, false, true, true]",
/*skip_nulls=*/true);
}
}
TEST_F(TestIsInKernel, ChunkedArrayInvoke) {
auto input = ChunkedArrayFromJSON(
utf8(), {R"(["abc", "def", "", "abc", "jkl"])", R"(["def", null, "abc", "zzz"])"});
// No null in value_set
auto value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"(["abc"])"});
auto expected = ChunkedArrayFromJSON(
boolean(), {"[true, true, true, true, false]", "[true, false, true, false]"});
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false);
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true);
value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"([null])"});
expected = ChunkedArrayFromJSON(
boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"});
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false);
expected = ChunkedArrayFromJSON(
boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"});
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true);
// Duplicates in value_set
value_set =
ChunkedArrayFromJSON(utf8(), {R"(["", null, "", "def"])", R"(["def", null])"});
expected = ChunkedArrayFromJSON(
boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"});
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false);
expected = ChunkedArrayFromJSON(
boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"});
CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true);
}
// ----------------------------------------------------------------------
// IndexIn tests
class TestIndexInKernel : public ::testing::Test {
public:
void CheckIndexIn(const std::shared_ptr<DataType>& type, const std::string& input_json,
const std::string& value_set_json, const std::string& expected_json,
bool skip_nulls = false) {
std::shared_ptr<Array> input = ArrayFromJSON(type, input_json);
std::shared_ptr<Array> value_set = ArrayFromJSON(type, value_set_json);
std::shared_ptr<Array> expected = ArrayFromJSON(int32(), expected_json);
SetLookupOptions options(value_set, skip_nulls);
ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options));
std::shared_ptr<Array> actual = actual_datum.make_array();
ASSERT_OK(actual->ValidateFull());
AssertArraysEqual(*expected, *actual, /*verbose=*/true);
}
void CheckIndexInChunked(const std::shared_ptr<ChunkedArray>& input,
const std::shared_ptr<ChunkedArray>& value_set,
const std::shared_ptr<ChunkedArray>& expected,
bool skip_nulls) {
ASSERT_OK_AND_ASSIGN(Datum actual,
IndexIn(input, SetLookupOptions(value_set, skip_nulls)));
ASSERT_EQ(Datum::CHUNKED_ARRAY, actual.kind());
ASSERT_OK(actual.chunked_array()->ValidateFull());
AssertChunkedEqual(*expected, *actual.chunked_array());
}
void CheckIndexInDictionary(const std::shared_ptr<DataType>& type,
const std::shared_ptr<DataType>& index_type,
const std::string& input_dictionary_json,
const std::string& input_index_json,
const std::string& value_set_json,
const std::string& expected_json, bool skip_nulls = false) {
auto dict_type = dictionary(index_type, type);
auto indices = ArrayFromJSON(index_type, input_index_json);
auto dict = ArrayFromJSON(type, input_dictionary_json);
ASSERT_OK_AND_ASSIGN(auto input,
DictionaryArray::FromArrays(dict_type, indices, dict));
auto value_set = ArrayFromJSON(type, value_set_json);
auto expected = ArrayFromJSON(int32(), expected_json);
SetLookupOptions options(value_set, skip_nulls);
ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options));
std::shared_ptr<Array> actual = actual_datum.make_array();
ASSERT_OK(actual->ValidateFull());
AssertArraysEqual(*expected, *actual, /*verbose=*/true);
}
};
TEST_F(TestIndexInKernel, CallBinary) {
auto input = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]");
auto value_set = ArrayFromJSON(int8(), "[2, 3, 5, 7]");
ASSERT_RAISES(Invalid, CallFunction("index_in", {input, value_set}));
ASSERT_OK_AND_ASSIGN(Datum out,
CallFunction("index_in_meta_binary", {input, value_set}));
auto expected = ArrayFromJSON(int32(), ("[null, null, 0, 1, null, 2, null, 3, null,"
" null, null]"));
AssertArraysEqual(*expected, *out.make_array());
}
template <typename Type>
class TestIndexInKernelPrimitive : public TestIndexInKernel {};
using PrimitiveDictionaries =
::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type, UInt32Type,
Int64Type, UInt64Type, FloatType, DoubleType, Date32Type,
Date64Type>;
TYPED_TEST_SUITE(TestIndexInKernelPrimitive, PrimitiveDictionaries);
TYPED_TEST(TestIndexInKernelPrimitive, IndexIn) {
auto type = TypeTraits<TypeParam>::type_singleton();
// No Nulls
this->CheckIndexIn(type,
/* input= */ "[2, 1, 2, 1, 2, 3]",
/* value_set= */ "[2, 1, 3]",
/* expected= */ "[0, 1, 0, 1, 0, 2]");
// Haystack array all null
this->CheckIndexIn(type,
/* input= */ "[null, null, null, null, null, null]",
/* value_set= */ "[2, 1, 3]",
/* expected= */ "[null, null, null, null, null, null]");
// Needles array all null
this->CheckIndexIn(type,
/* input= */ "[2, 1, 2, 1, 2, 3]",
/* value_set= */ "[null]",
/* expected= */ "[null, null, null, null, null, null]");
// Both arrays all null
this->CheckIndexIn(type,
/* input= */ "[null, null, null, null]",
/* value_set= */ "[null]",
/* expected= */ "[0, 0, 0, 0]");
// Duplicates in value_set
this->CheckIndexIn(type,
/* input= */ "[2, 1, 2, 1, 2, 3]",
/* value_set= */ "[2, 2, 1, 1, 1, 3, 3]",
/* expected= */ "[0, 2, 0, 2, 0, 5]");
// Duplicates and nulls in value_set
this->CheckIndexIn(type,
/* input= */ "[2, 1, 2, 1, 2, 3]",
/* value_set= */ "[2, 2, null, null, 1, 1, 1, 3, 3]",
/* expected= */ "[0, 4, 0, 4, 0, 7]");
// No Match
this->CheckIndexIn(type,
/* input= */ "[2, null, 7, 3, 8]",
/* value_set= */ "[2, null, 6, 3]",
/* expected= */ "[0, 1, null, 3, null]");
// Empty Arrays
this->CheckIndexIn(type, "[]", "[]", "[]");
}
TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) {
auto type = TypeTraits<TypeParam>::type_singleton();
// No nulls in value_set
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 3]",
/*expected=*/"[null, 0, null, 1, null]",
/*skip_nulls=*/false);
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 3]",
/*expected=*/"[null, 0, null, 1, null]",
/*skip_nulls=*/true);
// Same with duplicates in value_set
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 1, 3, 3]",
/*expected=*/"[null, 0, null, 2, null]",
/*skip_nulls=*/false);
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 1, 3, 3]",
/*expected=*/"[null, 0, null, 2, null]",
/*skip_nulls=*/true);
// Nulls in value_set
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, null, 3]",
/*expected=*/"[null, 0, null, 2, 1]",
/*skip_nulls=*/false);
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 1, null, null, 3, 3]",
/*expected=*/"[null, 0, null, 4, null]",
/*skip_nulls=*/true);
// Same with duplicates in value_set
this->CheckIndexIn(type,
/*input=*/"[0, 1, 2, 3, null]",
/*value_set=*/"[1, 1, null, null, 3, 3]",
/*expected=*/"[null, 0, null, 4, 2]",
/*skip_nulls=*/false);
}
TEST_F(TestIndexInKernel, NullType) {
CheckIndexIn(null(), "[null, null, null]", "[null]", "[0, 0, 0]");
CheckIndexIn(null(), "[null, null, null]", "[]", "[null, null, null]");
CheckIndexIn(null(), "[]", "[null, null]", "[]");
CheckIndexIn(null(), "[]", "[]", "[]");
CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", /*skip_nulls=*/true);
CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", /*skip_nulls=*/true);
}
TEST_F(TestIndexInKernel, TimeTimestamp) {
CheckIndexIn(time32(TimeUnit::SECOND),
/* input= */ "[1, null, 5, 1, 2]",
/* value_set= */ "[2, 1, null]",
/* expected= */ "[1, 2, null, 1, 0]");
// Duplicates in value_set
CheckIndexIn(time32(TimeUnit::SECOND),
/* input= */ "[1, null, 5, 1, 2]",
/* value_set= */ "[2, 2, 1, 1, null, null]",
/* expected= */ "[2, 4, null, 2, 0]");
// Needles array has no nulls
CheckIndexIn(time32(TimeUnit::SECOND),
/* input= */ "[2, null, 5, 1]",
/* value_set= */ "[2, 1]",
/* expected= */ "[0, null, null, 1]");
// No match
CheckIndexIn(time32(TimeUnit::SECOND), "[3, null, 5, 3]", "[2, 1]",
"[null, null, null, null]");
// Empty arrays
CheckIndexIn(time32(TimeUnit::SECOND), "[]", "[]", "[]");
CheckIndexIn(time64(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 1]", "[0, 1, 0, 2]");
CheckIndexIn(timestamp(TimeUnit::NANO), "[2, null, 2, 1]", "[2, null, 1]",
"[0, 1, 0, 2]");
// Empty input array
CheckIndexIn(timestamp(TimeUnit::NANO), "[]", "[2, null, 1]", "[]");
// Empty value_set array
CheckIndexIn(timestamp(TimeUnit::NANO), "[2, null, 1]", "[]", "[null, null, null]");
// Both array are all null
CheckIndexIn(time32(TimeUnit::SECOND), "[null, null, null, null]", "[null]",
"[0, 0, 0, 0]");
}
TEST_F(TestIndexInKernel, Boolean) {
CheckIndexIn(boolean(),
/* input= */ "[false, null, false, true]",
/* value_set= */ "[null, false, true]",
/* expected= */ "[1, 0, 1, 2]");
CheckIndexIn(boolean(), "[false, null, false, true]", "[false, true, null]",
"[0, 2, 0, 1]");
// Duplicates in value_set
CheckIndexIn(boolean(), "[false, null, false, true]",
"[false, false, true, true, null, null]", "[0, 4, 0, 2]");
// No Nulls
CheckIndexIn(boolean(), "[true, true, false, true]", "[false, true]", "[1, 1, 0, 1]");
CheckIndexIn(boolean(), "[false, true, false, true]", "[true]", "[null, 0, null, 0]");
// No match
CheckIndexIn(boolean(), "[true, true, true, true]", "[false]",
"[null, null, null, null]");
// Nulls in input array
CheckIndexIn(boolean(), "[null, null, null, null]", "[true]",
"[null, null, null, null]");
// Nulls in value_set array
CheckIndexIn(boolean(), "[true, true, false, true]", "[null]",
"[null, null, null, null]");
// Both array have Nulls
CheckIndexIn(boolean(), "[null, null, null, null]", "[null]", "[0, 0, 0, 0]");
}
template <typename Type>
class TestIndexInKernelBinary : public TestIndexInKernel {};
TYPED_TEST_SUITE(TestIndexInKernelBinary, BinaryTypes);
TYPED_TEST(TestIndexInKernelBinary, Binary) {
auto type = TypeTraits<TypeParam>::type_singleton();
this->CheckIndexIn(type, R"(["foo", null, "bar", "foo"])", R"(["foo", null, "bar"])",
R"([0, 1, 2, 0])");
// Duplicates in value_set
this->CheckIndexIn(type, R"(["foo", null, "bar", "foo"])",
R"(["foo", "foo", null, null, "bar", "bar"])", R"([0, 2, 4, 0])");
// No match
this->CheckIndexIn(type,
/* input= */ R"(["foo", null, "bar", "foo"])",
/* value_set= */ R"(["baz", "bazzz"])",
/* expected= */ R"([null, null, null, null])");
// Nulls in input array
this->CheckIndexIn(type,
/* input= */ R"([null, null, null, null])",
/* value_set= */ R"(["foo", "bar"])",
/* expected= */ R"([null, null, null, null])");
// Nulls in value_set array
this->CheckIndexIn(type, R"(["foo", "bar", "foo"])", R"([null])",
R"([null, null, null])");
// Both array have Nulls
this->CheckIndexIn(type,
/* input= */ R"([null, null, null, null])",
/* value_set= */ R"([null])",
/* expected= */ R"([0, 0, 0, 0])");
// Empty arrays
this->CheckIndexIn(type, R"([])", R"([])", R"([])");
// Empty input array
this->CheckIndexIn(type, R"([])", R"(["foo", null, "bar"])", "[]");
// Empty value_set array
this->CheckIndexIn(type, R"(["foo", null, "bar", "foo"])", "[]",
R"([null, null, null, null])");
}
TEST_F(TestIndexInKernel, BinaryResizeTable) {
const int32_t kTotalValues = 10000;
#if !defined(ARROW_VALGRIND)
const int32_t kRepeats = 10;
#else
// Mitigate Valgrind's slowness
const int32_t kRepeats = 3;
#endif
const int32_t kBufSize = 20;
Int32Builder expected_builder;
StringBuilder input_builder;
ASSERT_OK(expected_builder.Resize(kTotalValues * kRepeats));
ASSERT_OK(input_builder.Resize(kTotalValues * kRepeats));
ASSERT_OK(input_builder.ReserveData(kBufSize * kTotalValues * kRepeats));
for (int32_t i = 0; i < kTotalValues * kRepeats; i++) {
int32_t index = i % kTotalValues;
char buf[kBufSize] = "test";
ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0);
input_builder.UnsafeAppend(util::string_view(buf));
expected_builder.UnsafeAppend(index);
}
std::shared_ptr<Array> input, value_set, expected;
ASSERT_OK(input_builder.Finish(&input));
value_set = input->Slice(0, kTotalValues);
ASSERT_OK(expected_builder.Finish(&expected));
ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, value_set));
std::shared_ptr<Array> actual = actual_datum.make_array();
ASSERT_ARRAYS_EQUAL(*expected, *actual);
}
TEST_F(TestIndexInKernel, FixedSizeBinary) {
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", null, "bbb", "ccc"])",
/*expected=*/R"([2, 1, null, 0, 3, 0])");
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", null, "bbb", "ccc"])",
/*expected=*/R"([2, null, null, 0, 3, 0])",
/*skip_nulls=*/true);
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", "bbb", "ccc"])",
/*expected=*/R"([1, null, null, 0, 2, 0])");
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", "bbb", "ccc"])",
/*expected=*/R"([1, null, null, 0, 2, 0])",
/*skip_nulls=*/true);
// Duplicates in value_set
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])",
/*expected=*/R"([4, 2, null, 0, 6, 0])");
CheckIndexIn(fixed_size_binary(3),
/*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])",
/*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])",
/*expected=*/R"([4, null, null, 0, 6, 0])",
/*skip_nulls=*/true);
// Empty input array
CheckIndexIn(fixed_size_binary(5), R"([])", R"(["bbbbb", null, "aaaaa", "ccccc"])",
R"([])");
// Empty value_set array
CheckIndexIn(fixed_size_binary(5), R"(["bbbbb", null, "bbbbb"])", R"([])",
R"([null, null, null])");
// Empty arrays
CheckIndexIn(fixed_size_binary(0), R"([])", R"([])", R"([])");
}
TEST_F(TestIndexInKernel, Decimal) {
auto type = decimal(2, 0);
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"([null, "11", "12"])",
/*expected=*/R"([2, 0, 1, 2, null])",
/*skip_nulls=*/false);
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"([null, "11", "12"])",
/*expected=*/R"([2, null, 1, 2, null])",
/*skip_nulls=*/true);
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"(["11", "12"])",
/*expected=*/R"([1, null, 0, 1, null])",
/*skip_nulls=*/false);
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"(["11", "12"])",
/*expected=*/R"([1, null, 0, 1, null])",
/*skip_nulls=*/true);
// Duplicates in value_set
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"([null, null, "11", "11", "12", "12"])",
/*expected=*/R"([4, 0, 2, 4, null])",
/*skip_nulls=*/false);
CheckIndexIn(type,
/*input=*/R"(["12", null, "11", "12", "13"])",
/*value_set=*/R"([null, null, "11", "11", "12", "12"])",
/*expected=*/R"([4, null, 2, 4, null])",
/*skip_nulls=*/true);
}
TEST_F(TestIndexInKernel, DictionaryArray) {
for (auto index_ty : all_dictionary_index_types()) {
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/R"(["A", "B", "C"])",
/*expected_json=*/"[1, 2, null, 0]",
/*skip_nulls=*/false);
CheckIndexInDictionary(/*type=*/float32(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/"[4.1, 42, -1.0]",
/*expected_json=*/"[2, 1, null, 0]",
/*skip_nulls=*/false);
// With nulls and skip_nulls=false
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[1, null, 3, 2, 1]",
/*skip_nulls=*/false);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[3, null, 3, 2, 3]",
/*skip_nulls=*/false);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A"])",
/*expected_json=*/"[null, null, null, 2, null]",
/*skip_nulls=*/false);
// With nulls and skip_nulls=true
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[1, null, null, 2, 1]",
/*skip_nulls=*/true);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A", null])",
/*expected_json=*/"[null, null, null, 2, null]",
/*skip_nulls=*/true);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "B", "A"])",
/*expected_json=*/"[null, null, null, 2, null]",
/*skip_nulls=*/true);
// With duplicates in value_set
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", "B", "C", "D"])",
/*input_index_json=*/"[1, 2, null, 0]",
/*value_set_json=*/R"(["A", "A", "B", "B", "C", "C"])",
/*expected_json=*/"[2, 4, null, 0]",
/*skip_nulls=*/false);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])",
/*expected_json=*/"[6, null, 6, 4, 6]",
/*skip_nulls=*/false);
CheckIndexInDictionary(/*type=*/utf8(),
/*index_type=*/index_ty,
/*input_dictionary_json=*/R"(["A", null, "C", "D"])",
/*input_index_json=*/"[1, 3, null, 0, 1]",
/*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])",
/*expected_json=*/"[null, null, null, 4, null]",
/*skip_nulls=*/true);
}
}
TEST_F(TestIndexInKernel, ChunkedArrayInvoke) {
auto input = ChunkedArrayFromJSON(utf8(), {R"(["abc", "def", "ghi", "abc", "jkl"])",
R"(["def", null, "abc", "zzz"])"});
// No null in value_set
auto value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"(["abc"])"});
auto expected =
ChunkedArrayFromJSON(int32(), {"[2, 1, 0, 2, null]", "[1, null, 2, null]"});
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false);
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true);
// Null in value_set
value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"([null, "abc"])"});
expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, 2, 3, null]"});
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false);
expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, null, 3, null]"});
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true);
// Duplicates in value_set
value_set = ChunkedArrayFromJSON(
utf8(), {R"(["ghi", "ghi", "def"])", R"(["def", null, null, "abc"])"});
expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, 4, 6, null]"});
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false);
expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, null, 6, null]"});
CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true);
}
TEST(TestSetLookup, DispatchBest) {
for (std::string name : {"is_in", "index_in"}) {
CheckDispatchBest(name, {int32()}, {int32()});
CheckDispatchBest(name, {dictionary(int32(), utf8())}, {utf8()});
}
}
TEST(TestSetLookup, IsInWithImplicitCasts) {
SetLookupOptions opts{ArrayFromJSON(utf8(), R"(["b", null])")};
CheckScalarUnary("is_in",
ArrayFromJSON(dictionary(int32(), utf8()), R"(["a", "b", "c", null])"),
ArrayFromJSON(boolean(), "[0, 1, 0, 1]"), &opts);
}
} // namespace compute
} // namespace arrow