blob: 9f4fff8c25b13615befac10d36fee15b438f28ad [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <sstream>
#include <gtest/gtest.h>
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "gandiva/projector.h"
#include "gandiva/tests/test_util.h"
#include "gandiva/tree_expr_builder.h"
namespace gandiva {
using arrow::boolean;
using arrow::float32;
using arrow::float64;
using arrow::int32;
using arrow::int64;
using arrow::utf8;
class TestHash : public ::testing::Test {
public:
void SetUp() { pool_ = arrow::default_memory_pool(); }
protected:
arrow::MemoryPool* pool_;
};
TEST_F(TestHash, TestSimple) {
// schema for input fields
auto field_a = field("a", int32());
auto schema = arrow::schema({field_a});
// output fields
auto res_0 = field("res0", int32());
auto res_1 = field("res1", int64());
// build expression.
// hash32(a, 10)
// hash64(a)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto literal_10 = TreeExprBuilder::MakeLiteral((int32_t)10);
auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a, literal_10}, int32());
auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a}, int64());
auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
// Build a projector for the expression.
std::shared_ptr<Projector> projector;
auto status =
Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector);
EXPECT_TRUE(status.ok()) << status.message();
// Create a row-batch with some sample data
int num_records = 4;
auto array_a = MakeArrowArrayInt32({1, 2, 3, 4}, {false, true, true, true});
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
EXPECT_TRUE(status.ok());
// Validate results
auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
EXPECT_EQ(int32_arr->null_count(), 0);
EXPECT_EQ(int32_arr->Value(0), 10);
for (int i = 1; i < num_records; ++i) {
EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
}
auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
EXPECT_EQ(int64_arr->null_count(), 0);
EXPECT_EQ(int64_arr->Value(0), 0);
for (int i = 1; i < num_records; ++i) {
EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
}
}
TEST_F(TestHash, TestBuf) {
// schema for input fields
auto field_a = field("a", utf8());
auto schema = arrow::schema({field_a});
// output fields
auto res_0 = field("res0", int32());
auto res_1 = field("res1", int64());
// build expressions.
// hash32(a)
// hash64(a, 10)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto literal_10 = TreeExprBuilder::MakeLiteral(static_cast<int64_t>(10));
auto hash32 = TreeExprBuilder::MakeFunction("hash32", {node_a}, int32());
auto hash64 = TreeExprBuilder::MakeFunction("hash64", {node_a, literal_10}, int64());
auto expr_0 = TreeExprBuilder::MakeExpression(hash32, res_0);
auto expr_1 = TreeExprBuilder::MakeExpression(hash64, res_1);
// Build a projector for the expressions.
std::shared_ptr<Projector> projector;
auto status =
Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector);
ASSERT_OK(status) << status.message();
// Create a row-batch with some sample data
int num_records = 4;
auto array_a =
MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {false, true, true, true});
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
ASSERT_OK(status);
// Validate results
auto int32_arr = std::dynamic_pointer_cast<arrow::Int32Array>(outputs.at(0));
EXPECT_EQ(int32_arr->null_count(), 0);
EXPECT_EQ(int32_arr->Value(0), 0);
for (int i = 1; i < num_records; ++i) {
EXPECT_NE(int32_arr->Value(i), int32_arr->Value(i - 1));
}
auto int64_arr = std::dynamic_pointer_cast<arrow::Int64Array>(outputs.at(1));
EXPECT_EQ(int64_arr->null_count(), 0);
EXPECT_EQ(int64_arr->Value(0), 10);
for (int i = 1; i < num_records; ++i) {
EXPECT_NE(int64_arr->Value(i), int64_arr->Value(i - 1));
}
}
TEST_F(TestHash, TestSha256Simple) {
// schema for input fields
auto field_a = field("a", int32());
auto field_b = field("b", int64());
auto field_c = field("c", float32());
auto field_d = field("d", float64());
auto schema = arrow::schema({field_a, field_b, field_c, field_d});
// output fields
auto res_0 = field("res0", utf8());
auto res_1 = field("res1", utf8());
auto res_2 = field("res2", utf8());
auto res_3 = field("res3", utf8());
// build expressions.
// hashSHA256(a)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto hashSha256_1 = TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8());
auto expr_0 = TreeExprBuilder::MakeExpression(hashSha256_1, res_0);
auto node_b = TreeExprBuilder::MakeField(field_b);
auto hashSha256_2 = TreeExprBuilder::MakeFunction("hashSHA256", {node_b}, utf8());
auto expr_1 = TreeExprBuilder::MakeExpression(hashSha256_2, res_1);
auto node_c = TreeExprBuilder::MakeField(field_c);
auto hashSha256_3 = TreeExprBuilder::MakeFunction("hashSHA256", {node_c}, utf8());
auto expr_2 = TreeExprBuilder::MakeExpression(hashSha256_3, res_2);
auto node_d = TreeExprBuilder::MakeField(field_d);
auto hashSha256_4 = TreeExprBuilder::MakeFunction("hashSHA256", {node_d}, utf8());
auto expr_3 = TreeExprBuilder::MakeExpression(hashSha256_4, res_3);
// Build a projector for the expressions.
std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {expr_0, expr_1, expr_2, expr_3},
TestConfiguration(), &projector);
ASSERT_OK(status) << status.message();
// Create a row-batch with some sample data
int num_records = 2;
auto validity_array = {false, true};
auto array_int32 = MakeArrowArrayInt32({1, 0}, validity_array);
auto array_int64 = MakeArrowArrayInt64({1, 0}, validity_array);
auto array_float32 = MakeArrowArrayFloat32({1.0, 0.0}, validity_array);
auto array_float64 = MakeArrowArrayFloat64({1.0, 0.0}, validity_array);
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(
schema, num_records, {array_int32, array_int64, array_float32, array_float64});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
ASSERT_OK(status);
auto response_int32 = outputs.at(0);
auto response_int64 = outputs.at(1);
auto response_float32 = outputs.at(2);
auto response_float64 = outputs.at(3);
// Checks if the null and zero representation for numeric values
// are consistent between the types
EXPECT_ARROW_ARRAY_EQUALS(response_int32, response_int64);
EXPECT_ARROW_ARRAY_EQUALS(response_int64, response_float32);
EXPECT_ARROW_ARRAY_EQUALS(response_float32, response_float64);
const int sha256_hash_size = 64;
// Checks if the hash size in response is correct
for (int i = 1; i < num_records; ++i) {
const auto& value_at_position = response_int32->GetScalar(i).ValueOrDie()->ToString();
EXPECT_EQ(value_at_position.size(), sha256_hash_size);
EXPECT_NE(value_at_position,
response_int32->GetScalar(i - 1).ValueOrDie()->ToString());
}
}
TEST_F(TestHash, TestSha256Varlen) {
// schema for input fields
auto field_a = field("a", utf8());
auto schema = arrow::schema({field_a});
// output fields
auto res_0 = field("res0", utf8());
// build expressions.
// hashSHA256(a)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto hashSha256 = TreeExprBuilder::MakeFunction("hashSHA256", {node_a}, utf8());
auto expr_0 = TreeExprBuilder::MakeExpression(hashSha256, res_0);
// Build a projector for the expressions.
std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {expr_0}, TestConfiguration(), &projector);
EXPECT_TRUE(status.ok()) << status.message();
// Create a row-batch with some sample data
int num_records = 3;
std::string first_string =
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY "
"[ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]";
std::string second_string =
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY "
"[ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
auto array_a =
MakeArrowArrayUtf8({"foo", first_string, second_string}, {false, true, true});
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
ASSERT_OK(status);
auto response = outputs.at(0);
const int sha256_hash_size = 64;
EXPECT_EQ(response->null_count(), 0);
// Checks that the null value was hashed
EXPECT_NE(response->GetScalar(0).ValueOrDie()->ToString(), "");
EXPECT_EQ(response->GetScalar(0).ValueOrDie()->ToString().size(), sha256_hash_size);
// Check that all generated hashes were different
for (int i = 1; i < num_records; ++i) {
const auto& value_at_position = response->GetScalar(i).ValueOrDie()->ToString();
EXPECT_EQ(value_at_position.size(), sha256_hash_size);
EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString());
}
}
TEST_F(TestHash, TestSha1Simple) {
// schema for input fields
auto field_a = field("a", int32());
auto field_b = field("b", int64());
auto field_c = field("c", float32());
auto field_d = field("d", float64());
auto schema = arrow::schema({field_a, field_b, field_c, field_d});
// output fields
auto res_0 = field("res0", utf8());
auto res_1 = field("res1", utf8());
auto res_2 = field("res2", utf8());
auto res_3 = field("res3", utf8());
// build expressions.
// hashSHA1(a)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto hashSha1_1 = TreeExprBuilder::MakeFunction("hashSHA1", {node_a}, utf8());
auto expr_0 = TreeExprBuilder::MakeExpression(hashSha1_1, res_0);
auto node_b = TreeExprBuilder::MakeField(field_b);
auto hashSha1_2 = TreeExprBuilder::MakeFunction("hashSHA1", {node_b}, utf8());
auto expr_1 = TreeExprBuilder::MakeExpression(hashSha1_2, res_1);
auto node_c = TreeExprBuilder::MakeField(field_c);
auto hashSha1_3 = TreeExprBuilder::MakeFunction("hashSHA1", {node_c}, utf8());
auto expr_2 = TreeExprBuilder::MakeExpression(hashSha1_3, res_2);
auto node_d = TreeExprBuilder::MakeField(field_d);
auto hashSha1_4 = TreeExprBuilder::MakeFunction("hashSHA1", {node_d}, utf8());
auto expr_3 = TreeExprBuilder::MakeExpression(hashSha1_4, res_3);
// Build a projector for the expressions.
std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {expr_0, expr_1, expr_2, expr_3},
TestConfiguration(), &projector);
EXPECT_TRUE(status.ok()) << status.message();
// Create a row-batch with some sample data
int num_records = 2;
auto validity_array = {false, true};
auto array_int32 = MakeArrowArrayInt32({1, 0}, validity_array);
auto array_int64 = MakeArrowArrayInt64({1, 0}, validity_array);
auto array_float32 = MakeArrowArrayFloat32({1.0, 0.0}, validity_array);
auto array_float64 = MakeArrowArrayFloat64({1.0, 0.0}, validity_array);
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(
schema, num_records, {array_int32, array_int64, array_float32, array_float64});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
ASSERT_OK(status);
auto response_int32 = outputs.at(0);
auto response_int64 = outputs.at(1);
auto response_float32 = outputs.at(2);
auto response_float64 = outputs.at(3);
// Checks if the null and zero representation for numeric values
// are consistent between the types
EXPECT_ARROW_ARRAY_EQUALS(response_int32, response_int64);
EXPECT_ARROW_ARRAY_EQUALS(response_int64, response_float32);
EXPECT_ARROW_ARRAY_EQUALS(response_float32, response_float64);
const int sha1_hash_size = 40;
// Checks if the hash size in response is correct
for (int i = 1; i < num_records; ++i) {
const auto& value_at_position = response_int32->GetScalar(i).ValueOrDie()->ToString();
EXPECT_EQ(value_at_position.size(), sha1_hash_size);
EXPECT_NE(value_at_position,
response_int32->GetScalar(i - 1).ValueOrDie()->ToString());
}
}
TEST_F(TestHash, TestSha1Varlen) {
// schema for input fields
auto field_a = field("a", utf8());
auto schema = arrow::schema({field_a});
// output fields
auto res_0 = field("res0", utf8());
// build expressions.
// hashSHA1(a)
auto node_a = TreeExprBuilder::MakeField(field_a);
auto hashSha1 = TreeExprBuilder::MakeFunction("hashSHA1", {node_a}, utf8());
auto expr_0 = TreeExprBuilder::MakeExpression(hashSha1, res_0);
// Build a projector for the expressions.
std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {expr_0}, TestConfiguration(), &projector);
ASSERT_OK(status) << status.message();
// Create a row-batch with some sample data
int num_records = 3;
std::string first_string =
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn\nY [ˈʏpsilɔn], "
"Yen [jɛn], Yoga [ˈjoːgɑ]";
std::string second_string =
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeın\nY [ˈʏpsilɔn], "
"Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
auto array_a =
MakeArrowArrayUtf8({"", first_string, second_string}, {false, true, true});
// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
ASSERT_OK(status);
auto response = outputs.at(0);
const int sha1_hash_size = 40;
EXPECT_EQ(response->null_count(), 0);
// Checks that the null value was hashed
EXPECT_NE(response->GetScalar(0).ValueOrDie()->ToString(), "");
EXPECT_EQ(response->GetScalar(0).ValueOrDie()->ToString().size(), sha1_hash_size);
// Check that all generated hashes were different
for (int i = 1; i < num_records; ++i) {
const auto& value_at_position = response->GetScalar(i).ValueOrDie()->ToString();
EXPECT_EQ(value_at_position.size(), sha1_hash_size);
EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString());
}
}
} // namespace gandiva