blob: a3810316f76043702ae9a2486826118793601be1 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <gtest/gtest.h>
#include "arrow/json/chunked_builder.h"
#include "arrow/json/converter.h"
#include "arrow/json/options.h"
#include "arrow/json/test_common.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"
namespace arrow {
namespace json {
using util::string_view;
using internal::checked_cast;
using internal::GetCpuThreadPool;
using internal::TaskGroup;
void AssertBuilding(const std::shared_ptr<ChunkedArrayBuilder>& builder,
const std::vector<std::string>& chunks,
std::shared_ptr<ChunkedArray>* out) {
ArrayVector unconverted;
auto options = ParseOptions::Defaults();
for (const auto& chunk : chunks) {
std::shared_ptr<Array> parsed;
ASSERT_OK(ParseFromString(options, chunk, &parsed));
unconverted.push_back(parsed);
}
int64_t i = 0;
for (const auto& parsed : unconverted) {
builder->Insert(i, field("", parsed->type()), parsed);
++i;
}
ASSERT_OK(builder->Finish(out));
ASSERT_OK((*out)->ValidateFull());
}
std::shared_ptr<ChunkedArray> ExtractField(const std::string& name,
const ChunkedArray& columns) {
auto chunks = columns.chunks();
for (auto& chunk : chunks) {
chunk = checked_cast<const StructArray&>(*chunk).GetFieldByName(name);
}
const auto& struct_type = checked_cast<const StructType&>(*columns.type());
return std::make_shared<ChunkedArray>(chunks, struct_type.GetFieldByName(name)->type());
}
// Test helper: asserts that the (possibly nested) field reached by following
// `path` through the struct column `columns` equals `expected`.
//
// The chunk counts of `expected` and `columns` are compared first; then each
// name in `path` is resolved in turn with ExtractField and the final result is
// compared with AssertChunkedEqual.
void AssertFieldEqual(const std::vector<std::string>& path,
                      const std::shared_ptr<ChunkedArray>& columns,
                      const ChunkedArray& expected) {
  ASSERT_EQ(expected.num_chunks(), columns->num_chunks()) << "# chunks unequal";

  std::shared_ptr<ChunkedArray> current = columns;
  for (auto name_it = path.begin(); name_it != path.end(); ++name_it) {
    current = ExtractField(*name_it, *current);
  }

  AssertChunkedEqual(expected, *current);
}
// A builder with the fixed schema struct<a: int32> that receives no chunks
// must finish into a column whose field "a" has zero chunks of type int32.
TEST(ChunkedArrayBuilder, Empty) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(), nullptr,
                                    struct_({field("a", int32())}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder, {}, &actual));

  ChunkedArray expected({}, int32());
  AssertFieldEqual({"a"}, actual, expected);
}
// A single chunk holding two rows of column "a" is converted to one int32
// chunk with the same values.
TEST(ChunkedArrayBuilder, Basics) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(), nullptr,
                                    struct_({field("a", int32())}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder, {RowsOfOneColumn("a", {123, -456})}, &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int32Type>({{123, -456}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Blocks may be inserted out of order: block 1 is inserted before block 0 and
// the finished column is expected to list the chunks in block-index order.
TEST(ChunkedArrayBuilder, Insert) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(), nullptr,
                                    struct_({field("a", int32())}), &builder));

  auto parse_options = ParseOptions::Defaults();
  std::shared_ptr<Array> parsed;

  // Insert the second block first...
  ASSERT_OK(ParseFromString(parse_options, RowsOfOneColumn("a", {-456}), &parsed));
  builder->Insert(1, field("", parsed->type()), parsed);

  // ...then the first block.
  ASSERT_OK(ParseFromString(parse_options, RowsOfOneColumn("a", {123}), &parsed));
  builder->Insert(0, field("", parsed->type()), parsed);

  std::shared_ptr<ChunkedArray> actual;
  ASSERT_OK(builder->Finish(&actual));
  // Apply the same full validation that AssertBuilding performs on every
  // finished column; this test bypasses that helper.
  ASSERT_OK(actual->ValidateFull());

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int32Type>({{123}, {-456}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Two input chunks produce two output chunks with matching row counts and
// values when conversion runs on a serial task group.
TEST(ChunkedArrayBuilder, MultipleChunks) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(), nullptr,
                                    struct_({field("a", int32())}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder,
                                         {
                                             RowsOfOneColumn("a", {1, 2, 3}),
                                             RowsOfOneColumn("a", {4, 5}),
                                         },
                                         &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int32Type>({{1, 2, 3}, {4, 5}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Same as MultipleChunks but with a threaded task group on the CPU thread
// pool: the output chunks must still appear in insertion (block-index) order.
TEST(ChunkedArrayBuilder, MultipleChunksParallel) {
  auto task_group = TaskGroup::MakeThreaded(GetCpuThreadPool());
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(), nullptr,
                                    struct_({field("a", int32())}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder,
                                         {
                                             RowsOfOneColumn("a", {1, 2}),
                                             RowsOfOneColumn("a", {3}),
                                             RowsOfOneColumn("a", {4, 5}),
                                             RowsOfOneColumn("a", {6, 7}),
                                         },
                                         &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int32Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
//////////////////////////////////////////////////////////////////////////
// Tests for type-inferring chunked array builders
// An inferring builder (promotion graph supplied, empty starting schema) that
// receives no chunks finishes with an empty struct type and zero chunks.
TEST(InferringChunkedArrayBuilder, Empty) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder, {}, &actual));

  ASSERT_TRUE(actual->type()->Equals(*struct_({})));
  ASSERT_EQ(actual->num_chunks(), 0);
}
// A single chunk in which field "a" is either absent or null: the inferred
// type is expected to be struct<a: null> and all three rows are kept.
TEST(InferringChunkedArrayBuilder, SingleChunkNull) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder,
                     {
                         "{}\n" + RowsOfOneColumn("a", {"null", "null"}),
                     },
                     &actual));

  ASSERT_TRUE(actual->type()->Equals(*struct_({field("a", null())})));
  ASSERT_EQ(actual->length(), 3);
}
// Several chunks in which field "a" is absent or null everywhere: the inferred
// type is expected to stay struct<a: null> and the nine rows are all kept.
TEST(InferringChunkedArrayBuilder, MultipleChunkNull) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder,
                     {
                         "{}\n{}\n",
                         "{}\n" + RowsOfOneColumn("a", {"null", "null"}),
                         RowsOfOneColumn("a", {"null"}),
                         RowsOfOneColumn("a", {"null", "null"}) + "{}\n",
                     },
                     &actual));

  ASSERT_TRUE(actual->type()->Equals(*struct_({field("a", null())})));
  ASSERT_EQ(actual->length(), 9);
}
// A single chunk mixing a missing field, two integers and a null: field "a"
// is expected to be inferred as int64 with nulls in the first and last rows.
TEST(InferringChunkedArrayBuilder, SingleChunkInteger) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(
      builder,
      {
          "{}\n" + RowsOfOneColumn("a", {123, 456}) + RowsOfOneColumn("a", {"null"}),
      },
      &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int64Type>({{false, true, true, false}}, {{0, 123, 456, 0}},
                                    &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Integers first appear in the last chunk: the earlier all-missing / all-null
// chunks are expected to come out as int64 chunks consisting only of nulls.
TEST(InferringChunkedArrayBuilder, MultipleChunkInteger) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder,
                     {
                         "{}\n{}\n",
                         RowsOfOneColumn("a", {"null"}),
                         "{}\n" + RowsOfOneColumn("a", {123, 456}),
                     },
                     &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int64Type>({{false, false}, {false}, {false, true, true}},
                                    {{0, 0}, {0}, {0, 123, 456}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// A single chunk mixing a missing field, two floating-point numbers and a
// null: field "a" is expected to be inferred as double with nulls in the first
// and last rows.
TEST(InferringChunkedArrayBuilder, SingleChunkDouble) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(
      builder,
      {
          "{}\n" + RowsOfOneColumn("a", {0.0, 12.5}) + RowsOfOneColumn("a", {"null"}),
      },
      &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<DoubleType>({{false, true, true, false}},
                                     {{0.0, 0.0, 12.5, 0.0}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// An integer chunk followed by a chunk containing 12.5: every chunk of field
// "a", including the integer one, is expected to come out as double.
TEST(InferringChunkedArrayBuilder, MultipleChunkDouble) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder,
                                         {
                                             "{}\n{}\n",
                                             RowsOfOneColumn("a", {"null"}),
                                             RowsOfOneColumn("a", {8}),
                                             RowsOfOneColumn("a", {"null", "12.5"}),
                                         },
                                         &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<DoubleType>({{false, false}, {false}, {true}, {false, true}},
                                     {{0.0, 0.0}, {0.0}, {8.0}, {0.0, 12.5}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// A single chunk whose strings are a date and a date-time: field "a" is
// expected to be inferred as a second-resolution timestamp, with the missing
// and null rows coming out as nulls.
TEST(InferringChunkedArrayBuilder, SingleChunkTimestamp) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(
      builder,
      {
          "{}\n" + RowsOfOneColumn("a", {"null", "\"1970-01-01\"",
                                         "\"2018-11-13 17:11:10\""}),
      },
      &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
                                        {{false, false, true, true}},
                                        {{0, 0, 0, 1542129070}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Timestamp-like strings spread over separate chunks: every chunk of field
// "a" is expected to come out as a second-resolution timestamp, with the
// all-missing and all-null chunks consisting only of nulls.
TEST(InferringChunkedArrayBuilder, MultipleChunkTimestamp) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder,
                     {
                         "{}\n{}\n",
                         RowsOfOneColumn("a", {"null"}),
                         RowsOfOneColumn("a", {"\"1970-01-01\""}),
                         RowsOfOneColumn("a", {"\"2018-11-13 17:11:10\""}),
                     },
                     &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
                                        {{false, false}, {false}, {true}, {true}},
                                        {{0, 0}, {0}, {0}, {1542129070}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// A single chunk of plain strings (including an empty one and a non-ASCII
// one): field "a" is expected to be inferred as string, with the missing and
// null rows coming out as nulls.
TEST(InferringChunkedArrayBuilder, SingleChunkString) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(
      builder,
      {
          "{}\n" + RowsOfOneColumn("a", {"null", "\"\"", "null", "\"foo\"", "\"baré\""}),
      },
      &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<StringType, std::string>(
      {{false, false, true, false, true, true}}, {{"", "", "", "", "foo", "baré"}},
      &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// One chunk holds a timestamp-like string while the others hold plain strings:
// every chunk of field "a" is expected to come out as string, keeping
// "1970-01-01" as text.
TEST(InferringChunkedArrayBuilder, MultipleChunkString) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(
      AssertBuilding(builder,
                     {
                         "{}\n{}\n",
                         RowsOfOneColumn("a", {"\"\"", "null"}),
                         RowsOfOneColumn("a", {"\"1970-01-01\""}),
                         RowsOfOneColumn("a", {"\"\"", "\"baré\""}),
                     },
                     &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<StringType, std::string>(
      {{false, false}, {true, false}, {true}, {true, true}},
      {{"", ""}, {"", ""}, {"1970-01-01"}, {"", "baré"}}, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// Stress test for type inference on a threaded task group: 1024 chunks of four
// consecutive integers each are expected to come out as int64 chunks in
// insertion order.
TEST(InferringChunkedArrayBuilder, MultipleChunkIntegerParallel) {
  auto task_group = TaskGroup::MakeThreaded(GetCpuThreadPool());
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  constexpr int kNumChunks = 1 << 10;
  std::vector<std::string> chunks;
  std::vector<std::vector<int>> expected_chunks;
  chunks.reserve(kNumChunks);
  expected_chunks.reserve(kNumChunks);
  for (int i = 0; i < kNumChunks; ++i) {
    expected_chunks.push_back({i, i + 1, i + 2, i + 3});
    chunks.push_back(RowsOfOneColumn("a", {i, i + 1, i + 2, i + 3}));
  }

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder, chunks, &actual));

  std::shared_ptr<ChunkedArray> expected;
  ChunkedArrayFromVector<Int64Type>(expected_chunks, &expected);
  AssertFieldEqual({"a"}, actual, *expected);
}
// A single chunk with a missing field, an empty list and a list of integers:
// field "a" is expected to be inferred as list<int64>, with the missing row
// coming out as null.
TEST(InferringChunkedArrayBuilder, SingleChunkList) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(
      builder,
      {
          std::string("{}\n") + "{\"a\": []}\n" + "{\"a\": [1, 2]}\n",
      },
      &actual));

  auto expected = ChunkedArrayFromJSON(list(int64()), {"[null, [], [1, 2]]"});
  AssertFieldEqual({"a"}, actual, *expected);
}
// The same three rows as SingleChunkList, one per chunk: every chunk of field
// "a" is expected to come out as list<int64>, including the chunk where the
// field is missing and the chunk holding only an empty list.
TEST(InferringChunkedArrayBuilder, MultipleChunkList) {
  auto task_group = TaskGroup::MakeSerial();
  std::shared_ptr<ChunkedArrayBuilder> builder;
  ASSERT_OK(MakeChunkedArrayBuilder(task_group, default_memory_pool(),
                                    GetPromotionGraph(), struct_({}), &builder));

  std::shared_ptr<ChunkedArray> actual;
  // A fatal failure inside AssertBuilding only returns from that helper; stop
  // the test here so a possibly unset `actual` is never dereferenced below.
  ASSERT_NO_FATAL_FAILURE(AssertBuilding(builder,
                                         {
                                             "{}\n",
                                             "{\"a\": []}\n",
                                             "{\"a\": [1, 2]}\n",
                                         },
                                         &actual));

  auto expected = ChunkedArrayFromJSON(list(int64()), {"[null]", "[[]]", "[[1, 2]]"});
  AssertFieldEqual({"a"}, actual, *expected);
}
} // namespace json
} // namespace arrow