// blob: 327f1bf528ca6665e654ef1ea19aca2e811a0d21 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <arrow/compute/api.h>
#include <cstddef>
#include <iostream>
#include <memory>
#include <ostream>
#include <string>
#include "arrow/api.h"
#include "arrow/filesystem/api.h"
#include "graphar/arrow/chunk_reader.h"
#include "graphar/arrow/chunk_writer.h"
#include "graphar/fwd.h"
#include "graphar/graph_info.h"
#include "graphar/types.h"
#include "parquet/arrow/writer.h"
#include "graphar/api/high_level_writer.h"
#include "graphar/writer_util.h"
#include "./util.h"
#include <catch2/catch_test_macros.hpp>
std::shared_ptr<arrow::Table> read_csv_to_table(const std::string& filename) {
arrow::csv::ReadOptions read_options{};
arrow::csv::ParseOptions parse_options{};
arrow::csv::ConvertOptions convert_options{};
parse_options.delimiter = '|';
auto input =
arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool())
.ValueOrDie();
auto reader = arrow::csv::TableReader::Make(arrow::io::default_io_context(),
input, read_options,
parse_options, convert_options)
.ValueOrDie();
std::shared_ptr<arrow::Table> table;
table = reader->Read().ValueOrDie();
return table;
}
namespace graphar {
// Round-trips a multi-valued "emails" property for one randomly picked row:
//   1. read the person CSV and split the ';'-separated "emails" strings into
//      a list<string> column that replaces the original string column,
//   2. check the rebuilt in-memory list against the original CSV string,
//   3. write the table with VertexPropertyWriter under /tmp/ldbc/parquet/ and
//      read the "emails" property group back with
//      VertexPropertyArrowChunkReader, checking the same row again.
TEST_CASE_METHOD(GlobalFixture, "read multi-properties from csv file") {
  // read the person csv file as an arrow table
  auto person_table = read_csv_to_table(test_data_dir + "/ldbc/person_0_0.csv");
  // The modulo below would divide by zero on an empty table.
  REQUIRE(person_table->num_rows() > 0);
  auto seed = static_cast<unsigned int>(time(nullptr));
  const int64_t expected_row = rand_r(&seed) % person_table->num_rows();

  auto person_schema = person_table->schema();
  const int email_col_idx = person_schema->GetFieldIndex("emails");
  REQUIRE(email_col_idx != -1);
  auto email_chunks = person_table->column(email_col_idx);

  arrow::MemoryPool* pool = arrow::default_memory_pool();
  auto value_builder = std::make_shared<arrow::StringBuilder>();
  arrow::ListBuilder builder(pool, value_builder);

  // expected_row is an index into the whole column, so the expected string is
  // captured while walking all chunks with a running table-wide row index
  // rather than being read from chunk 0 only.
  std::string expected_emails;
  int64_t table_row = 0;
  for (int chunk_idx = 0; chunk_idx < email_chunks->num_chunks();
       ++chunk_idx) {
    auto email_column = std::static_pointer_cast<arrow::StringArray>(
        email_chunks->chunk(chunk_idx));
    for (int64_t row = 0; row < email_column->length(); ++row, ++table_row) {
      // Open a new list slot for this row; it stays empty for null rows.
      REQUIRE(builder.Append().ok());
      if (table_row == expected_row) {
        expected_emails = email_column->GetString(row);
      }
      if (email_column->IsValid(row)) {
        std::string emails_string = email_column->GetString(row);
        auto row_emails = SplitString(emails_string, ';');
        for (const auto& email : row_emails) {
          REQUIRE(value_builder->Append(email).ok());
        }
      }
    }
  }
  std::shared_ptr<arrow::Array> array;
  // Finish() reports failure through its Status; do not ignore it.
  REQUIRE(builder.Finish(&array).ok());
  auto person_emails_chunked_array = std::make_shared<arrow::ChunkedArray>(
      std::vector<std::shared_ptr<arrow::Array>>{array});

  // Replace the string "emails" column with the list<string> one, appended as
  // the last column of both the schema and the table.
  person_table = person_table->RemoveColumn(email_col_idx).ValueOrDie();
  person_schema = person_schema->RemoveField(email_col_idx).ValueOrDie();
  person_schema =
      person_schema
          ->AddField(person_schema->num_fields(),
                     arrow::field("emails", arrow::list(arrow::utf8())))
          .ValueOrDie();
  person_table = person_table
                     ->AddColumn(person_table->num_columns(),
                                 person_schema->fields().back(),
                                 person_emails_chunked_array)
                     .ValueOrDie();

  // Re-join the list values of the picked row and compare with the CSV text.
  auto index = person_schema->GetFieldIndex("emails");
  auto emails_col = person_table->column(index)->chunk(0);
  auto result = std::static_pointer_cast<arrow::ListArray>(
      emails_col->View(arrow::list(arrow::utf8())).ValueOrDie());
  auto values = std::static_pointer_cast<arrow::StringArray>(result->values());
  int64_t start = result->value_offset(expected_row);
  int64_t end = result->value_offset(expected_row + 1);
  std::string emails = "";
  for (int64_t i = start; i < end; ++i) {
    emails += values->GetString(i);
    if (i < end - 1)
      emails += ";";
  }
  std::cout << "random row: " << expected_row << std::endl;
  REQUIRE(expected_emails == emails);

  // write to parquet file
  std::string path = test_data_dir + "/ldbc/parquet/" + "ldbc.graph.yml";
  auto graph_info = graphar::GraphInfo::Load(path).value();
  auto vertex_info = graph_info->GetVertexInfo("person");
  auto maybe_writer =
      VertexPropertyWriter::Make(vertex_info, "/tmp/ldbc/parquet/");
  REQUIRE(!maybe_writer.has_error());
  auto writer = maybe_writer.value();
  REQUIRE(writer->WriteTable(person_table, 0).ok());
  REQUIRE(writer->WriteVerticesNum(person_table->num_rows()).ok());

  // read the "emails" property group back and position on the picked row
  auto maybe_reader = VertexPropertyArrowChunkReader::Make(
      vertex_info, vertex_info->GetPropertyGroup("emails"),
      "/tmp/ldbc/parquet/");
  REQUIRE(maybe_reader.status().ok());
  auto reader = maybe_reader.value();
  REQUIRE(reader->seek(expected_row).ok());
  auto table_result = reader->GetChunk();
  REQUIRE(table_result.status().ok());
  auto table = table_result.value();
  index = table->schema()->GetFieldIndex("emails");
  emails_col = table->column(index)->chunk(0);
  result = std::static_pointer_cast<arrow::ListArray>(
      emails_col->View(arrow::list(arrow::large_utf8())).ValueOrDie());
  // NOTE(review): reading slot 0 assumes GetChunk() returns a table that
  // starts at the row passed to seek() - confirm against the reader.
  auto email_result =
      std::static_pointer_cast<arrow::LargeStringArray>(result->value_slice(0));
  emails = "";
  end = email_result->length();
  for (int64_t i = 0; i < end; ++i) {
    emails += email_result->GetString(i);
    if (i < end - 1)
      emails += ";";
  }
  std::cout << emails << std::endl;
  REQUIRE(expected_emails == emails);
}
// Exercises graphar::builder::VerticesBuilder with multi-valued properties on
// the "person" vertex loaded from person.vertex.yml: cardinality validation
// errors, a successful dump under /tmp/, and reading the dumped "emails"
// values back through VertexPropertyArrowChunkReader.
TEST_CASE_METHOD(GlobalFixture, "TestMultiProperty high level builder") {
  int vertex_count = 3;
  std::vector<std::string> property_names = {"id", "emails"};
  std::vector<int64_t> id = {0, 1, 2};
  std::vector<std::vector<std::string>> emails = {
      {"john@example.com", "john.work@example.com"},
      {"jane@example.com"},
      {"alice@example.com", "alice123@example.com",
       "a.wonderland@example.com"}};
  std::string vertex_meta_file =
      test_data_dir + "/ldbc/parquet/" + "person.vertex.yml";
  auto vertex_meta = graphar::Yaml::LoadFile(vertex_meta_file).value();
  auto vertex_info = graphar::VertexInfo::Load(vertex_meta).value();
  graphar::IdType start_index = 0;

  // The same value added twice with SET cardinality must be rejected with a
  // KeyError under strong validation.
  SECTION("add sample values to set property") {
    graphar::builder::VerticesBuilder builder(
        vertex_info, "/tmp/", start_index, nullptr,
        graphar::ValidateLevel::strong_validate);
    // prepare vertex data (shadows the outer fixtures on purpose)
    int vertex_count = 1;
    std::vector<std::vector<std::string>> emails = {
        {"john@example.com", "john@example.com"}};
    // add vertices
    for (int i = 0; i < vertex_count; i++) {
      graphar::builder::Vertex v;
      for (const auto& email : emails[i]) {
        v.AddProperty(graphar::Cardinality::SET, "emails",
                      email);  // Multi-property
      }
      REQUIRE(builder.AddVertex(v).IsKeyError());
    }
  }

  // A single value added without an explicit cardinality is accepted for the
  // "emails" property.
  SECTION("test add single values to set property") {
    graphar::builder::VerticesBuilder builder(
        vertex_info, "/tmp/", start_index, nullptr,
        graphar::ValidateLevel::strong_validate);
    // prepare vertex data (shadows the outer fixtures on purpose)
    int vertex_count = 3;
    std::vector<std::string> emails = {"john@example.com", "jane@example.com",
                                       "alice@example.com"};
    for (int i = 0; i < vertex_count; i++) {
      graphar::builder::Vertex v;
      v.AddProperty("emails", emails[i]);
      REQUIRE(builder.AddVertex(v).ok());
    }
  }

  // Adding a LIST-cardinality value to "single_email", which is created
  // without an explicit cardinality, must fail with a TypeError.
  SECTION("test add multi values to single property") {
    auto single_email =
        CreatePropertyGroup({Property("single_email", string(), false)},
                            FileType::PARQUET, "single_email/");
    auto test_vertex_info = vertex_info->AddPropertyGroup(single_email).value();
    graphar::builder::VerticesBuilder builder(
        test_vertex_info, "/tmp/", start_index, nullptr,
        graphar::ValidateLevel::strong_validate);
    for (int i = 0; i < vertex_count; i++) {
      graphar::builder::Vertex v;
      v.AddProperty(graphar::Cardinality::LIST, "single_email", emails[i]);
      REQUIRE(builder.AddVertex(v).IsTypeError());
    }
  }

  // Duplicate values added with LIST cardinality to a property declared with
  // Cardinality::SET must fail with a KeyError.
  SECTION("test add multi values to set property") {
    auto set_email = CreatePropertyGroup(
        {Property("set_email", string(), false, true, Cardinality::SET)},
        FileType::PARQUET, "set_email/");
    auto test_vertex_info = vertex_info->AddPropertyGroup(set_email).value();
    graphar::builder::VerticesBuilder builder(
        test_vertex_info, "/tmp/", start_index, nullptr,
        graphar::ValidateLevel::strong_validate);
    // prepare vertex data (shadows the outer fixtures on purpose)
    int vertex_count = 1;
    std::vector<std::vector<std::string>> emails = {
        {"john@example.com", "john@example.com"}};
    // add vertices
    for (int i = 0; i < vertex_count; i++) {
      graphar::builder::Vertex v;
      for (const auto& email : emails[i]) {
        v.AddProperty(graphar::Cardinality::LIST, "set_email",
                      email);  // Multi-property
      }
      REQUIRE(builder.AddVertex(v).IsKeyError());
    }
  }

  // Builds the three fixture vertices (id + SET-cardinality emails) and dumps
  // them under /tmp/.
  SECTION("test write to file") {
    graphar::builder::VerticesBuilder builder(
        vertex_info, "/tmp/", start_index, nullptr,
        graphar::ValidateLevel::strong_validate);
    // add vertices
    for (int i = 0; i < vertex_count; i++) {
      graphar::builder::Vertex v;
      v.AddProperty(property_names[0], id[i]);
      for (const auto& email : emails[i]) {
        v.AddProperty(graphar::Cardinality::SET, property_names[1], email);
      }
      REQUIRE(builder.AddVertex(v).ok());
    }
    auto st = builder.Dump();
    std::cout << st.message() << std::endl;
    REQUIRE(st.ok());
  }

  // NOTE(review): presumably relies on the files produced by the
  // "test write to file" section in an earlier pass - confirm.
  SECTION("test read from file") {
    // read from file
    auto maybe_reader = graphar::VertexPropertyArrowChunkReader::Make(
        vertex_info, vertex_info->GetPropertyGroup(property_names[1]), "/tmp/");
    REQUIRE(maybe_reader.status().ok());
    auto reader = maybe_reader.value();
    REQUIRE(reader->seek(0).ok());
    auto table_result = reader->GetChunk();
    REQUIRE(table_result.status().ok());
    auto table = table_result.value();
    auto index = table->schema()->GetFieldIndex(property_names[1]);
    REQUIRE(index != -1);
    auto emails_col = table->column(index)->chunk(0);
    auto result = std::static_pointer_cast<arrow::ListArray>(
        emails_col->View(arrow::list(arrow::large_utf8())).ValueOrDie());
    REQUIRE(result->length() == 3);
    for (int64_t i = 0; i < result->length(); i++) {
      auto email_result = std::static_pointer_cast<arrow::LargeStringArray>(
          result->value_slice(i));
      // Without this check the loop below would index past emails[i] when
      // more values are stored, and pass vacuously when fewer are stored.
      REQUIRE(email_result->length() ==
              static_cast<int64_t>(emails[i].size()));
      for (int64_t j = 0; j < email_result->length(); j++) {
        REQUIRE(emails[i][j] == email_result->GetString(j));
      }
    }
  }
}
} // namespace graphar