// cpp/examples/arrow/row-wise-conversion-example.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements. See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership. The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License. You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
 #include <memory>
 #include <vector>

 #include <arrow/api.h>

 using arrow::DoubleBuilder;
 using arrow::Int64Builder;
 using arrow::ListBuilder;

 // While we want to use columnar data structures to build efficient operations, we
 // often receive data in a row-wise fashion from other systems. In the following,
 // we want to give a brief introduction to the classes provided by Apache Arrow by
 // showing how to transform row-wise data into a columnar table.
 //
 // The data in this example is stored in the following struct:
 struct data_row {
   // Identifier of the row; stored in the int64 `id` column of the table.
   std::int64_t id;
   // Total cost of the row; stored in the float64 `cost` column of the table.
   double cost;
   // Individual cost components; stored as one list<float64> entry per row in
   // the `cost_components` column. May hold any number of values.
   std::vector<double> cost_components;
 };

 // Transforming a vector of structs into a columnar Table.
 //
 // The final representation should be an `arrow::Table` which in turn
 // is made up of an `arrow::Schema` and a list of
 // `arrow::ChunkedArray` instances. As the first step, we will iterate
 // over the data and build up the arrays incrementally.  For this
 // task, we provide `arrow::ArrayBuilder` classes that help in the
 // construction of the final `arrow::Array` instances.
 //
 // For each type, Arrow has a specially typed builder class. For the primitive
 // values `id` and `cost` we can use the respective `arrow::Int64Builder` and
 // `arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two
 // builders, a top-level `arrow::ListBuilder` that builds the array of offsets and
 // a nested `arrow::DoubleBuilder` that constructs the underlying values array that
 // is referenced by the offsets in the former array.
 arrow::Status VectorToColumnarTable(const std::vector<struct data_row>& rows,
                                     std::shared_ptr<arrow::Table>* table) {
   // Every builder below allocates from Arrow's default memory pool.
   arrow::MemoryPool* memory_pool = arrow::default_memory_pool();

   Int64Builder ids(memory_pool);
   DoubleBuilder costs(memory_pool);
   // The list builder records where each row's list starts, while the nested
   // DoubleBuilder (owned by the list builder) collects the flattened values.
   ListBuilder component_lists(memory_pool,
                               std::make_shared<DoubleBuilder>(memory_pool));
   DoubleBuilder* component_values =
       static_cast<DoubleBuilder*>(component_lists.value_builder());

   // Feed the rows into the builders one by one. Each call returns an
   // `arrow::Status`; ARROW_RETURN_NOT_OK propagates the first failure to the
   // caller instead of silently continuing.
   for (const data_row& row : rows) {
     ARROW_RETURN_NOT_OK(ids.Append(row.id));
     ARROW_RETURN_NOT_OK(costs.Append(row.cost));

     // Open a new list entry for this row ...
     ARROW_RETURN_NOT_OK(component_lists.Append());
     // ... and append the row's values to the nested values builder. No validity
     // information is passed, so every appended value counts as non-null.
     const std::vector<double>& components = row.cost_components;
     ARROW_RETURN_NOT_OK(
         component_values->AppendValues(components.data(), components.size()));
   }

   // Turn the builders into immutable arrays. Finishing the list builder also
   // finishes the nested values builder it owns.
   std::shared_ptr<arrow::Array> id_array;
   ARROW_RETURN_NOT_OK(ids.Finish(&id_array));
   std::shared_ptr<arrow::Array> cost_array;
   ARROW_RETURN_NOT_OK(costs.Finish(&cost_array));
   std::shared_ptr<arrow::Array> cost_components_array;
   ARROW_RETURN_NOT_OK(component_lists.Finish(&cost_components_array));

   // Describe the three columns and assemble the schema from them.
   std::vector<std::shared_ptr<arrow::Field>> fields = {
       arrow::field("id", arrow::int64()),
       arrow::field("cost", arrow::float64()),
       arrow::field("cost_components", arrow::list(arrow::float64()))};
   auto schema = std::make_shared<arrow::Schema>(fields);

   // Hand the finished table to the caller through the output parameter. The
   // table holds shared pointers to the arrays, so they stay alive after the
   // locals above go out of scope.
   *table = arrow::Table::Make(schema, {id_array, cost_array, cost_components_array});
   return arrow::Status::OK();
 }

 // Converts `table` back into the row-wise representation, appending one
 // `data_row` per table row to `*rows`.
 //
 // Returns `arrow::Status::Invalid` (leaving `*rows` untouched) if either
 // argument is null, if the table's schema differs from
 // (id: int64, cost: float64, cost_components: list<float64>), or if a column's
 // length disagrees with the table's row count. Each column may consist of any
 // number of chunks, and the chunk layout may differ between columns.
 //
 // Simplification of this example: null entries are not handled, i.e. every
 // slot is assumed to hold a valid value.
 arrow::Status ColumnarTableToVector(const std::shared_ptr<arrow::Table>& table,
                                     std::vector<struct data_row>* rows) {
   if (table == nullptr || rows == nullptr) {
     return arrow::Status::Invalid("Table and output vector must not be null!");
   }

   // Checking the schema is sufficient to know that the columns have the types
   // the casts below rely on.
   std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
       arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
       arrow::field("cost_components", arrow::list(arrow::float64()))};
   auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);

   if (!expected_schema->Equals(*table->schema())) {
     // The table doesn't have the expected schema thus we cannot directly
     // convert it to our target representation.
     return arrow::Status::Invalid("Schemas are not matching!");
   }

   // The rows are filled by index below, so make sure no column can run past
   // the slots we are about to create.
   const int64_t num_rows = table->num_rows();
   for (int c = 0; c < table->num_columns(); ++c) {
     if (table->column(c)->length() != num_rows) {
       return arrow::Status::Invalid("Column lengths are not matching!");
     }
   }

   // Create the output slots up front and fill them column by column. Walking
   // every chunk of every column (instead of only chunk 0) keeps the conversion
   // correct for tables whose columns are split into several chunks or have no
   // chunks at all.
   const std::size_t first_new_row = rows->size();
   rows->resize(first_new_row + static_cast<std::size_t>(num_rows));

   // Column 0: `id`. Value(i) already accounts for any slicing offset.
   const auto id_column = table->column(0);
   std::size_t out = first_new_row;
   for (int c = 0; c < id_column->num_chunks(); ++c) {
     const auto ids = std::static_pointer_cast<arrow::Int64Array>(id_column->chunk(c));
     for (int64_t i = 0; i < ids->length(); ++i) {
       (*rows)[out++].id = ids->Value(i);
     }
   }

   // Column 1: `cost`.
   const auto cost_column = table->column(1);
   out = first_new_row;
   for (int c = 0; c < cost_column->num_chunks(); ++c) {
     const auto costs =
         std::static_pointer_cast<arrow::DoubleArray>(cost_column->chunk(c));
     for (int64_t i = 0; i < costs->length(); ++i) {
       (*rows)[out++].cost = costs->Value(i);
     }
   }

   // Column 2: `cost_components`. The list array only stores offsets into a
   // flat values array, so the values are copied through a raw pointer.
   const auto components_column = table->column(2);
   out = first_new_row;
   for (int c = 0; c < components_column->num_chunks(); ++c) {
     const auto lists =
         std::static_pointer_cast<arrow::ListArray>(components_column->chunk(c));
     const auto values = std::static_pointer_cast<arrow::DoubleArray>(lists->values());
     // To enable zero-copy slices, the raw values pointer has to account for the
     // slicing offset of the values array; GetValues does that for buffer 1
     // (the data buffer).
     const double* values_ptr = values->data()->GetValues<double>(1);
     for (int64_t i = 0; i < lists->length(); ++i) {
       const double* first = values_ptr + lists->value_offset(i);
       const double* last = values_ptr + lists->value_offset(i + 1);
       // Fill the row's vector in place instead of building and copying a
       // temporary vector.
       (*rows)[out++].cost_components.assign(first, last);
     }
   }

   return arrow::Status::OK();
 }

 // Evaluates `expr` (an expression yielding an arrow::Status) exactly once. If
 // the status is not OK, prints its message to std::cerr and returns
 // EXIT_FAILURE from the enclosing function; otherwise execution continues.
 //
 // The macro deliberately does not end in a semicolon: the caller's
 // `EXIT_ON_FAILURE(x);` then expands to a single statement, which keeps it
 // usable as the body of an unbraced if/else.
 #define EXIT_ON_FAILURE(expr)                      \
   do {                                             \
     arrow::Status status_ = (expr);                \
     if (!status_.ok()) {                           \
       std::cerr << status_.message() << std::endl; \
       return EXIT_FAILURE;                         \
     }                                              \
   } while (0)

 int main(int argc, char** argv) {
   std::vector<data_row> rows = {
       {1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}};

   std::shared_ptr<arrow::Table> table;
   EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table));

   std::vector<data_row> expected_rows;
   EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows));

   assert(rows.size() == expected_rows.size());

   return EXIT_SUCCESS;
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <cstdint>
	#include <iostream>
	#include <vector>

	#include <arrow/api.h>

	using arrow::DoubleBuilder;
	using arrow::Int64Builder;
	using arrow::ListBuilder;

	// While we want to use columnar data structures to build efficient operations, we
	// often receive data in a row-wise fashion from other systems. In the following,
	// we want to give a brief introduction to the classes provided by Apache Arrow by
	// showing how to transform row-wise data into a columnar table.
	//
	// The data in this example is stored in the following struct:
	struct data_row {
	  // Identifier of the row; stored in the int64 `id` column of the table.
	  std::int64_t id;
	  // Total cost of the row; stored in the float64 `cost` column of the table.
	  double cost;
	  // Individual cost components; stored as one list<float64> entry per row in
	  // the `cost_components` column. May hold any number of values.
	  std::vector<double> cost_components;
	};

	// Transforming a vector of structs into a columnar Table.
	//
	// The final representation should be an `arrow::Table` which in turn
	// is made up of an `arrow::Schema` and a list of
	// `arrow::ChunkedArray` instances. As the first step, we will iterate
	// over the data and build up the arrays incrementally. For this
	// task, we provide `arrow::ArrayBuilder` classes that help in the
	// construction of the final `arrow::Array` instances.
	//
	// For each type, Arrow has a specially typed builder class. For the primitive
	// values `id` and `cost` we can use the respective `arrow::Int64Builder` and
	// `arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two
	// builders, a top-level `arrow::ListBuilder` that builds the array of offsets and
	// a nested `arrow::DoubleBuilder` that constructs the underlying values array that
	// is referenced by the offsets in the former array.
	arrow::Status VectorToColumnarTable(const std::vector<struct data_row>& rows,
	                                    std::shared_ptr<arrow::Table>* table) {
	  // All builders allocate from Arrow's default memory pool.
	  arrow::MemoryPool* pool = arrow::default_memory_pool();

	  Int64Builder id_builder(pool);
	  DoubleBuilder cost_builder(pool);
	  ListBuilder components_builder(pool, std::make_shared<DoubleBuilder>(pool));
	  // The following builder is owned by components_builder. value_builder()
	  // returns a pointer to the generic builder base, so it has to be cast to a
	  // DoubleBuilder pointer and dereferenced before it can be bound to the
	  // reference.
	  DoubleBuilder& cost_components_builder =
	      *(static_cast<DoubleBuilder*>(components_builder.value_builder()));

	  // Now we can loop over our existing data and insert it into the builders. The
	  // `Append` calls here may fail (e.g. we cannot allocate enough additional
	  // memory). Thus we need to check their return values. For more information on
	  // these values, check the documentation about `arrow::Status`.
	  for (const data_row& row : rows) {
	    ARROW_RETURN_NOT_OK(id_builder.Append(row.id));
	    ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost));

	    // Indicate the start of a new list row. This will memorise the current
	    // offset in the values builder.
	    ARROW_RETURN_NOT_OK(components_builder.Append());
	    // Store the actual values. No validity information is passed, so the
	    // underlying builder treats all added values as valid, i.e. non-null.
	    ARROW_RETURN_NOT_OK(cost_components_builder.AppendValues(
	        row.cost_components.data(), row.cost_components.size()));
	  }

	  // At the end, we finalise the arrays, declare the (type) schema and combine
	  // them into a single `arrow::Table`:
	  std::shared_ptr<arrow::Array> id_array;
	  ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array));
	  std::shared_ptr<arrow::Array> cost_array;
	  ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array));
	  // No need to invoke cost_components_builder.Finish because it is implied by
	  // the parent builder's Finish invocation.
	  std::shared_ptr<arrow::Array> cost_components_array;
	  ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array));

	  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
	      arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
	      arrow::field("cost_components", arrow::list(arrow::float64()))};

	  auto schema = std::make_shared<arrow::Schema>(schema_vector);

	  // The final `table` variable is the one we then can pass on to other functions
	  // that can consume Apache Arrow memory structures. The table holds shared
	  // pointers to the arrays, so they stay alive once we leave the scope of this
	  // function.
	  *table = arrow::Table::Make(schema, {id_array, cost_array, cost_components_array});

	  return arrow::Status::OK();
	}

	// Converts `table` back into the row-wise representation, appending one
	// `data_row` per table row to `*rows`.
	//
	// Returns `arrow::Status::Invalid` (leaving `*rows` untouched) if either
	// argument is null, if the table's schema differs from
	// (id: int64, cost: float64, cost_components: list<float64>), or if a column's
	// length disagrees with the table's row count. Each column may consist of any
	// number of chunks, and the chunk layout may differ between columns.
	//
	// Simplification of this example: null entries are not handled, i.e. every
	// slot is assumed to hold a valid value.
	arrow::Status ColumnarTableToVector(const std::shared_ptr<arrow::Table>& table,
	                                    std::vector<struct data_row>* rows) {
	  if (table == nullptr || rows == nullptr) {
	    return arrow::Status::Invalid("Table and output vector must not be null!");
	  }

	  // Checking the schema is sufficient to know that the columns have the types
	  // the casts below rely on.
	  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
	      arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
	      arrow::field("cost_components", arrow::list(arrow::float64()))};
	  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);

	  if (!expected_schema->Equals(*table->schema())) {
	    // The table doesn't have the expected schema thus we cannot directly
	    // convert it to our target representation.
	    return arrow::Status::Invalid("Schemas are not matching!");
	  }

	  // The rows are filled by index below, so make sure no column can run past
	  // the slots we are about to create.
	  const int64_t num_rows = table->num_rows();
	  for (int c = 0; c < table->num_columns(); ++c) {
	    if (table->column(c)->length() != num_rows) {
	      return arrow::Status::Invalid("Column lengths are not matching!");
	    }
	  }

	  // Create the output slots up front and fill them column by column. Walking
	  // every chunk of every column (instead of only chunk 0) keeps the conversion
	  // correct for tables whose columns are split into several chunks or have no
	  // chunks at all.
	  const std::size_t first_new_row = rows->size();
	  rows->resize(first_new_row + static_cast<std::size_t>(num_rows));

	  // Column 0: `id`. Value(i) already accounts for any slicing offset.
	  const auto id_column = table->column(0);
	  std::size_t out = first_new_row;
	  for (int c = 0; c < id_column->num_chunks(); ++c) {
	    const auto ids = std::static_pointer_cast<arrow::Int64Array>(id_column->chunk(c));
	    for (int64_t i = 0; i < ids->length(); ++i) {
	      (*rows)[out++].id = ids->Value(i);
	    }
	  }

	  // Column 1: `cost`.
	  const auto cost_column = table->column(1);
	  out = first_new_row;
	  for (int c = 0; c < cost_column->num_chunks(); ++c) {
	    const auto costs =
	        std::static_pointer_cast<arrow::DoubleArray>(cost_column->chunk(c));
	    for (int64_t i = 0; i < costs->length(); ++i) {
	      (*rows)[out++].cost = costs->Value(i);
	    }
	  }

	  // Column 2: `cost_components`. The list array only stores offsets into a
	  // flat values array, so the values are copied through a raw pointer.
	  const auto components_column = table->column(2);
	  out = first_new_row;
	  for (int c = 0; c < components_column->num_chunks(); ++c) {
	    const auto lists =
	        std::static_pointer_cast<arrow::ListArray>(components_column->chunk(c));
	    const auto values = std::static_pointer_cast<arrow::DoubleArray>(lists->values());
	    // To enable zero-copy slices, the raw values pointer has to account for the
	    // slicing offset of the values array; GetValues does that for buffer 1
	    // (the data buffer).
	    const double* values_ptr = values->data()->GetValues<double>(1);
	    for (int64_t i = 0; i < lists->length(); ++i) {
	      const double* first = values_ptr + lists->value_offset(i);
	      const double* last = values_ptr + lists->value_offset(i + 1);
	      // Fill the row's vector in place instead of building and copying a
	      // temporary vector.
	      (*rows)[out++].cost_components.assign(first, last);
	    }
	  }

	  return arrow::Status::OK();
	}

	// Evaluates `expr` (an expression yielding an arrow::Status) exactly once. If
	// the status is not OK, prints its message to std::cerr and returns
	// EXIT_FAILURE from the enclosing function; otherwise execution continues.
	//
	// The macro deliberately does not end in a semicolon: the caller's
	// `EXIT_ON_FAILURE(x);` then expands to a single statement, which keeps it
	// usable as the body of an unbraced if/else.
	#define EXIT_ON_FAILURE(expr)                      \
	  do {                                             \
	    arrow::Status status_ = (expr);                \
	    if (!status_.ok()) {                           \
	      std::cerr << status_.message() << std::endl; \
	      return EXIT_FAILURE;                         \
	    }                                              \
	  } while (0)

	int main(int argc, char** argv) {
	std::vector<data_row> rows = {
	{1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}};

	std::shared_ptr<arrow::Table> table;
	EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table));

	std::vector<data_row> expected_rows;
	EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows));

	assert(rows.size() == expected_rows.size());

	return EXIT_SUCCESS;
	}