// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// (Doc section: Dataset Example)
// (Doc section: Includes)
#include <arrow/api.h>
#include <arrow/dataset/api.h>
// We use Parquet headers for setting up examples; they are not required for using
// datasets.
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <unistd.h>  // for getcwd() -- POSIX-only
#include <iostream>
// (Doc section: Includes)
// (Doc section: Helper Functions)
// Generate some data for the rest of this example.
arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
  // This code should look familiar from the basic Arrow example, and is not the
  // focus of this example. However, we need data to work with, and this makes it!
auto schema =
arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()),
arrow::field("c", arrow::int64())});
std::shared_ptr<arrow::Array> array_a;
std::shared_ptr<arrow::Array> array_b;
std::shared_ptr<arrow::Array> array_c;
arrow::NumericBuilder<arrow::Int64Type> builder;
ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
ARROW_RETURN_NOT_OK(builder.Finish(&array_a));
builder.Reset();
ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
ARROW_RETURN_NOT_OK(builder.Finish(&array_b));
builder.Reset();
ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
ARROW_RETURN_NOT_OK(builder.Finish(&array_c));
return arrow::Table::Make(schema, {array_a, array_b, array_c});
}
// Set up a dataset by writing two Parquet files.
arrow::Result<std::string> CreateExampleParquetDataset(
const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
const std::string& root_path) {
  // Much like CreateTable(), this is a utility that gets us the dataset we'll be
  // reading from. Don't worry, we also write a dataset in the example proper.
auto base_path = root_path + "parquet_dataset";
ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
// Create an Arrow Table
ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
// Write it into two Parquet files
ARROW_ASSIGN_OR_RAISE(auto output,
filesystem->OpenOutputStream(base_path + "/data1.parquet"));
ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
*table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
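  // Note: WriteTable's fourth argument is the maximum number of rows per Parquet
  // row group; 2048 far exceeds our 5-row slices, so each file holds a single
  // row group.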
ARROW_ASSIGN_OR_RAISE(output,
filesystem->OpenOutputStream(base_path + "/data2.parquet"));
ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
*table->Slice(5), arrow::default_memory_pool(), output, 2048));
return base_path;
}
arrow::Status PrepareEnv() {
  // Prepare our environment for reading by doing some quick writing first.
  ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable());
std::shared_ptr<arrow::fs::FileSystem> setup_fs;
// Note this operates in the directory the executable is built in.
char setup_path[256];
  char* result = getcwd(setup_path, sizeof(setup_path));
  if (result == nullptr) {
    return arrow::Status::IOError("Fetching the current working directory failed.");
}
ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path));
ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs, ""));
return arrow::Status::OK();
}
// (Doc section: Helper Functions)
// (Doc section: RunMain)
arrow::Status RunMain() {
// (Doc section: RunMain)
// (Doc section: PrepareEnv)
ARROW_RETURN_NOT_OK(PrepareEnv());
// (Doc section: PrepareEnv)
// (Doc section: FileSystem Declare)
// First, we need a filesystem object, which lets us interact with our local
// filesystem starting at a given path. For the sake of simplicity, that'll be
// the current directory.
std::shared_ptr<arrow::fs::FileSystem> fs;
// (Doc section: FileSystem Declare)
// (Doc section: FileSystem Init)
  // Get the current working directory and use it to make the FileSystem object.
  char init_path[256];
  char* result = getcwd(init_path, sizeof(init_path));
  if (result == nullptr) {
    return arrow::Status::IOError("Fetching the current working directory failed.");
}
ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
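  // An aside: FileSystemFromUriOrPath also accepts URIs, so a sketch like the
  // following (not run here; "my-bucket" is a hypothetical name, and S3 support
  // must be built into your Arrow install) could target other filesystems:
  //   ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUri("s3://my-bucket"));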
// (Doc section: FileSystem Init)
// (Doc section: FileSelector Declare)
// A file selector lets us actually traverse a multi-file dataset.
arrow::fs::FileSelector selector;
// (Doc section: FileSelector Declare)
// (Doc section: FileSelector Config)
selector.base_dir = "parquet_dataset";
// Recursive is a safe bet if you don't know the nesting of your dataset.
selector.recursive = true;
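  // For instance, with recursive traversal a nested layout like
  //   parquet_dataset/year=2020/month=1/data.parquet
  // (hypothetical, for illustration) would also be picked up; with recursive set
  // to false, only files directly under base_dir are seen.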
// (Doc section: FileSelector Config)
// (Doc section: FileSystemFactoryOptions)
// Making an options object lets us configure our dataset reading.
arrow::dataset::FileSystemFactoryOptions options;
// We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition
// schema. We won't set any other options, defaults are fine.
options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
// (Doc section: FileSystemFactoryOptions)
// (Doc section: File Format Setup)
auto read_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
// (Doc section: File Format Setup)
// (Doc section: FileSystemDatasetFactory Make)
  // Now, we get a factory that can build our dataset -- note that we don't have
  // the dataset itself yet!
ARROW_ASSIGN_OR_RAISE(auto factory, arrow::dataset::FileSystemDatasetFactory::Make(
fs, selector, read_format, options));
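  // The factory can also be probed before finishing -- for example, this sketch
  // (not run here) would ask it to infer the dataset's unified schema:
  //   ARROW_ASSIGN_OR_RAISE(auto inferred_schema, factory->Inspect());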
// (Doc section: FileSystemDatasetFactory Make)
// (Doc section: FileSystemDatasetFactory Finish)
// Now we build our dataset from the factory.
ARROW_ASSIGN_OR_RAISE(auto read_dataset, factory->Finish());
// (Doc section: FileSystemDatasetFactory Finish)
// (Doc section: Dataset Fragments)
// Print out the fragments
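  // For the dataset written by PrepareEnv(), expect one fragment per Parquet
  // file: data1.parquet and data2.parquet.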
ARROW_ASSIGN_OR_RAISE(auto fragments, read_dataset->GetFragments());
for (const auto& fragment : fragments) {
std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl;
std::cout << "Partition expression: "
<< (*fragment)->partition_expression().ToString() << std::endl;
}
// (Doc section: Dataset Fragments)
// (Doc section: Read Scan Builder)
  // Scan the dataset into a Table -- once this is done, you can do normal Table
  // things with it, like computation and printing. Note, however, that the
  // entire dataset is now materialized in memory.
ARROW_ASSIGN_OR_RAISE(auto read_scan_builder, read_dataset->NewScan());
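  // The scan can also be configured before finishing -- a sketch (not run here)
  // that would read only column "b", keeping rows where it exceeds 2:
  //   ARROW_RETURN_NOT_OK(read_scan_builder->Project({"b"}));
  //   ARROW_RETURN_NOT_OK(read_scan_builder->Filter(arrow::compute::greater(
  //       arrow::compute::field_ref("b"), arrow::compute::literal(2))));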
// (Doc section: Read Scan Builder)
// (Doc section: Read Scanner)
ARROW_ASSIGN_OR_RAISE(auto read_scanner, read_scan_builder->Finish());
// (Doc section: Read Scanner)
// (Doc section: To Table)
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, read_scanner->ToTable());
std::cout << table->ToString();
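  // Since this is now an ordinary Table, the compute API applies -- a sketch
  // (not run here; assumes <arrow/compute/api.h> is included) that would sum
  // column "a":
  //   ARROW_ASSIGN_OR_RAISE(arrow::Datum sum,
  //                         arrow::compute::Sum(table->GetColumnByName("a")));
  //   std::cout << sum.scalar()->ToString() << std::endl;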
// (Doc section: To Table)
// (Doc section: TableBatchReader)
  // Now, let's get the Table back out to disk as a dataset!
  // We make a RecordBatchReader from our Table, then set up a scanner over it,
  // which lets us write to files.
std::shared_ptr<arrow::TableBatchReader> write_dataset =
std::make_shared<arrow::TableBatchReader>(table);
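  // TableBatchReader streams the Table's contents as record batches without
  // copying. A sketch of reading one batch by hand (not run here, since the
  // scanner below consumes the reader):
  //   std::shared_ptr<arrow::RecordBatch> batch;
  //   ARROW_RETURN_NOT_OK(write_dataset->ReadNext(&batch));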
// (Doc section: TableBatchReader)
// (Doc section: WriteScanner)
auto write_scanner_builder =
arrow::dataset::ScannerBuilder::FromRecordBatchReader(write_dataset);
  ARROW_ASSIGN_OR_RAISE(auto write_scanner, write_scanner_builder->Finish());
// (Doc section: WriteScanner)
// (Doc section: Partition Schema)
// The partition schema determines which fields are used as keys for partitioning.
auto partition_schema = arrow::schema({arrow::field("a", arrow::utf8())});
// (Doc section: Partition Schema)
// (Doc section: Partition Create)
// We'll use Hive-style partitioning, which creates directories with "key=value"
// pairs.
auto partitioning =
std::make_shared<arrow::dataset::HivePartitioning>(partition_schema);
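  // With our data, partitioning on "a" (values 0 through 9) will produce ten
  // directories, a=0 through a=9, under the base directory.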
// (Doc section: Partition Create)
// (Doc section: Write Format)
// Now, we declare we'll be writing Parquet files.
auto write_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
// (Doc section: Write Format)
// (Doc section: Write Options)
  // This time, we make an options object for writing -- there's quite a bit more
  // to configure.
arrow::dataset::FileSystemDatasetWriteOptions write_options;
// Defaults to start.
write_options.file_write_options = write_format->DefaultWriteOptions();
// (Doc section: Write Options)
// (Doc section: Options FS)
// Use the filesystem we already have.
write_options.filesystem = fs;
// (Doc section: Options FS)
// (Doc section: Options Target)
  // Write to the folder "write_dataset" in the current directory.
write_options.base_dir = "write_dataset";
// (Doc section: Options Target)
// (Doc section: Options Partitioning)
// Use the partitioning declared above.
write_options.partitioning = partitioning;
// (Doc section: Options Partitioning)
// (Doc section: Options Name Template)
  // Define the naming template for the files that make up the dataset.
write_options.basename_template = "part{i}.parquet";
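  // "{i}" is replaced by an auto-incremented integer, so each partition
  // directory gets part0.parquet, part1.parquet, and so on.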
// (Doc section: Options Name Template)
// (Doc section: Options File Behavior)
  // Set behavior to overwrite existing data -- specifically, this lets the
  // example run more than once, overwriting whatever is already there.
write_options.existing_data_behavior =
arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore;
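  // Other behaviors exist: kError (the default) refuses to write where data
  // already exists, and kDeleteMatchingPartitions clears out matching partition
  // directories before writing.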
// (Doc section: Options File Behavior)
// (Doc section: Write Dataset)
// Write to disk!
ARROW_RETURN_NOT_OK(
arrow::dataset::FileSystemDataset::Write(write_options, write_scanner));
// (Doc section: Write Dataset)
// (Doc section: Ret)
return arrow::Status::OK();
}
// (Doc section: Ret)
// (Doc section: Main)
int main() {
arrow::Status st = RunMain();
if (!st.ok()) {
std::cerr << st << std::endl;
return 1;
}
return 0;
}
// (Doc section: Main)
// (Doc section: Dataset Example)