| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| // (Doc section: File I/O) |
| |
| // (Doc section: Includes) |
| #include <arrow/api.h> |
| #include <arrow/csv/api.h> |
| #include <arrow/io/api.h> |
| #include <arrow/ipc/api.h> |
| #include <parquet/arrow/reader.h> |
| #include <parquet/arrow/writer.h> |
| |
| #include <iostream> |
| // (Doc section: Includes) |
| |
| // (Doc section: GenInitialFile) |
| arrow::Status GenInitialFile() { |
| // Make a couple 8-bit integer arrays and a 16-bit integer array -- just like |
| // basic Arrow example. |
| arrow::Int8Builder int8builder; |
| int8_t days_raw[5] = {1, 12, 17, 23, 28}; |
| ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5)); |
| std::shared_ptr<arrow::Array> days; |
| ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish()); |
| |
| int8_t months_raw[5] = {1, 3, 5, 7, 1}; |
| ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5)); |
| std::shared_ptr<arrow::Array> months; |
| ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish()); |
| |
| arrow::Int16Builder int16builder; |
| int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995}; |
| ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5)); |
| std::shared_ptr<arrow::Array> years; |
| ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish()); |
| |
| // Get a vector of our Arrays |
| std::vector<std::shared_ptr<arrow::Array>> columns = {days, months, years}; |
| |
| // Make a schema to initialize the Table with |
| std::shared_ptr<arrow::Field> field_day, field_month, field_year; |
| std::shared_ptr<arrow::Schema> schema; |
| |
| field_day = arrow::field("Day", arrow::int8()); |
| field_month = arrow::field("Month", arrow::int8()); |
| field_year = arrow::field("Year", arrow::int16()); |
| |
| schema = arrow::schema({field_day, field_month, field_year}); |
| // With the schema and data, create a Table |
| std::shared_ptr<arrow::Table> table; |
| table = arrow::Table::Make(schema, columns); |
| |
| // Write out test files in IPC, CSV, and Parquet for the example to use. |
| std::shared_ptr<arrow::io::FileOutputStream> outfile; |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.arrow")); |
| ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer, |
| arrow::ipc::MakeFileWriter(outfile, schema)); |
| ARROW_RETURN_NOT_OK(ipc_writer->WriteTable(*table)); |
| ARROW_RETURN_NOT_OK(ipc_writer->Close()); |
| |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.csv")); |
| ARROW_ASSIGN_OR_RAISE(auto csv_writer, |
| arrow::csv::MakeCSVWriter(outfile, table->schema())); |
| ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*table)); |
| ARROW_RETURN_NOT_OK(csv_writer->Close()); |
| |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.parquet")); |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, 5)); |
| |
| return arrow::Status::OK(); |
| } |
| // (Doc section: GenInitialFile) |
| |
| // (Doc section: RunMain) |
| arrow::Status RunMain() { |
| // (Doc section: RunMain) |
| // (Doc section: Gen Files) |
| // Generate initial files for each format with a helper function -- don't worry, |
| // we'll also write a table in this example. |
| ARROW_RETURN_NOT_OK(GenInitialFile()); |
| // (Doc section: Gen Files) |
| |
| // (Doc section: ReadableFile Definition) |
| // First, we have to set up a ReadableFile object, which just lets us point our |
| // readers to the right data on disk. We'll be reusing this object, and rebinding |
| // it to multiple files throughout the example. |
| std::shared_ptr<arrow::io::ReadableFile> infile; |
| // (Doc section: ReadableFile Definition) |
| // (Doc section: Arrow ReadableFile Open) |
| // Get "test_in.arrow" into our file pointer |
| ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open( |
| "test_in.arrow", arrow::default_memory_pool())); |
| // (Doc section: Arrow ReadableFile Open) |
| // (Doc section: Arrow Read Open) |
| // Open up the file with the IPC features of the library, gives us a reader object. |
| ARROW_ASSIGN_OR_RAISE(auto ipc_reader, arrow::ipc::RecordBatchFileReader::Open(infile)); |
| // (Doc section: Arrow Read Open) |
| // (Doc section: Arrow Read) |
| // Using the reader, we can read Record Batches. Note that this is specific to IPC; |
| // for other formats, we focus on Tables, but here, RecordBatches are used. |
| std::shared_ptr<arrow::RecordBatch> rbatch; |
| ARROW_ASSIGN_OR_RAISE(rbatch, ipc_reader->ReadRecordBatch(0)); |
| // (Doc section: Arrow Read) |
| |
| // (Doc section: Arrow Write Open) |
| // Just like with input, we get an object for the output file. |
| std::shared_ptr<arrow::io::FileOutputStream> outfile; |
| // Bind it to "test_out.arrow" |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.arrow")); |
| // (Doc section: Arrow Write Open) |
| // (Doc section: Arrow Writer) |
| // Set up a writer with the output file -- and the schema! We're defining everything |
| // here, loading to fire. |
| ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer, |
| arrow::ipc::MakeFileWriter(outfile, rbatch->schema())); |
| // (Doc section: Arrow Writer) |
| // (Doc section: Arrow Write) |
| // Write the record batch. |
| ARROW_RETURN_NOT_OK(ipc_writer->WriteRecordBatch(*rbatch)); |
| // (Doc section: Arrow Write) |
| // (Doc section: Arrow Close) |
| // Specifically for IPC, the writer needs to be explicitly closed. |
| ARROW_RETURN_NOT_OK(ipc_writer->Close()); |
| // (Doc section: Arrow Close) |
| |
| // (Doc section: CSV Read Open) |
| // Bind our input file to "test_in.csv" |
| ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.csv")); |
| // (Doc section: CSV Read Open) |
| // (Doc section: CSV Table Declare) |
| std::shared_ptr<arrow::Table> csv_table; |
| // (Doc section: CSV Table Declare) |
| // (Doc section: CSV Reader Make) |
| // The CSV reader has several objects for various options. For now, we'll use defaults. |
| ARROW_ASSIGN_OR_RAISE( |
| auto csv_reader, |
| arrow::csv::TableReader::Make( |
| arrow::io::default_io_context(), infile, arrow::csv::ReadOptions::Defaults(), |
| arrow::csv::ParseOptions::Defaults(), arrow::csv::ConvertOptions::Defaults())); |
| // (Doc section: CSV Reader Make) |
| // (Doc section: CSV Read) |
| // Read the table. |
| ARROW_ASSIGN_OR_RAISE(csv_table, csv_reader->Read()) |
| // (Doc section: CSV Read) |
| |
| // (Doc section: CSV Write) |
| // Bind our output file to "test_out.csv" |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.csv")); |
| // The CSV writer has simpler defaults, review API documentation for more complex usage. |
| ARROW_ASSIGN_OR_RAISE(auto csv_writer, |
| arrow::csv::MakeCSVWriter(outfile, csv_table->schema())); |
| ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*csv_table)); |
| // Not necessary, but a safe practice. |
| ARROW_RETURN_NOT_OK(csv_writer->Close()); |
| // (Doc section: CSV Write) |
| |
| // (Doc section: Parquet Read Open) |
| // Bind our input file to "test_in.parquet" |
| ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.parquet")); |
| // (Doc section: Parquet Read Open) |
| // (Doc section: Parquet FileReader) |
| std::unique_ptr<parquet::arrow::FileReader> reader; |
| // (Doc section: Parquet FileReader) |
| // (Doc section: Parquet OpenFile) |
| // Note that Parquet's OpenFile() takes the reader by reference, rather than returning |
| // a reader. |
| PARQUET_ASSIGN_OR_THROW(reader, |
| parquet::arrow::OpenFile(infile, arrow::default_memory_pool())); |
| // (Doc section: Parquet OpenFile) |
| |
| // (Doc section: Parquet Read) |
| std::shared_ptr<arrow::Table> parquet_table; |
| // Read the table. |
| PARQUET_THROW_NOT_OK(reader->ReadTable(&parquet_table)); |
| // (Doc section: Parquet Read) |
| |
| // (Doc section: Parquet Write) |
| // Parquet writing does not need a declared writer object. Just get the output |
| // file bound, then pass in the table, memory pool, output, and chunk size for |
| // breaking up the Table on-disk. |
| ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.parquet")); |
| PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable( |
| *parquet_table, arrow::default_memory_pool(), outfile, 5)); |
| // (Doc section: Parquet Write) |
| // (Doc section: Return) |
| return arrow::Status::OK(); |
| } |
| // (Doc section: Return) |
| |
| // (Doc section: Main) |
| int main() { |
| arrow::Status st = RunMain(); |
| if (!st.ok()) { |
| std::cerr << st << std::endl; |
| return 1; |
| } |
| return 0; |
| } |
| // (Doc section: Main) |
| // (Doc section: File I/O) |