| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
#include <cstdlib>
#include <exception>
#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
| |
| // #0 Build dummy data to pass around |
| // To have some input data, we first create an Arrow Table that holds |
| // some data. |
| std::shared_ptr<arrow::Table> generate_table() { |
| arrow::Int64Builder i64builder; |
| PARQUET_THROW_NOT_OK(i64builder.Append({1, 2, 3, 4, 5})); |
| std::shared_ptr<arrow::Array> i64array; |
| PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array)); |
| |
| arrow::StringBuilder strbuilder; |
| PARQUET_THROW_NOT_OK(strbuilder.Append("some")); |
| PARQUET_THROW_NOT_OK(strbuilder.Append("string")); |
| PARQUET_THROW_NOT_OK(strbuilder.Append("content")); |
| PARQUET_THROW_NOT_OK(strbuilder.Append("in")); |
| PARQUET_THROW_NOT_OK(strbuilder.Append("rows")); |
| std::shared_ptr<arrow::Array> strarray; |
| PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray)); |
| |
| std::shared_ptr<arrow::Schema> schema = arrow::schema( |
| {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())}); |
| |
| return arrow::Table::Make(schema, {i64array, strarray}); |
| } |
| |
| // #1 Write out the data as a Parquet file |
| void write_parquet_file(const arrow::Table& table) { |
| std::shared_ptr<arrow::io::FileOutputStream> outfile; |
| PARQUET_THROW_NOT_OK( |
| arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet", &outfile)); |
| // The last argument to the function call is the size of the RowGroup in |
| // the parquet file. Normally you would choose this to be rather large but |
| // for the example, we use a small value to have multiple RowGroups. |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); |
| } |
| |
| // #2: Fully read in the file |
| void read_whole_file() { |
| std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; |
| std::shared_ptr<arrow::io::ReadableFile> infile; |
| PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open( |
| "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile)); |
| |
| std::unique_ptr<parquet::arrow::FileReader> reader; |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); |
| std::shared_ptr<arrow::Table> table; |
| PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); |
| std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() |
| << " columns." << std::endl; |
| } |
| |
| // #3: Read only a single RowGroup of the parquet file |
| void read_single_rowgroup() { |
| std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl; |
| std::shared_ptr<arrow::io::ReadableFile> infile; |
| PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open( |
| "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile)); |
| |
| std::unique_ptr<parquet::arrow::FileReader> reader; |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); |
| std::shared_ptr<arrow::Table> table; |
| PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table)); |
| std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() |
| << " columns." << std::endl; |
| } |
| |
| // #4: Read only a single column of the whole parquet file |
| void read_single_column() { |
| std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; |
| std::shared_ptr<arrow::io::ReadableFile> infile; |
| PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open( |
| "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile)); |
| |
| std::unique_ptr<parquet::arrow::FileReader> reader; |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); |
| std::shared_ptr<arrow::Array> array; |
| PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); |
| PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); |
| std::cout << std::endl; |
| } |
| |
| // #5: Read only a single column of a RowGroup (this is known as ColumnChunk) |
| // from the Parquet file. |
| void read_single_column_chunk() { |
| std::cout << "Reading first ColumnChunk of the first RowGroup of " |
| "parquet-arrow-example.parquet" |
| << std::endl; |
| std::shared_ptr<arrow::io::ReadableFile> infile; |
| PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open( |
| "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile)); |
| |
| std::unique_ptr<parquet::arrow::FileReader> reader; |
| PARQUET_THROW_NOT_OK( |
| parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); |
| std::shared_ptr<arrow::Array> array; |
| PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); |
| PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); |
| std::cout << std::endl; |
| } |
| |
| int main(int argc, char** argv) { |
| std::shared_ptr<arrow::Table> table = generate_table(); |
| write_parquet_file(*table); |
| read_whole_file(); |
| read_single_rowgroup(); |
| read_single_column(); |
| read_single_column_chunk(); |
| } |