blob: f333cab2797fac3c8bb432b8d84c98858c91692c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstdlib>
#include <exception>
#include <iostream>

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
// #0 Build dummy data to pass around
// To have some input data, we first create an Arrow Table that holds
// some data.
std::shared_ptr<arrow::Table> generate_table() {
arrow::Int64Builder i64builder;
PARQUET_THROW_NOT_OK(i64builder.Append({1, 2, 3, 4, 5}));
std::shared_ptr<arrow::Array> i64array;
PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
arrow::StringBuilder strbuilder;
PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
std::shared_ptr<arrow::Array> strarray;
PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
std::shared_ptr<arrow::Schema> schema = arrow::schema(
{arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
return arrow::Table::Make(schema, {i64array, strarray});
}
// #1 Write out the data as a Parquet file
// Serialises `table` into "parquet-arrow-example.parquet" (path relative to
// the current working directory). Failures are reported through
// PARQUET_THROW_NOT_OK.
void write_parquet_file(const arrow::Table& table) {
  // Size of a RowGroup in the parquet file. Normally you would choose this
  // to be rather large but for the example, we use a small value to have
  // multiple RowGroups.
  const int rows_per_group = 3;

  auto* const pool = arrow::default_memory_pool();

  std::shared_ptr<arrow::io::FileOutputStream> sink;
  PARQUET_THROW_NOT_OK(
      arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet", &sink));

  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(table, pool, sink, rows_per_group));
}
// #2: Fully read in the file
void read_whole_file() {
std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
std::shared_ptr<arrow::io::ReadableFile> infile;
PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
"parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(
parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
std::shared_ptr<arrow::Table> table;
PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
<< " columns." << std::endl;
}
// #3: Read only a single RowGroup of the parquet file
void read_single_rowgroup() {
std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
std::shared_ptr<arrow::io::ReadableFile> infile;
PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
"parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(
parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
std::shared_ptr<arrow::Table> table;
PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
<< " columns." << std::endl;
}
// #4: Read only a single column of the whole parquet file
void read_single_column() {
std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
std::shared_ptr<arrow::io::ReadableFile> infile;
PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
"parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(
parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
std::cout << std::endl;
}
// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
// from the Parquet file.
void read_single_column_chunk() {
std::cout << "Reading first ColumnChunk of the first RowGroup of "
"parquet-arrow-example.parquet"
<< std::endl;
std::shared_ptr<arrow::io::ReadableFile> infile;
PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
"parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(
parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
std::cout << std::endl;
}
// Runs the example end to end: builds the in-memory table, writes it to
// "parquet-arrow-example.parquet" and then reads it back in four different
// ways (whole file, one RowGroup, one column, one ColumnChunk).
//
// The command-line arguments are not used.
//
// Returns EXIT_SUCCESS when every step completes, EXIT_FAILURE otherwise.
int main(int /*argc*/, char** /*argv*/) {
  try {
    std::shared_ptr<arrow::Table> table = generate_table();
    write_parquet_file(*table);
    read_whole_file();
    read_single_rowgroup();
    read_single_column();
    read_single_column_chunk();
  } catch (const std::exception& e) {
    // The steps above signal failure by throwing (PARQUET_THROW_NOT_OK).
    // Without a handler the exception would escape main and end the process
    // through std::terminate; report it and exit with a failure status
    // instead.
    std::cerr << "parquet-arrow example failed: " << e.what() << std::endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}