Cpp API

Paimon C++ is a high-performance C++ implementation of Apache Paimon. Paimon C++ aims to provide a native, high-performance and extensible implementation that allows native engines to access the Paimon datalake format with maximum efficiency.

Environment Settings

Paimon C++ is currently governed under Alibaba open source community. You can checkout the document for more details about envinroment settings.

git clone https://github.com/alibaba/paimon-cpp.git
cd paimon-cpp
mkdir build-release
cd build-release
cmake ..
make -j8       # if you have 8 CPU cores, otherwise adjust
make install

Create Catalog

Before coming into contact with the Table, you need to create a Catalog.

#include "paimon/catalog/catalog.h"

// Note that keys and values are all string
std::map<std::string, std::string> options;
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::Catalog> catalog,
                       paimon::Catalog::Create(root_path, options));

Current C++ Paimon only supports filesystem catalog. In the future, we will support REST catalog. See [Catalog]({{< ref “concepts/catalog” >}}).

You can use the catalog to create table for writing data.

Create Database

Table is located in a database. If you want to create table in a new database, you should create it.

PAIMON_RETURN_NOT_OK(catalog->CreateDatabase('database_name', options, /*ignore_if_exists=*/false));

Create Table

Table schema contains fields definition, partition keys, primary keys, table options. The field definition is described by Arrow::Schema. All arguments except fields definition are optional.

for example:

arrow::FieldVector fields = {
    arrow::field("f0", arrow::utf8()),
    arrow::field("f1", arrow::int32()),
    arrow::field("f2", arrow::int32()),
    arrow::field("f3", arrow::float64()),
};
std::shared_ptr<arrow::Schema> schema = arrow::schema(fields);
::ArrowSchema arrow_schema;
arrow::Status arrow_status = arrow::ExportSchema(*schema, &arrow_schema);
if (!arrow_status.ok()) {
    return paimon::Status::Invalid(arrow_status.message());
}
PAIMON_RETURN_NOT_OK(catalog->CreateTable(paimon::Identifier(db_name, table_name),
                                            &arrow_schema,
                                            /*partition_keys=*/{},
                                            /*primary_keys=*/{}, options,
                                            /*ignore_if_exists=*/false));

See Data Types for all supported arrow-to-paimon data types mapping.

Batch Write

Paimon table write is Two-Phase Commit, you can write many times, but once committed, no more data can be written. C++ Paimon uses Apache Arrow as [in-memory format], check out document for more details.

for example:

arrow::Result<std::shared_ptr<arrow::StructArray>> PrepareData(const arrow::FieldVector& fields) {
    arrow::StringBuilder f0_builder;
    arrow::Int32Builder f1_builder;
    arrow::Int32Builder f2_builder;
    arrow::DoubleBuilder f3_builder;

    std::vector<std::tuple<std::string, int, int, double>> data = {
        {"Alice", 1, 0, 11.0}, {"Bob", 1, 1, 12.1}, {"Cathy", 1, 2, 13.2}};

    for (const auto& row : data) {
        ARROW_RETURN_NOT_OK(f0_builder.Append(std::get<0>(row)));
        ARROW_RETURN_NOT_OK(f1_builder.Append(std::get<1>(row)));
        ARROW_RETURN_NOT_OK(f2_builder.Append(std::get<2>(row)));
        ARROW_RETURN_NOT_OK(f3_builder.Append(std::get<3>(row)));
    }

    std::shared_ptr<arrow::Array> f0_array, f1_array, f2_array, f3_array;
    ARROW_RETURN_NOT_OK(f0_builder.Finish(&f0_array));
    ARROW_RETURN_NOT_OK(f1_builder.Finish(&f1_array));
    ARROW_RETURN_NOT_OK(f2_builder.Finish(&f2_array));
    ARROW_RETURN_NOT_OK(f3_builder.Finish(&f3_array));

    std::vector<std::shared_ptr<arrow::Array>> children = {f0_array, f1_array, f2_array, f3_array};
    auto struct_type = arrow::struct_(fields);
    return std::make_shared<arrow::StructArray>(struct_type, f0_array->length(), children);
}

std::string table_path = root_path + "/" + db_name + ".db/" + table_name;
std::string commit_user = "some_commit_user";
// write
paimon::WriteContextBuilder context_builder(table_path, commit_user);
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::WriteContext> write_context,
                        context_builder.SetOptions(options).Finish());
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::FileStoreWrite> writer,
                        paimon::FileStoreWrite::Create(std::move(write_context)));
// prepare data
auto struct_array = PrepareData(fields);
if (!struct_array.ok()) {
    return paimon::Status::Invalid(struct_array.status().ToString());
}
::ArrowArray arrow_array;
arrow_status = arrow::ExportArray(*struct_array.ValueUnsafe(), &arrow_array);
if (!arrow_status.ok()) {
    return paimon::Status::Invalid(arrow_status.message());
}
paimon::RecordBatchBuilder batch_builder(&arrow_array);
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::RecordBatch> record_batch,
                        batch_builder.Finish());
PAIMON_RETURN_NOT_OK(writer->Write(std::move(record_batch)));
PAIMON_ASSIGN_OR_RAISE(std::vector<std::shared_ptr<paimon::CommitMessage>> commit_message,
                        writer->PrepareCommit());

// commit
paimon::CommitContextBuilder commit_context_builder(table_path, commit_user);
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::CommitContext> commit_context,
                        commit_context_builder.SetOptions(options).Finish());
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::FileStoreCommit> committer,
                        paimon::FileStoreCommit::Create(std::move(commit_context)));
PAIMON_RETURN_NOT_OK(committer->Commit(commit_message));

Batch Read

Predicate pushdown

A ReadContextBuilder is used to pass context to reader, push down and filter is done by reader.

ReadContextBuilder read_context_builder(table_path);

You can use PredicateBuilder to build filters and pushdown them by ReadContextBuilder:

# Example filter: 'f3' > 12.0 OR 'f1' == 1
PAIMON_ASSIGN_OR_RAISE(
    auto predicate,
    PredicateBuilder::Or(
        {PredicateBuilder::GreaterThan(/*field_index=*/3, /*field_name=*/"f3",
                                        FieldType::DOUBLE, Literal(static_cast<double>(12.0))),
        PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT,
                                    Literal(1))}));
ReadContextBuilder read_context_builder(table_path);
read_context_builder.SetPredicate(predicate).EnablePredicateFilter(true);

You can also pushdown projection by ReadContextBuilder:

# select f3 and f2 columns
read_context_builder.SetReadSchema({"f3", "f1", "f2"});

Generate Splits

Then you can step into Scan Plan stage to get splits:

// scan
paimon::ScanContextBuilder scan_context_builder(table_path);
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::ScanContext> scan_context,
                        scan_context_builder.SetOptions(options).Finish());
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::TableScan> scanner,
                        paimon::TableScan::Create(std::move(scan_context)));
PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<paimon::Plan> plan, scanner->CreatePlan());
auto splits = plan->Splits();

Finally, you can read data from the splits to arrow format.

Read Apache Arrow

This requires C++ Arrow to be installed.

PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::ReadContext> read_context,
                        read_context_builder.SetOptions(options).Finish());
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::TableRead> table_read,
                        paimon::TableRead::Create(std::move(read_context)));
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<paimon::BatchReader> batch_reader,
                        table_read->CreateReader(splits));
arrow::ArrayVector result_array_vector;
while (true) {
    PAIMON_ASSIGN_OR_RAISE(paimon::BatchReader::ReadBatch batch, batch_reader->NextBatch());
    if (paimon::BatchReader::IsEofBatch(batch)) {
        break;
    }
    auto& [c_array, c_schema] = batch;
    auto arrow_result = arrow::ImportArray(c_array.get(), c_schema.get());
    if (!arrow_result.ok()) {
        return paimon::Status::Invalid(arrow_result.status().ToString());
    }
    auto result_array = arrow_result.ValueUnsafe();
    result_array_vector.push_back(result_array);
}
auto chunk_result = arrow::ChunkedArray::Make(result_array_vector);
if (!chunk_result.ok()) {
    return paimon::Status::Invalid(chunk_result.status().ToString());
}

Documentation

For more information, See C++ Paimon Documentation.