| /* |
| * Copyright 2024-present Alibaba Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #pragma once |
| |
| #include <cstdint> |
| #include <map> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "paimon/result.h" |
| #include "paimon/visibility.h" |
| |
| struct ArrowArray; |
| |
| namespace paimon { |
| /// `RecordBatch` encapsulates a batch of data with the same schema, supporting different types such |
| /// as `INSERT`, `UPDATE_BEFORE`, `UPDATE_AFTER`, and `DELETE`. It is typically used in streaming |
| /// write or batch processing scenarios, with underlying data stored in the Apache Arrow format. |
| /// @note Do not use this class directly, use `RecordBatchBuilder` to build a `RecordBatch` which |
| /// has input validation. |
| class PAIMON_EXPORT RecordBatch { |
| public: |
| enum class PAIMON_EXPORT RowKind : int8_t { |
| INSERT = 0, |
| UPDATE_BEFORE = 1, |
| UPDATE_AFTER = 2, |
| DELETE = 3, |
| }; |
| |
| /// @note 1. Data cannot be reused, as it will be released after Write. 2. If a partition |
| /// field's value is null, it should be represented as "__DEFAULT_PARTITION__"(or a user-defined |
| /// default value) in the partition map. However, in the Arrow array, partition column values |
| /// MUST NOT be set to "__DEFAULT_PARTITION__" (or a user-defined default value). Instead, they |
| /// should be properly set as actual nulls. If used, it may lead to behavioral inconsistencies |
| /// between C++ Paimon and Java Paimon. |
| RecordBatch(const std::map<std::string, std::string>& partition, int32_t bucket, |
| const std::vector<RowKind>& row_kinds, ArrowArray* data); |
| ~RecordBatch(); |
| |
| RecordBatch(RecordBatch&&); |
| RecordBatch& operator=(RecordBatch&&); |
| |
| RecordBatch(const RecordBatch&) = delete; |
| RecordBatch& operator=(const RecordBatch&) = delete; |
| |
| const std::map<std::string, std::string>& GetPartition() const { |
| return partition_; |
| } |
| |
| int32_t GetBucket() const { |
| return bucket_; |
| } |
| |
| ArrowArray* GetData() const { |
| return data_; |
| } |
| |
| const std::vector<RecordBatch::RowKind>& GetRowKind() const { |
| return row_kinds_; |
| } |
| |
| void SetBucket(int32_t bucket) { |
| bucket_ = bucket; |
| } |
| |
| bool HasSpecifiedBucket() const; |
| |
| private: |
| std::map<std::string, std::string> partition_; |
| int32_t bucket_; |
| std::vector<RecordBatch::RowKind> row_kinds_; |
| ::ArrowArray* data_; |
| }; |
| |
| /// Builder for constructing `RecordBatch` instances. |
| /// |
| /// This class provides a convenient way to build `RecordBatch` objects by setting |
| /// various properties such as data, row kinds, partition information, and bucket id. |
| class PAIMON_EXPORT RecordBatchBuilder { |
| public: |
| /// Constructs a `RecordBatchBuilder` with Arrow data |
| /// |
| /// @note The `data` must conform to table schema: |
| /// - Each array in `data` corresponds to a field in table schema. |
| /// - If a field in table schema is marked as non-nullable (`nullable = false`), |
| /// the corresponding array in `data` must have zero null entries. |
| /// |
| /// @note Consistency between `data` and table schema will be validated during the write |
| /// process. |
| /// |
| /// @param data ArrowArray struct containing the columnar data (via C Data Interface) |
| explicit RecordBatchBuilder(::ArrowArray* data); |
| |
| ~RecordBatchBuilder(); |
| |
| /// Move new Arrow data into the builder, replacing existing data. |
| /// @param data New Arrow array data. |
| RecordBatchBuilder& MoveData(::ArrowArray* data); |
| |
| /// Set the row kinds for each record in the batch. |
| /// @param row_kinds A vector of row kinds, including INSERT, UPDATE_BEFORE, UPDATE_AFTER and |
| /// DELETE. If not set, default value is `INSERT`. |
| /// @note `row_kinds` must have the same length as the number of records in the data. |
| RecordBatchBuilder& SetRowKinds(const std::vector<RecordBatch::RowKind>& row_kinds); |
| |
| /// Set the partition information for this record batch. |
| /// @param data Map of partition column names to their string values. |
| RecordBatchBuilder& SetPartition(const std::map<std::string, std::string>& data); |
| |
| /// Set the bucket id for this record batch. If not set, default value is `-1`. |
| /// @param bucket The bucket id for data distribution. |
| RecordBatchBuilder& SetBucket(int32_t bucket); |
| |
| /// Build and return the final `RecordBatch` instance. |
| /// |
| /// This method validates the configuration and creates `RecordBatch` with all |
| /// the specified properties. |
| Result<std::unique_ptr<RecordBatch>> Finish(); |
| |
| class Impl; |
| |
| private: |
| std::unique_ptr<Impl> impl_; |
| }; |
| |
| } // namespace paimon |