blob: fd7f61bfe3aff6b80e95b02543e4b13dc653bf3f [file] [log] [blame]
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include "paimon/data/timestamp.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"
namespace paimon {
class MemoryPool;
/// A split of input data consumed by read operations.
///
/// Most batch computation engines need this abstraction. The class is an interface:
/// `GetFileList()` is pure virtual and must be supplied by concrete splits.
class PAIMON_EXPORT DataSplit : public Split {
 public:
    /// Lightweight description of a single data file.
    ///
    /// Bundles the file's location, size, row count, sequence-number range, schema id,
    /// level, creation time and optional deleted-row count, so that a file can be
    /// tracked without loading its actual content.
    struct SimpleDataFileMeta {
        // ---- Fields -----------------------------------------------------------------
        // NOTE: the declaration order below is also the initialization order used by
        // the constructor; keep both in sync when adding fields.

        /// Absolute path of the data file.
        ///
        /// When external path is enabled, this is the file's actual location in the
        /// external storage system.
        std::string file_path;

        /// Size of the data file (presumably in bytes -- TODO confirm the unit).
        int64_t file_size;

        /// Number of rows recorded for the file.
        int64_t row_count;

        /// Minimum sequence number associated with the file.
        int64_t min_sequence_number;

        /// Maximum sequence number associated with the file.
        int64_t max_sequence_number;

        /// Identifier of the schema the file refers to.
        int64_t schema_id;

        /// Level of the file (presumably its LSM-tree level -- TODO confirm).
        int32_t level;

        /// Creation time of the file.
        Timestamp creation_time;

        /// Deleted-row count, when one is provided.
        ///
        /// NOTE(review): an empty optional presumably means the count is unknown or
        /// was not recorded -- confirm against the producer of this metadata.
        std::optional<int64_t> delete_row_count;

        // ---- Construction -----------------------------------------------------------

        /// Builds a metadata record by copying each argument into the field of the
        /// same name (argument `_x` initializes field `x`).
        SimpleDataFileMeta(const std::string& _file_path,
                           int64_t _file_size,
                           int64_t _row_count,
                           int64_t _min_sequence_number,
                           int64_t _max_sequence_number,
                           int64_t _schema_id,
                           int32_t _level,
                           const Timestamp& _creation_time,
                           const std::optional<int64_t>& _delete_row_count)
            : file_path(_file_path)
            , file_size(_file_size)
            , row_count(_row_count)
            , min_sequence_number(_min_sequence_number)
            , max_sequence_number(_max_sequence_number)
            , schema_id(_schema_id)
            , level(_level)
            , creation_time(_creation_time)
            , delete_row_count(_delete_row_count) {
        }

        // ---- Operations (declared here, defined elsewhere) --------------------------

        /// Equality comparison against another metadata record.
        bool operator==(const SimpleDataFileMeta& other) const;

        /// Produces a string representation of this metadata record.
        std::string ToString() const;
    };

    /// Returns the metadata of every data file contained in this split.
    ///
    /// @note Only used for append tables; this method is slated for removal in a
    ///       future version.
    virtual std::vector<SimpleDataFileMeta> GetFileList() const = 0;
};
} // namespace paimon