/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <limits>
#include "paimon/visibility.h"
namespace paimon {
/// Enumeration of supported data types in Paimon tables.
enum class FieldType {
BOOLEAN = 1,
TINYINT = 2,
SMALLINT = 3,
INT = 4,
BIGINT = 5,
FLOAT = 6,
DOUBLE = 7,
STRING = 8,
BINARY = 9,
/// The timestamp type supports only precision values 0, 3, 6, and 9:
/// - 0: second precision
/// - 3: millisecond precision
/// - 6: microsecond precision
/// - 9: nanosecond precision
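///
/// A minimal sketch of mapping a supported precision to its unit name (an
/// illustrative helper, not part of this API):
/// @code{.cpp}
/// const char* TimestampUnit(int precision) {
///   switch (precision) {
///     case 0: return "second";
///     case 3: return "millisecond";
///     case 6: return "microsecond";
///     case 9: return "nanosecond";
///     default: return nullptr;  // unsupported precision
///   }
/// }
/// @endcode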
TIMESTAMP = 10,
DECIMAL = 11,
DATE = 12,
ARRAY = 13,
MAP = 14,
STRUCT = 15,
BLOB = 16,
UNKNOWN = 128,
};
/// Configuration options and constants for Paimon table operations.
///
/// The Options struct contains static string constants that define configuration keys
/// used throughout the Paimon system.
struct PAIMON_EXPORT Options {
/// @name merge-on-read configurations
/// The following constants are the prefixes and suffixes used to build merge-on-read
/// configuration keys.
/// The complete configuration keys can be:
/// - fields.$field_name.aggregate-function
/// - fields.$field_name.ignore-retract
/// - fields.$field_names.sequence-group ($field_names accepts one or more field names,
/// separated by FIELDS_SEPARATOR)
///
/// examples:
/// - fields.f1.aggregate-function
/// - fields.f2.sequence-group
/// - fields.f3,f4.sequence-group
///
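/// A minimal sketch of composing such a key (illustrative only; the std::map is an
/// assumed stand-in for however the caller stores table options, and f5 is a
/// hypothetical sequence field):
/// @code{.cpp}
/// #include <map>
/// #include <string>
///
/// void SetSequenceGroup(std::map<std::string, std::string>* conf) {
///   // Builds "fields.f3,f4.sequence-group" from the constants below.
///   const std::string key = std::string(paimon::Options::FIELDS_PREFIX) + ".f3" +
///                           paimon::Options::FIELDS_SEPARATOR + "f4." +
///                           paimon::Options::SEQUENCE_GROUP;
///   (*conf)[key] = "f5";
/// }
/// @endcode
///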
/// @{
/// FIELDS_SEPARATOR is ","
static const char FIELDS_SEPARATOR[];
/// FIELDS_PREFIX is "fields"
static const char FIELDS_PREFIX[];
/// AGG_FUNCTION is "aggregate-function"
static const char AGG_FUNCTION[];
/// DEFAULT_AGG_FUNCTION is "default-aggregate-function"
static const char DEFAULT_AGG_FUNCTION[];
/// IGNORE_RETRACT is "ignore-retract"
static const char IGNORE_RETRACT[];
/// SEQUENCE_GROUP is "sequence-group"
static const char SEQUENCE_GROUP[];
/// @}
/// "bucket" - Bucket number for file store. It should either be equal to -1 (dynamic bucket
/// mode), or it must be greater than 0 (fixed bucket mode).
static const char BUCKET[];
/// "bucket-key" - Specify the paimon distribution policy. Data is assigned to each bucket
/// according to the hash value of bucket-key. If you specify multiple fields, delimiter is ','.
/// If not specified, the primary key will be used, if there is no primary key, the full row
/// will be used.
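///
/// A minimal sketch (illustrative only; a plain std::map stands in for however the
/// caller supplies table options):
/// @code{.cpp}
/// std::map<std::string, std::string> conf;
/// conf[paimon::Options::BUCKET] = "4";        // fixed bucket mode
/// conf[paimon::Options::BUCKET_KEY] = "a,b";  // distribute by hash of (a, b)
/// @endcode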
static const char BUCKET_KEY[];
// TODO(yonghao.fyh): This option has not been used yet
/// "page-size" - Memory page size, default value 64 kb.
static const char PAGE_SIZE[];
/// "file.format" - Specify the message format of data files.
/// Default value is parquet.
static const char FILE_FORMAT[];
/// "file-system" - Specify the file system.
/// Default value is local.
static const char FILE_SYSTEM[];
/// "target-file-size" - Target size of a file. Default value is 256MB.
// TODO(xinyu.lxy): change the default value to 128MB for primary key table.
static const char TARGET_FILE_SIZE[];
/// "blob.target-file-size" - Target size of a blob file. Default is TARGET_FILE_SIZE.
static const char BLOB_TARGET_FILE_SIZE[];
/// "partition.default-name" - The default partition name in case the dynamic partition column
/// value is null/empty string.
static const char PARTITION_DEFAULT_NAME[];
/// "file.compression" - The default file compression is zstd. For faster read and write, it is
/// recommended to use zstd.
static const char FILE_COMPRESSION[];
/// "file.compression.zstd-level"
/// Default file compression zstd level. For higher compression rates, it can be configured to
/// 9, but the read and write speed will significantly decrease. Default value is 1.
static const char FILE_COMPRESSION_ZSTD_LEVEL[];
/// "manifest.target-file-size" - Suggested file size of a manifest file.
/// Default value is 8MB.
static const char MANIFEST_TARGET_FILE_SIZE[];
/// "manifest.format" - Specify the message format of manifest files.
/// Default value is avro.
static const char MANIFEST_FORMAT[];
/// "manifest.compression" - File compression for manifest, default value is zstd.
static const char MANIFEST_COMPRESSION[];
/// "manifest.merge-min-count" - To avoid frequent manifest merges, this parameter specifies the
/// minimum number of ManifestFileMeta to merge, default value is 30.
static const char MANIFEST_MERGE_MIN_COUNT[];
/// "manifest.full-compaction-threshold-size" - The size threshold for triggering full
/// compaction of manifest, default value is 16MB.
static const char MANIFEST_FULL_COMPACTION_FILE_SIZE[];
/// "source.split.target-size" - Target size of a source split when scanning a bucket. Default
/// value is 128MB.
static const char SOURCE_SPLIT_TARGET_SIZE[];
/// "source.split.open-file-cost" - Open file cost of a source file. It is used to avoid reading
/// too many files with a source split, which can be very slow. Default value is 4MB.
static const char SOURCE_SPLIT_OPEN_FILE_COST[];
/// "scan.snapshot-id" - Optional snapshot id used in case of "from-snapshot" or
/// "from-snapshot-full" scan mode
static const char SCAN_SNAPSHOT_ID[];
/// "scan.mode" - Specify the scanning behavior of the source. Values can be: "default",
/// "latest-full", "latest", "from-snapshot", "from-snapshot-full". Default value is "default".
static const char SCAN_MODE[];
/// "read.batch-size" - Read batch size for any file format if it supports.
/// The default value is 1024.
static const char READ_BATCH_SIZE[];
/// "write.batch-size" - Write batch size for any file format if it supports.
/// The default value is 1024.
static const char WRITE_BATCH_SIZE[];
/// "write-buffer-size" - Amount of data to build up in memory before converting to a sorted
/// on-disk file. The default value is 256 mb
static const char WRITE_BUFFER_SIZE[];
/// "snapshot.num-retained.min" - The minimum number of completed snapshots to retain. Should be
/// greater than or equal to 1. Default value is 10.
static const char SNAPSHOT_NUM_RETAINED_MIN[];
/// "snapshot.num-retained.max" - The maximum number of completed snapshots to retain. Should be
/// greater than or equal to the minimum number. Default value is int32 max value.
static const char SNAPSHOT_NUM_RETAINED_MAX[];
/// "snapshot.time-retained" - The maximum time of completed snapshots to retain. Default value
/// is 1 hour.
static const char SNAPSHOT_TIME_RETAINED[];
/// "snapshot.expire.limit" - The maximum number of snapshots allowed to expire at a time.
/// Default value is 10.
static const char SNAPSHOT_EXPIRE_LIMIT[];
/// "snapshot.clean-empty-directories" - Whether to try to clean empty directories when expiring
/// snapshots, if enabled, please note: hdfs: may print exceptions in NameNode. oss/s3: may
/// cause performance issue. Default value is false.
static const char SNAPSHOT_CLEAN_EMPTY_DIRECTORIES[];
/// "commit.timeout" - Timeout duration of retry when commit failed. No default value.
static const char COMMIT_TIMEOUT[];
/// "commit.max-retries" - Maximum number of retries when commit failed. Default value is 10.
static const char COMMIT_MAX_RETRIES[];
/// "sequence.field" - The field that generates the sequence number for primary key table, the
/// sequence number determines which data is the most recent. Value use "," as delimiter.
static const char SEQUENCE_FIELD[];
/// "sequence.field.sort-order" - Specify the order of sequence.field. Values can be:
/// "ascending", "descending". Default value is "ascending".
static const char SEQUENCE_FIELD_SORT_ORDER[];
/// "merge-engine" - Specify the merge engine for table with primary key. Values can be:
/// "deduplicate", "partial-update", "aggregation", "first-row". Default value is "deduplicate".
static const char MERGE_ENGINE[];
/// "sort-engine" - Specify the sort engine for table with primary key. Values can be:
/// "min-heap", "loser-tree". Default value is "loser-tree".
static const char SORT_ENGINE[];
/// "ignore-delete" - Whether to ignore delete records. Default value is "false".
static const char IGNORE_DELETE[];
/// "fields.default-aggregate-function" - Default aggregate function of all fields for
/// partial-update and aggregate merge function.
static const char FIELDS_DEFAULT_AGG_FUNC[];
/// "deletion-vectors.enabled" - Whether to enable deletion vectors mode. In this mode, index
/// files containing deletion vectors are generated when data is written, which marks the data
/// for deletion. During read operations, by applying these index files, merging can be avoided.
/// Default value is false.
static const char DELETION_VECTORS_ENABLED[];
/// @note `CHANGELOG_PRODUCER` currently only supports `none`.
///
/// "changelog-producer" - Whether to double write to a changelog file. This changelog file
/// keeps the details of data changes, it can be read directly during stream reads. This can be
/// applied to tables with primary keys. Values can be "none", "input", "lookup",
/// "full-compaction". Default value is "none".
static const char CHANGELOG_PRODUCER[];
/// "force-lookup" - Whether to force the use of lookup for compaction. Default value is
/// "false".
static const char FORCE_LOOKUP[];
/// "partial-update.remove-record-on-delete" - Whether to remove the whole row in partial-update
/// engine when records are received. Default value is "false".
static const char PARTIAL_UPDATE_REMOVE_RECORD_ON_DELETE[];
/// "partial-update.remove-record-on-sequence-group" - When records of the given sequence groups
/// are received, remove the whole row.
static const char PARTIAL_UPDATE_REMOVE_RECORD_ON_SEQUENCE_GROUP[];
/// "scan.fallback-branch" - When a batch job queries from a table, if a partition does not
/// exist in the current branch, the reader will try to get this partition from this fallback
/// branch.
static const char SCAN_FALLBACK_BRANCH[];
/// "branch" - Specify branch name. Default value is "main".
static const char BRANCH[];
/// "file-index.read.enabled" - Whether enabled read file index. Default value is "true".
static const char FILE_INDEX_READ_ENABLED[];
/// "data-file.external-paths" - The external paths where the data of this table will be
/// written, multiple elements separated by commas.
static const char DATA_FILE_EXTERNAL_PATHS[];
/// "data-file.external-paths.strategy" - The strategy of selecting an external path when
/// writing data. Values can be: "none", "specific-fs", "round-robin". Default value is "none".
static const char DATA_FILE_EXTERNAL_PATHS_STRATEGY[];
/// "data-file.prefix" - Specify the file name prefix of data files. Default value is "data-".
static const char DATA_FILE_PREFIX[];
/// "index-file-in-data-file-dir" - Whether index file in data file directory. Default value is
/// "false".
static const char INDEX_FILE_IN_DATA_FILE_DIR[];
/// "row-tracking.enabled" - Whether enable unique row id for append table. Default value is
/// "false".
static const char ROW_TRACKING_ENABLED[];
/// "data-evolution.enabled" - Whether enable data evolution for row tracking table. Default
/// value is "false".
static const char DATA_EVOLUTION_ENABLED[];
/// "partition.legacy-name" - The legacy partition name is using `ToString` for all types. If
/// false, using casting to string for all types. Default value is "true".
static const char PARTITION_GENERATE_LEGACY_NAME[];
/// "blob-as-descriptor" - Read and write blob field using blob descriptor rather than blob
/// bytes. Default value is "false".
static const char BLOB_AS_DESCRIPTOR[];
/// "global-index.enabled" - Whether to enable global index for scan. Default value is "true".
static const char GLOBAL_INDEX_ENABLED[];
/// "global-index.external-path" - Global index root directory, if not set, the global index
/// files will be stored under the index directory.
static const char GLOBAL_INDEX_EXTERNAL_PATH[];
/// "scan.tag-name" - Optional tag name used in case of "from-snapshot" scan mode.
static const char SCAN_TAG_NAME[];
};
static constexpr int64_t BATCH_WRITE_COMMIT_IDENTIFIER = std::numeric_limits<int64_t>::max();
} // namespace paimon