| // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
| // This source code is licensed under both the GPLv2 (found in the |
| // COPYING file in the root directory) and Apache 2.0 License |
| // (found in the LICENSE.Apache file in the root directory). |
| // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| |
| #ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ |
| #define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <string> |
| #include <memory> |
| #include <vector> |
| #include <limits> |
| #include <unordered_map> |
| |
| #include "rocksdb/advanced_options.h" |
| #include "rocksdb/comparator.h" |
| #include "rocksdb/env.h" |
| #include "rocksdb/listener.h" |
| #include "rocksdb/universal_compaction.h" |
| #include "rocksdb/version.h" |
| #include "rocksdb/write_buffer_manager.h" |
| |
| #ifdef max |
| #undef max |
| #endif |
| |
| namespace rocksdb { |
| |
| class Cache; |
| class CompactionFilter; |
| class CompactionFilterFactory; |
| class Comparator; |
| class Env; |
| enum InfoLogLevel : unsigned char; |
| class SstFileManager; |
| class FilterPolicy; |
| class Logger; |
| class MergeOperator; |
| class Snapshot; |
| class MemTableRepFactory; |
| class RateLimiter; |
| class Slice; |
| class Statistics; |
| class InternalKeyComparator; |
| class WalFilter; |
| |
| // DB contents are stored in a set of blocks, each of which holds a |
| // sequence of key,value pairs. Each block may be compressed before |
| // being stored in a file. The following enum describes which |
| // compression method (if any) is used to compress a block. |
| enum CompressionType : unsigned char { |
| // NOTE: do not change the values of existing entries, as these are |
| // part of the persistent format on disk. |
| kNoCompression = 0x0, |
| kSnappyCompression = 0x1, |
| kZlibCompression = 0x2, |
| kBZip2Compression = 0x3, |
| kLZ4Compression = 0x4, |
| kLZ4HCCompression = 0x5, |
| kXpressCompression = 0x6, |
| kZSTD = 0x7, |
| |
| // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than |
| // 0.8.0 or consider a possibility of downgrading the service or copying |
| // the database files to another service running with an older version of |
| // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will |
| // eventually remove the option from the public API. |
| kZSTDNotFinalCompression = 0x40, |
| |
| // kDisableCompressionOption is used to disable some compression options. |
| kDisableCompressionOption = 0xff, |
| }; |
| |
| struct Options; |
| |
| struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { |
| // The function recovers options to a previous version. Only 4.6 or later |
| // versions are supported. |
| ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, |
| int rocksdb_minor_version = 6); |
| |
| // Some functions that make it easier to optimize RocksDB |
| // Use this if your DB is very small (like under 1GB) and you don't want to |
| // spend lots of memory for memtables. |
| ColumnFamilyOptions* OptimizeForSmallDb(); |
| |
| // Use this if you don't need to keep the data sorted, i.e. you'll never use |
| // an iterator, only Put() and Get() API calls |
| // |
| // Not supported in ROCKSDB_LITE |
| ColumnFamilyOptions* OptimizeForPointLookup( |
| uint64_t block_cache_size_mb); |
| |
| // Default values for some parameters in ColumnFamilyOptions are not |
| // optimized for heavy workloads and big datasets, which means you might |
| // observe write stalls under some conditions. As a starting point for tuning |
| // RocksDB options, use the following two functions: |
| // * OptimizeLevelStyleCompaction -- optimizes level style compaction |
| // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction |
| // Universal style compaction is focused on reducing Write Amplification |
| // Factor for big data sets, but increases Space Amplification. You can learn |
| // more about the different styles here: |
| // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide |
| // Make sure to also call IncreaseParallelism(), which will provide the |
| // biggest performance gains. |
| // Note: we might use more memory than memtable_memory_budget during high |
| // write rate period |
| // |
| // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE |
| ColumnFamilyOptions* OptimizeLevelStyleCompaction( |
| uint64_t memtable_memory_budget = 512 * 1024 * 1024); |
| ColumnFamilyOptions* OptimizeUniversalStyleCompaction( |
| uint64_t memtable_memory_budget = 512 * 1024 * 1024); |
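| |
| // Example (illustrative sketch, not part of the API): a common starting |
| // point is to combine these helpers before opening the DB. The path below |
| // is a placeholder, and DB::Open comes from rocksdb/db.h. |
| // |
| //   rocksdb::Options options; |
| //   options.IncreaseParallelism(4); |
| //   options.OptimizeLevelStyleCompaction(); |
| //   options.create_if_missing = true; |
| //   rocksdb::DB* db = nullptr; |
| //   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db); |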
| |
| // ------------------- |
| // Parameters that affect behavior |
| |
| // Comparator used to define the order of keys in the table. |
| // Default: a comparator that uses lexicographic byte-wise ordering |
| // |
| // REQUIRES: The client must ensure that the comparator supplied |
| // here has the same name and orders keys *exactly* the same as the |
| // comparator provided to previous open calls on the same DB. |
| const Comparator* comparator = BytewiseComparator(); |
| |
| // REQUIRES: The client must provide a merge operator if Merge operation |
| // needs to be accessed. Calling Merge on a DB without a merge operator |
| // would result in Status::NotSupported. The client must ensure that the |
| // merge operator supplied here has the same name and *exactly* the same |
| // semantics as the merge operator provided to previous open calls on |
| // the same DB. The only exception is during an upgrade, where Merge is |
| // used for the first time on a DB that previously had no merge operator. |
| // In that case the merge operator must be specified when opening the DB. |
| // Default: nullptr |
| std::shared_ptr<MergeOperator> merge_operator = nullptr; |
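| |
| // Example (illustrative sketch, not part of the API): wiring in a merge |
| // operator and issuing a Merge. MyAppendOperator stands for any |
| // application-defined MergeOperator implementation; db is an open DB handle. |
| // |
| //   options.merge_operator = std::make_shared<MyAppendOperator>(); |
| //   // ... open the DB with these options ... |
| //   db->Merge(rocksdb::WriteOptions(), "counter", "+1"); |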
| |
| // A single CompactionFilter instance to call into during compaction. |
| // Allows an application to modify/delete a key-value during background |
| // compaction. |
| // |
| // If the client requires a new compaction filter to be used for different |
| // compaction runs, it can specify compaction_filter_factory instead of this |
| // option. The client should specify only one of the two. |
| // compaction_filter takes precedence over compaction_filter_factory if |
| // client specifies both. |
| // |
| // If multithreaded compaction is being used, the supplied CompactionFilter |
| // instance may be used from different threads concurrently and so should be |
| // thread-safe. |
| // |
| // Default: nullptr |
| const CompactionFilter* compaction_filter = nullptr; |
| |
| // This is a factory that provides compaction filter objects which allow |
| // an application to modify/delete a key-value during background compaction. |
| // |
| // A new filter will be created on each compaction run. If multithreaded |
| // compaction is being used, each created CompactionFilter will only be used |
| // from a single thread and so does not need to be thread-safe. |
| // |
| // Default: nullptr |
| std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr; |
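| |
| // Example (illustrative sketch, not part of the API): a minimal |
| // CompactionFilter that drops every key starting with "tmp_". Returning |
| // true from Filter() removes the entry. Assumes rocksdb/compaction_filter.h |
| // is included; DropTempKeysFilter is a hypothetical name. |
| // |
| //   class DropTempKeysFilter : public rocksdb::CompactionFilter { |
| //    public: |
| //     bool Filter(int /*level*/, const rocksdb::Slice& key, |
| //                 const rocksdb::Slice& /*existing_value*/, |
| //                 std::string* /*new_value*/, |
| //                 bool* /*value_changed*/) const override { |
| //       return key.starts_with("tmp_");  // true means "drop this entry" |
| //     } |
| //     const char* Name() const override { return "DropTempKeysFilter"; } |
| //   }; |
| // |
| //   static DropTempKeysFilter filter;    // must outlive the DB |
| //   options.compaction_filter = &filter; |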
| |
| // ------------------- |
| // Parameters that affect performance |
| |
| // Amount of data to build up in memory (backed by an unsorted log |
| // on disk) before converting to a sorted on-disk file. |
| // |
| // Larger values increase performance, especially during bulk loads. |
| // Up to max_write_buffer_number write buffers may be held in memory |
| // at the same time, |
| // so you may wish to adjust this parameter to control memory usage. |
| // Also, a larger write buffer will result in a longer recovery time |
| // the next time the database is opened. |
| // |
| // Note that write_buffer_size is enforced per column family. |
| // See db_write_buffer_size for sharing memory across column families. |
| // |
| // Default: 64MB |
| // |
| // Dynamically changeable through SetOptions() API |
| size_t write_buffer_size = 64 << 20; |
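| |
| // Example (illustrative sketch, not part of the API): raising the write |
| // buffer to 128MB on a running DB through the SetOptions() API (db is an |
| // open DB handle). |
| // |
| //   db->SetOptions({{"write_buffer_size", "134217728"}}); |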
| |
| // Compress blocks using the specified compression algorithm. This |
| // parameter can be changed dynamically. |
| // |
| // Default: kSnappyCompression, if it's supported. If snappy is not linked |
| // with the library, the default is kNoCompression. |
| // |
| // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: |
| // ~200-500MB/s compression |
| // ~400-800MB/s decompression |
| // Note that these speeds are significantly faster than most |
| // persistent storage speeds, and therefore it is typically never |
| // worth switching to kNoCompression. Even if the input data is |
| // incompressible, the kSnappyCompression implementation will |
| // efficiently detect that and will switch to uncompressed mode. |
| CompressionType compression; |
| |
| // Compression algorithm that will be used for the bottommost level that |
| // contains files. If level compaction is used, this option only affects |
| // levels after the base level. |
| // |
| // Default: kDisableCompressionOption (Disabled) |
| CompressionType bottommost_compression = kDisableCompressionOption; |
| |
| // different options for compression algorithms |
| CompressionOptions compression_opts; |
| |
| // Number of files to trigger level-0 compaction. A value < 0 means that |
| // level-0 compaction will not be triggered by the number of files at all. |
| // |
| // Default: 4 |
| // |
| // Dynamically changeable through SetOptions() API |
| int level0_file_num_compaction_trigger = 4; |
| |
| // If non-nullptr, use the specified function to determine the |
| // prefixes for keys. These prefixes will be placed in the filter. |
| // Depending on the workload, this can reduce the read-IOP cost of |
| // scans when a prefix is passed via ReadOptions to |
| // db.NewIterator(). For prefix filtering to work properly, |
| // "prefix_extractor" and "comparator" must be such that the following |
| // properties hold: |
| // |
| // 1) key.starts_with(prefix(key)) |
| // 2) Compare(prefix(key), key) <= 0. |
| // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 |
| // 4) prefix(prefix(key)) == prefix(key) |
| // |
| // Default: nullptr |
| std::shared_ptr<const SliceTransform> prefix_extractor = nullptr; |
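| |
| // Example (illustrative sketch, not part of the API): a fixed-length prefix |
| // extractor combined with a prefix-bounded scan. Assumes |
| // rocksdb/slice_transform.h is included and keys are at least 8 bytes long; |
| // db is an open DB handle. |
| // |
| //   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8)); |
| //   // ... open the DB with these options ... |
| //   rocksdb::ReadOptions ro; |
| //   ro.prefix_same_as_start = true; |
| //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro)); |
| //   for (it->Seek("user0001"); it->Valid(); it->Next()) { |
| //     // only keys sharing the 8-byte prefix "user0001" are returned |
| //   } |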
| |
| // Control maximum total data size for a level. |
| // max_bytes_for_level_base is the max total for level-1. |
| // Maximum number of bytes for level L can be calculated as |
| // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) |
| // For example, if max_bytes_for_level_base is 200MB, and if |
| // max_bytes_for_level_multiplier is 10, total data size for level-1 |
| // will be 200MB, total file size for level-2 will be 2GB, |
| // and total file size for level-3 will be 20GB. |
| // |
| // Default: 256MB. |
| // |
| // Dynamically changeable through SetOptions() API |
| uint64_t max_bytes_for_level_base = 256 * 1048576; |
| |
| // Disable automatic compactions. Manual compactions can still |
| // be issued on this column family |
| // |
| // Dynamically changeable through SetOptions() API |
| bool disable_auto_compactions = false; |
| |
| // This is a factory that provides TableFactory objects. |
| // Default: a block-based table factory that provides a default |
| // implementation of TableBuilder and TableReader with default |
| // BlockBasedTableOptions. |
| std::shared_ptr<TableFactory> table_factory; |
| |
| // Create ColumnFamilyOptions with default values for all fields |
| ColumnFamilyOptions(); |
| // Create ColumnFamilyOptions from Options |
| explicit ColumnFamilyOptions(const Options& options); |
| |
| void Dump(Logger* log) const; |
| }; |
| |
| enum class WALRecoveryMode : char { |
| // Original LevelDB recovery |
| // We tolerate an incomplete record in the trailing data of all logs |
| // Use case : This is the legacy behavior |
| kTolerateCorruptedTailRecords = 0x00, |
| // Recover from clean shutdown |
| // We don't expect to find any corruption in the WAL |
| // Use case : This is ideal for unit tests and the rare applications that |
| // require a high consistency guarantee |
| kAbsoluteConsistency = 0x01, |
| // Recover to point-in-time consistency |
| // We stop the WAL playback on discovering WAL inconsistency |
| // Use case : Ideal for systems that have a disk-controller cache (e.g. |
| // hard disks, or SSDs without a supercapacitor) and store related data |
| kPointInTimeRecovery = 0x02, |
| // Recovery after a disaster |
| // We ignore any corruption in the WAL and try to salvage as much data as |
| // possible |
| // Use case : Ideal as a last-ditch effort to recover data, or for systems |
| // that operate on low-grade, unrelated data |
| kSkipAnyCorruptedRecords = 0x03, |
| }; |
| |
| struct DbPath { |
| std::string path; |
| uint64_t target_size; // Target size of total files under the path, in bytes. |
| |
| DbPath() : target_size(0) {} |
| DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} |
| }; |
| |
| |
| struct DBOptions { |
| // The function recovers options to their values as of version 4.6. |
| DBOptions* OldDefaults(int rocksdb_major_version = 4, |
| int rocksdb_minor_version = 6); |
| |
| // Some functions that make it easier to optimize RocksDB |
| |
| // Use this if your DB is very small (like under 1GB) and you don't want to |
| // spend lots of memory for memtables. |
| DBOptions* OptimizeForSmallDb(); |
| |
| #ifndef ROCKSDB_LITE |
| // By default, RocksDB uses only one background thread for flush and |
| // compaction. Calling this function will set it up such that a total of |
| // `total_threads` is used. A good value for `total_threads` is the number of |
| // cores. You almost definitely want to call this function if your system is |
| // bottlenecked by RocksDB. |
| DBOptions* IncreaseParallelism(int total_threads = 16); |
| #endif // ROCKSDB_LITE |
| |
| // If true, the database will be created if it is missing. |
| // Default: false |
| bool create_if_missing = false; |
| |
| // If true, missing column families will be automatically created. |
| // Default: false |
| bool create_missing_column_families = false; |
| |
| // If true, an error is raised if the database already exists. |
| // Default: false |
| bool error_if_exists = false; |
| |
| // If true, RocksDB will aggressively check consistency of the data. |
| // Also, if any of the writes to the database fails (Put, Delete, Merge, |
| // Write), the database will switch to read-only mode and fail all other |
| // Write operations. |
| // In most cases you want this to be set to true. |
| // Default: true |
| bool paranoid_checks = true; |
| |
| // Use the specified object to interact with the environment, |
| // e.g. to read/write files, schedule background work, etc. |
| // Default: Env::Default() |
| Env* env = Env::Default(); |
| |
| // Use to control write rate of flush and compaction. Flush has higher |
| // priority than compaction. Rate limiting is disabled if nullptr. |
| // If rate limiter is enabled, bytes_per_sync is set to 1MB by default. |
| // Default: nullptr |
| std::shared_ptr<RateLimiter> rate_limiter = nullptr; |
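| |
| // Example (illustrative sketch, not part of the API): limiting flush and |
| // compaction writes to 16MB/s. Assumes rocksdb/rate_limiter.h is included. |
| // |
| //   options.rate_limiter.reset(rocksdb::NewGenericRateLimiter(16 << 20)); |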
| |
| // Use to track SST files and control their file deletion rate. |
| // |
| // Features: |
| // - Throttle the deletion rate of the SST files. |
| // - Keep track of the total size of all SST files. |
| // - Set a maximum allowed space limit for SST files; when it is reached, |
| // the DB won't do any further flushes or compactions and will set a |
| // background error. |
| // - Can be shared between multiple dbs. |
| // Limitations: |
| // - Only track and throttle deletes of SST files in |
| // first db_path (db_name if db_paths is empty). |
| // |
| // Default: nullptr |
| std::shared_ptr<SstFileManager> sst_file_manager = nullptr; |
| |
| // Any internal progress/error information generated by the db will |
| // be written to info_log if it is non-nullptr, or to a file stored |
| // in the same directory as the DB contents if info_log is nullptr. |
| // Default: nullptr |
| std::shared_ptr<Logger> info_log = nullptr; |
| |
| #ifdef NDEBUG |
| InfoLogLevel info_log_level = INFO_LEVEL; |
| #else |
| InfoLogLevel info_log_level = DEBUG_LEVEL; |
| #endif // NDEBUG |
| |
| // Number of open files that can be used by the DB. You may need to |
| // increase this if your database has a large working set. Value -1 means |
| // files opened are always kept open. You can estimate number of files based |
| // on target_file_size_base and target_file_size_multiplier for level-based |
| // compaction. For universal-style compaction, you can usually set it to -1. |
| // Default: -1 |
| int max_open_files = -1; |
| |
| // If max_open_files is -1, DB will open all files on DB::Open(). You can |
| // use this option to increase the number of threads used to open the files. |
| // Default: 16 |
| int max_file_opening_threads = 16; |
| |
| // Once write-ahead logs exceed this size, we will start forcing the flush of |
| // column families whose memtables are backed by the oldest live WAL file |
| // (i.e. the ones that are causing all the space amplification). If set to 0 |
| // (default), we will dynamically choose the WAL size limit to be |
| // [sum of all write_buffer_size * max_write_buffer_number] * 4 |
| // Default: 0 |
| uint64_t max_total_wal_size = 0; |
| |
| // If non-null, then we should collect metrics about database operations |
| std::shared_ptr<Statistics> statistics = nullptr; |
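| |
| // Example (illustrative sketch, not part of the API): enabling statistics |
| // collection and dumping the counters later. Assumes rocksdb/statistics.h |
| // is included. |
| // |
| //   options.statistics = rocksdb::CreateDBStatistics(); |
| //   // ... after some traffic ... |
| //   std::string report = options.statistics->ToString(); |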
| |
| // If true, then every store to stable storage will issue an fsync. |
| // If false, then every store to stable storage will issue an fdatasync. |
| // This parameter should be set to true while storing data to a |
| // filesystem like ext3 that can lose files after a reboot. |
| // Default: false |
| // Note: on many platforms fdatasync is defined as fsync, so this parameter |
| // makes no difference. Refer to the fdatasync definition in this code base. |
| bool use_fsync = false; |
| |
| // A list of paths into which SST files can be placed, each with a target size. |
| // Newer data is placed into paths specified earlier in the vector while |
| // older data gradually moves to paths specified later in the vector. |
| // |
| // For example, if you have a flash device with 10GB allocated for the DB |
| // and a hard drive of 2TB, you should configure it to be: |
| // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] |
| // |
| // The system will try to guarantee that the data under each path is close |
| // to, but not larger than, the target size. However, the current and future |
| // file sizes used when deciding where to place a file are based on |
| // best-effort estimation, which means there is a chance that the actual size |
| // under the directory is slightly more than the target size under some |
| // workloads. Users should leave some buffer room for those cases. |
| // |
| // If none of the paths has sufficient room to place a file, the file will |
| // be placed in the last path anyway, regardless of the target size. |
| // |
| // Placing newer data in earlier paths is also best-effort. Users should |
| // expect newer files to be placed in later paths in some extreme cases. |
| // |
| // If left empty, only one path will be used, which is db_name passed when |
| // opening the DB. |
| // Default: empty |
| std::vector<DbPath> db_paths; |
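| |
| // Example (illustrative sketch, not part of the API): the flash/hard-drive |
| // layout described above; the paths are placeholders. |
| // |
| //   options.db_paths.emplace_back("/flash_path", 10ull << 30);  // 10GB |
| //   options.db_paths.emplace_back("/hard_drive", 2ull << 40);   // 2TB |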
| |
| // This specifies the info LOG dir. |
| // If it is empty, the log files will be in the same dir as data. |
| // If it is non empty, the log files will be in the specified dir, |
| // and the db data dir's absolute path will be used as the log file |
| // name's prefix. |
| std::string db_log_dir = ""; |
| |
| // This specifies the absolute dir path for write-ahead logs (WAL). |
| // If it is empty, the log files will be in the same dir as the data; |
| // dbname is used as the data dir by default. |
| // If it is non-empty, the log files will be kept in the specified dir. |
| // When destroying the db, |
| // all log files in wal_dir and the dir itself are deleted. |
| std::string wal_dir = ""; |
| |
| // The periodicity with which obsolete files get deleted. The default |
| // value is 6 hours. The files that go out of scope during the compaction |
| // process will still get automatically deleted on every compaction, |
| // regardless of this setting. |
| uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000; |
| |
| // Maximum number of concurrent background jobs (compactions and flushes). |
| int max_background_jobs = 2; |
| |
| // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the |
| // value of max_background_jobs. This option is ignored. |
| int base_background_compactions = -1; |
| |
| // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the |
| // value of max_background_jobs. For backwards compatibility we will set |
| // `max_background_jobs = max_background_compactions + max_background_flushes` |
| // in the case where user sets at least one of `max_background_compactions` or |
| // `max_background_flushes` (we replace -1 by 1 in case one option is unset). |
| // |
| // Maximum number of concurrent background compaction jobs, submitted to |
| // the default LOW priority thread pool. |
| // |
| // If you're increasing this, also consider increasing number of threads in |
| // LOW priority thread pool. For more information, see |
| // Env::SetBackgroundThreads |
| // Default: -1 |
| int max_background_compactions = -1; |
| |
| // This value represents the maximum number of threads that will |
| // concurrently perform a compaction job by breaking it into multiple, |
| // smaller ones that are run simultaneously. |
| // Default: 1 (i.e. no subcompactions) |
| uint32_t max_subcompactions = 1; |
| |
| // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the |
| // value of max_background_jobs. For backwards compatibility we will set |
| // `max_background_jobs = max_background_compactions + max_background_flushes` |
| // in the case where user sets at least one of `max_background_compactions` or |
| // `max_background_flushes`. |
| // |
| // Maximum number of concurrent background memtable flush jobs, submitted by |
| // default to the HIGH priority thread pool. If the HIGH priority thread pool |
| // is configured to have zero threads, flush jobs will share the LOW priority |
| // thread pool with compaction jobs. |
| // |
| // It is important to use both thread pools when the same Env is shared by |
| // multiple db instances. Without a separate pool, long running compaction |
| // jobs could potentially block memtable flush jobs of other db instances, |
| // leading to unnecessary Put stalls. |
| // |
| // If you're increasing this, also consider increasing number of threads in |
| // HIGH priority thread pool. For more information, see |
| // Env::SetBackgroundThreads |
| // Default: -1 |
| int max_background_flushes = -1; |
| |
| // Specify the maximal size of the info log file. If the log file |
| // is larger than `max_log_file_size`, a new info log file will |
| // be created. |
| // If max_log_file_size == 0, all logs will be written to one |
| // log file. |
| size_t max_log_file_size = 0; |
| |
| // Time for the info log file to roll (in seconds). |
| // If specified with non-zero value, log file will be rolled |
| // if it has been active longer than `log_file_time_to_roll`. |
| // Default: 0 (disabled) |
| // Not supported in ROCKSDB_LITE mode! |
| size_t log_file_time_to_roll = 0; |
| |
| // Maximal number of info log files to be kept. |
| // Default: 1000 |
| size_t keep_log_file_num = 1000; |
| |
| // Recycle log files. |
| // If non-zero, we will reuse previously written log files for new |
| // logs, overwriting the old data. The value indicates how many |
| // such files we will keep around at any point in time for later |
| // use. This is more efficient because the blocks are already |
| // allocated and fdatasync does not need to update the inode after |
| // each write. |
| // Default: 0 |
| size_t recycle_log_file_num = 0; |
| |
| // The manifest file is rolled over on reaching this limit. |
| // The older manifest file will be deleted. |
| // The default value is the maximum uint64_t, so that roll-over does not |
| // take place. |
| uint64_t max_manifest_file_size = std::numeric_limits<uint64_t>::max(); |
| |
| // Number of shards used for table cache. |
| int table_cache_numshardbits = 6; |
| |
| // NOT SUPPORTED ANYMORE |
| // int table_cache_remove_scan_count_limit; |
| |
| // The following two fields affect how archived logs will be deleted. |
| // 1. If both set to 0, logs will be deleted asap and will not get into |
| // the archive. |
| // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, |
| // WAL files will be checked every 10 min and if the total size is greater |
| // than WAL_size_limit_MB, they will be deleted starting with the |
| // earliest until the size limit is met. All empty files will be deleted. |
| // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then |
| // WAL files will be checked every WAL_ttl_seconds / 2 and those that |
| // are older than WAL_ttl_seconds will be deleted. |
| // 4. If both are not 0, WAL files will be checked every 10 min and both |
| // checks will be performed with ttl being first. |
| uint64_t WAL_ttl_seconds = 0; |
| uint64_t WAL_size_limit_MB = 0; |
| |
| // Number of bytes to preallocate (via fallocate) for the manifest |
| // files. The default is 4MB, which is reasonable to reduce random IO |
| // as well as prevent overallocation for mounts that preallocate |
| // large amounts of data (such as xfs's allocsize option). |
| size_t manifest_preallocation_size = 4 * 1024 * 1024; |
| |
| // Allow the OS to mmap file for reading sst tables. Default: false |
| bool allow_mmap_reads = false; |
| |
| // Allow the OS to mmap file for writing. |
| // DB::SyncWAL() only works if this is set to false. |
| // Default: false |
| bool allow_mmap_writes = false; |
| |
| // Enable direct I/O mode for reads/writes. |
| // This may or may not improve performance depending on the use case. |
| // |
| // Files will be opened in "direct I/O" mode, |
| // which means that data read from or written to the disk will not be cached |
| // or buffered. The hardware buffer of the devices may however still |
| // be used. Memory-mapped files are not impacted by these parameters. |
| |
| // Use O_DIRECT for user reads |
| // Default: false |
| // Not supported in ROCKSDB_LITE mode! |
| bool use_direct_reads = false; |
| |
| // Use O_DIRECT for both reads and writes in background flush and compactions |
| // When true, we also force new_table_reader_for_compaction_inputs to true. |
| // Default: false |
| // Not supported in ROCKSDB_LITE mode! |
| bool use_direct_io_for_flush_and_compaction = false; |
| |
| // If false, fallocate() calls are bypassed |
| bool allow_fallocate = true; |
| |
| // Prevent child processes from inheriting open files. Default: true |
| bool is_fd_close_on_exec = true; |
| |
| // NOT SUPPORTED ANYMORE -- this option is no longer used |
| bool skip_log_error_on_recovery = false; |
| |
| // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec |
| // Default: 600 (10 min) |
| unsigned int stats_dump_period_sec = 600; |
| |
| // If set to true, will hint the underlying file system that the file |
| // access pattern is random when an sst file is opened. |
| // Default: true |
| bool advise_random_on_open = true; |
| |
| // Amount of data to build up in memtables across all column |
| // families before writing to disk. |
| // |
| // This is distinct from write_buffer_size, which enforces a limit |
| // for a single memtable. |
| // |
| // This feature is disabled by default. Specify a non-zero value |
| // to enable it. |
| // |
| // Default: 0 (disabled) |
| size_t db_write_buffer_size = 0; |
| |
| // The memory usage of memtables will be reported to this object. The same |
| // object can be passed into multiple DBs and it will track the sum of the |
| // sizes of all the DBs. If the total size of all live memtables of all the |
| // DBs exceeds a limit, a flush will be triggered in the next DB to which |
| // the next write is issued. |
| // |
| // If the object is only passed to one DB, the behavior is the same as |
| // db_write_buffer_size. When write_buffer_manager is set, the value set will |
| // override db_write_buffer_size. |
| // |
| // This feature is disabled by default. Specify a non-zero value |
| // to enable it. |
| // |
| // Default: null |
| std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr; |
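| |
| // Example (illustrative sketch, not part of the API): capping the combined |
| // memtable memory of two DBs at 512MB; options_for_db1/options_for_db2 |
| // stand for the DBOptions used to open each DB. |
| // |
| //   auto wbm = std::make_shared<rocksdb::WriteBufferManager>(512 << 20); |
| //   options_for_db1.write_buffer_manager = wbm; |
| //   options_for_db2.write_buffer_manager = wbm; |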
| |
| // Specify the file access pattern once a compaction is started. |
| // It will be applied to all input files of a compaction. |
| // Default: NORMAL |
| enum AccessHint { |
| NONE, |
| NORMAL, |
| SEQUENTIAL, |
| WILLNEED |
| }; |
| AccessHint access_hint_on_compaction_start = NORMAL; |
| |
| // If true, always create a new file descriptor and new table reader |
| // for compaction inputs. Turning this parameter on may introduce extra |
| // memory usage in the table reader, if it allocates extra memory |
| // for indexes. This will allow file descriptor prefetch options |
| // to be set for compaction input files without impacting the file |
| // descriptors for the same file used by user queries. |
| // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks |
| // for this mode if using a block-based table. |
| // |
| // Default: false |
| bool new_table_reader_for_compaction_inputs = false; |
| |
| // If non-zero, we perform bigger reads when doing compaction. If you're |
| // running RocksDB on spinning disks, you should set this to at least 2MB. |
| // That way RocksDB's compaction is doing sequential instead of random reads. |
| // |
| // When non-zero, we also force new_table_reader_for_compaction_inputs to |
| // true. |
| // |
| // Default: 0 |
| size_t compaction_readahead_size = 0; |
| |
| // This is the maximum buffer size that is used by WinMmapReadableFile in |
| // unbuffered disk I/O mode. We need to maintain an aligned buffer for |
| // reads. We allow the buffer to grow until the specified value and then |
| // allocate one-shot buffers for bigger requests. In unbuffered mode we |
| // always bypass the read-ahead buffer at ReadaheadRandomAccessFile. |
| // When read-ahead is required we then make use of the |
| // compaction_readahead_size value and always try to read ahead. With |
| // read-ahead we always pre-allocate the buffer to this size instead of |
| // growing it up to a limit. |
| // |
| // This option is currently honored only on Windows |
| // |
| // Default: 1 MB |
| // |
| // Special value: 0 - means do not maintain per instance buffer. Allocate |
| // per request buffer and avoid locking. |
| size_t random_access_max_buffer_size = 1024 * 1024; |
| |
| // This is the maximum buffer size that is used by WritableFileWriter. |
| // On Windows, we need to maintain an aligned buffer for writes. |
| // We allow the buffer to grow until its size hits the limit in buffered |
| // IO and fix the buffer size when using direct IO to ensure alignment of |
| // write requests if the logical sector size is unusual. |
| // |
| // Default: 1024 * 1024 (1 MB) |
| size_t writable_file_max_buffer_size = 1024 * 1024; |
| |
| |
| // Use adaptive mutex, which spins in the user space before resorting |
| // to kernel. This could reduce context switch when the mutex is not |
| // heavily contended. However, if the mutex is hot, we could end up |
| // wasting spin time. |
| // Default: false |
| bool use_adaptive_mutex = false; |
| |
| // Create DBOptions with default values for all fields |
| DBOptions(); |
| // Create DBOptions from Options |
| explicit DBOptions(const Options& options); |
| |
| void Dump(Logger* log) const; |
| |
| // Allows OS to incrementally sync files to disk while they are being |
| // written, asynchronously, in the background. This operation can be used |
| // to smooth out write I/Os over time. Users shouldn't rely on it for |
| // a persistence guarantee. |
| // Issue one request for every bytes_per_sync written. 0 turns it off. |
| // Default: 0 |
| // |
| // You may consider using rate_limiter to regulate write rate to device. |
| // When the rate limiter is enabled, it automatically sets bytes_per_sync |
| // to 1MB. |
| // |
| // This option applies to table files |
| uint64_t bytes_per_sync = 0; |
| |
| // Same as bytes_per_sync, but applies to WAL files |
| // Default: 0, turned off |
| uint64_t wal_bytes_per_sync = 0; |
| |
| // A vector of EventListeners whose callback functions will be called |
| // when specific RocksDB events happen. |
| std::vector<std::shared_ptr<EventListener>> listeners; |
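| |
| // Example (illustrative sketch, not part of the API): logging every |
| // completed flush via an EventListener (declared in rocksdb/listener.h, |
| // already included by this header); FlushLogger is a hypothetical name. |
| // |
| //   class FlushLogger : public rocksdb::EventListener { |
| //    public: |
| //     void OnFlushCompleted(rocksdb::DB* /*db*/, |
| //                           const rocksdb::FlushJobInfo& info) override { |
| //       fprintf(stderr, "flushed %s\n", info.file_path.c_str()); |
| //     } |
| //   }; |
| // |
| //   options.listeners.push_back(std::make_shared<FlushLogger>()); |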
| |
| // If true, then the status of the threads involved in this DB will |
| // be tracked and available via GetThreadList() API. |
| // |
| // Default: false |
| bool enable_thread_tracking = false; |
| |
| // The limited write rate to DB if soft_pending_compaction_bytes_limit or |
| // level0_slowdown_writes_trigger is triggered, or we are writing to the |
| // last mem table allowed and we allow more than 3 mem tables. It is |
| // calculated using size of user write requests before compression. |
| // RocksDB may decide to slow down more if the compaction still |
| // gets behind further. |
| // If the value is 0, we will infer a value from the `rate_limiter` value |
| // if it is not empty, or 16MB if `rate_limiter` is empty. Note that |
| // if users change the rate in `rate_limiter` after DB is opened, |
| // `delayed_write_rate` won't be adjusted. |
| // |
| // Unit: byte per second. |
| // |
| // Default: 0 |
| uint64_t delayed_write_rate = 0; |
| |
| // By default, a single write thread queue is maintained. The thread that |
| // gets to the head of the queue becomes the write batch group leader and is |
| // responsible for writing to the WAL and the memtable for the batch group. |
| // |
| // If enable_pipelined_write is true, separate write thread queues are |
| // maintained for WAL writes and memtable writes. A write thread first enters |
| // the WAL writer queue and then the memtable writer queue. A thread pending |
| // on the WAL writer queue thus only has to wait for previous writers to |
| // finish their WAL writing but not the memtable writing. Enabling the |
| // feature may improve write throughput and reduce latency of the prepare |
| // phase of two-phase commit. |
| // |
| // Default: false |
| bool enable_pipelined_write = false; |
| |
| // If true, allow multi-writers to update mem tables in parallel. |
| // Only some memtable_factory-s support concurrent writes; currently it |
| // is implemented only for SkipListFactory. Concurrent memtable writes |
| // are not compatible with inplace_update_support or filter_deletes. |
| // It is strongly recommended to set enable_write_thread_adaptive_yield |
| // if you are going to use this feature. |
| // |
| // Default: true |
| bool allow_concurrent_memtable_write = true; |
| |
| // If true, threads synchronizing with the write batch group leader will |
| // wait for up to write_thread_max_yield_usec before blocking on a mutex. |
| // This can substantially improve throughput for concurrent workloads, |
| // regardless of whether allow_concurrent_memtable_write is enabled. |
| // |
| // Default: true |
| bool enable_write_thread_adaptive_yield = true; |
| |
| // The maximum number of microseconds that a write operation will use |
| // a yielding spin loop to coordinate with other write threads before |
| // blocking on a mutex. (Assuming write_thread_slow_yield_usec is |
| // set properly) increasing this value is likely to increase RocksDB |
| // throughput at the expense of increased CPU usage. |
| // |
| // Default: 100 |
| uint64_t write_thread_max_yield_usec = 100; |
| |
| // The latency in microseconds after which a std::this_thread::yield |
| // call (sched_yield on Linux) is considered to be a signal that |
| // other processes or threads would like to use the current core. |
| // Increasing this makes writer threads more likely to take CPU |
| // by spinning, which will show up as an increase in the number of |
| // involuntary context switches. |
| // |
| // Default: 3 |
| uint64_t write_thread_slow_yield_usec = 3; |
| |
| // If true, then DB::Open() will not update the statistics used to optimize |
| // compaction decisions by loading table properties from many files. |
| // Turning off this feature will improve DB open time, especially in a |
| // disk-bound environment. |
| // |
| // Default: false |
| bool skip_stats_update_on_db_open = false; |
| |
| // Recovery mode to control the consistency while replaying WAL |
| // Default: kPointInTimeRecovery |
| WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; |
| |
| // if set to false then recovery will fail when a prepared |
| // transaction is encountered in the WAL |
| bool allow_2pc = false; |
| |
| // A global cache for table-level rows. |
| // Default: nullptr (disabled) |
| // Not supported in ROCKSDB_LITE mode! |
| std::shared_ptr<Cache> row_cache = nullptr; |
| |
| #ifndef ROCKSDB_LITE |
| // A filter object supplied to be invoked while processing write-ahead-logs |
| // (WALs) during recovery. The filter provides a way to inspect log |
| // records, ignoring a particular record or skipping replay. |
| // The filter is invoked at startup and is currently invoked from a |
| // single thread. |
| WalFilter* wal_filter = nullptr; |
| #endif // ROCKSDB_LITE |
| |
| // If true, then DB::Open / CreateColumnFamily / DropColumnFamily |
| // / SetOptions will fail if the options file is not detected or not |
| // properly persisted. |
| // |
| // DEFAULT: false |
| bool fail_if_options_file_error = false; |
| |
| // If true, then print malloc stats together with rocksdb.stats |
| // when printing to LOG. |
| // DEFAULT: false |
| bool dump_malloc_stats = false; |
| |
| // By default RocksDB replays WAL logs and flushes them on DB open, which |
| // may create very small SST files. If this option is enabled, RocksDB will |
| // try to avoid (but not guarantee not to) flush during recovery. Also, |
| // existing WAL logs will be kept, so that if a crash happens before the |
| // flush, we still have logs to recover from. |
| // |
| // DEFAULT: false |
| bool avoid_flush_during_recovery = false; |
| |
| // By default RocksDB will flush all memtables on DB close if there is |
| // unpersisted data (i.e. data written with the WAL disabled). The flush can |
| // be skipped to speed up DB close. Unpersisted data WILL BE LOST. |
| // |
| // DEFAULT: false |
| // |
| // Dynamically changeable through SetDBOptions() API. |
| bool avoid_flush_during_shutdown = false; |
| |
| // Set this option to true during creation of the database if you want |
| // to be able to ingest behind (call IngestExternalFile() skipping keys |
| // that already exist, rather than overwriting matching keys). |
| // Setting this option to true has the following effects: |
| // 1) Disables some internal optimizations around SST file compression. |
| // 2) Reserves the bottom-most level for ingested files only. |
| // 3) Note that num_levels should be >= 3 if this option is turned on. |
| // |
| // DEFAULT: false |
| // Immutable. |
| bool allow_ingest_behind = false; |
| |
| // If enabled it uses two queues for writes, one for the ones with |
| // disable_memtable and one for the ones that also write to memtable. This |
| // allows the memtable writes not to lag behind other writes. It can be used |
| // to optimize MySQL 2PC in which only the commits, which are serial, write to |
| // memtable. |
| bool concurrent_prepare = false; |
| |
| // If true, the WAL is not flushed automatically after each write. Instead it |
| // relies on manual invocation of FlushWAL to write the WAL buffer to its |
| // file. |
| bool manual_wal_flush = false; |
| }; |
| |
| // Options to control the behavior of a database (passed to DB::Open) |
| struct Options : public DBOptions, public ColumnFamilyOptions { |
| // Create an Options object with default values for all fields. |
| Options() : DBOptions(), ColumnFamilyOptions() {} |
| |
| Options(const DBOptions& db_options, |
| const ColumnFamilyOptions& column_family_options) |
| : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} |
| |
| // The function recovers options to their values as of version 4.6. |
| Options* OldDefaults(int rocksdb_major_version = 4, |
| int rocksdb_minor_version = 6); |
| |
| void Dump(Logger* log) const; |
| |
| void DumpCFOptions(Logger* log) const; |
| |
| // Some functions that make it easier to optimize RocksDB |
| |
| // Set appropriate parameters for bulk loading. |
| // The reason that this is a function that returns "this" instead of a |
| // constructor is to enable chaining of multiple similar calls in the future. |
| // |
| |
| // All data will be in level 0 without any automatic compaction. |
| // It's recommended to manually call CompactRange(NULL, NULL) before reading |
| // from the database, because otherwise the read can be very slow. |
| Options* PrepareForBulkLoad(); |
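| |
| // Example (illustrative sketch, not part of the API): bulk loading and then |
| // compacting before serving reads; the path is a placeholder. |
| // |
| //   rocksdb::Options options; |
| //   options.PrepareForBulkLoad(); |
| //   options.create_if_missing = true; |
| //   rocksdb::DB* db = nullptr; |
| //   rocksdb::DB::Open(options, "/tmp/bulkdb", &db); |
| //   // ... load data with Put()/Write() ... |
| //   db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr); |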
| |
| // Use this if your DB is very small (like under 1GB) and you don't want to |
| // spend lots of memory for memtables. |
| Options* OptimizeForSmallDb(); |
| }; |
| |
| // |
| // An application can issue a read request (via Get/Iterators) and specify |
| // if that read should process data that ALREADY resides on a specified cache |
| // level. For example, if an application specifies kBlockCacheTier then the |
| // Get call will process data that already resides in the memtable or |
| // the block cache. It will not page in data from the OS cache or data that |
| // resides in storage. |
| enum ReadTier { |
| kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage |
| kBlockCacheTier = 0x1, // data in memtable or block cache |
| kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option |
| // will skip data in memtable. |
| // Note that this ReadTier currently only supports |
| // Get and MultiGet and does not support iterators. |
| kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators. |
| }; |
| |
| // Options that control read operations |
| struct ReadOptions { |
| // If "snapshot" is non-nullptr, read as of the supplied snapshot |
| // (which must belong to the DB that is being read and which must |
| // not have been released). If "snapshot" is nullptr, use an implicit |
| // snapshot of the state at the beginning of this read operation. |
| // Default: nullptr |
| const Snapshot* snapshot; |
| |
| // "iterate_upper_bound" defines the extent upto which the forward iterator |
| // can returns entries. Once the bound is reached, Valid() will be false. |
| // "iterate_upper_bound" is exclusive ie the bound value is |
| // not a valid entry. If iterator_extractor is not null, the Seek target |
| // and iterator_upper_bound need to have the same prefix. |
| // This is because ordering is not guaranteed outside of prefix domain. |
| // There is no lower bound on the iterator. If needed, that can be easily |
| // implemented. |
| // |
| // Default: nullptr |
| const Slice* iterate_upper_bound; |
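| |
| // Example (illustrative sketch, not part of the API): scanning the key range |
| // ["user1000", "user2000"); the upper-bound Slice must outlive the iterator, |
| // and db is an open DB handle. |
| // |
| //   rocksdb::Slice upper("user2000"); |
| //   rocksdb::ReadOptions ro; |
| //   ro.iterate_upper_bound = &upper; |
| //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro)); |
| //   for (it->Seek("user1000"); it->Valid(); it->Next()) { |
| //     // every key returned here is < "user2000" |
| //   } |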
| |
| // If non-zero, NewIterator will create a new table reader which |
| // performs reads of the given size. Using a large size (> 2MB) can |
| // improve the performance of forward iteration on spinning disks. |
| // Default: 0 |
| size_t readahead_size; |
| |
| // A threshold for the number of keys that can be skipped before failing an |
| // iterator seek as incomplete. The default value of 0 means a request is |
| // never failed as incomplete, no matter how many keys are skipped. |
| // Default: 0 |
| uint64_t max_skippable_internal_keys; |
| |
| // Specify if this read request should process data that ALREADY |
| // resides on a particular cache. If the required data is not |
| // found at the specified cache, then Status::Incomplete is returned. |
| // Default: kReadAllTier |
| ReadTier read_tier; |
| |
| // If true, all data read from underlying storage will be |
| // verified against corresponding checksums. |
| // Default: true |
| bool verify_checksums; |
| |
| // Should the "data block"/"index block"/"filter block" read for this |
| // iteration be cached in memory? |
| // Callers may wish to set this field to false for bulk scans. |
| // Default: true |
| bool fill_cache; |
| |
| // Specify to create a tailing iterator -- a special iterator that has a |
| // view of the complete database (i.e. it can also be used to read newly |
| // added data) and is optimized for sequential reads. It will return records |
| // that were inserted into the database after the creation of the iterator. |
| // Default: false |
| // Not supported in ROCKSDB_LITE mode! |
| bool tailing; |
| |
| // Specify to create a managed iterator -- a special iterator that |
| // uses less resources by having the ability to free its underlying |
| // resources on request. |
| // Default: false |
| // Not supported in ROCKSDB_LITE mode! |
| bool managed; |
| |
| // Enable a total order seek regardless of index format (e.g. hash index) |
| // used in the table. Some table formats (e.g. plain table) may not support |
| // this option. |
| // If true when calling Get(), we also skip prefix bloom when reading from |
| // block based table. It provides a way to read existing data after |
| // changing implementation of prefix extractor. |
| bool total_order_seek; |
| |
| // Enforce that the iterator only iterates over the same prefix as the seek. |
| // This option is effective only for prefix seeks, i.e. prefix_extractor is |
| // non-null for the column family and total_order_seek is false. Unlike |
| // iterate_upper_bound, prefix_same_as_start only works within a prefix |
| // but in both directions. |
| // Default: false |
| bool prefix_same_as_start; |
| |
| // Keep the blocks loaded by the iterator pinned in memory as long as the |
| // iterator is not deleted. If used when reading from tables created with |
| // BlockBasedTableOptions::use_delta_encoding = false, the |
| // iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to |
| // return 1. |
| // Default: false |
| bool pin_data; |
| |
| // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we |
| // schedule a background job in the flush job queue and delete obsolete files |
| // in background. |
| // Default: false |
| bool background_purge_on_iterator_cleanup; |
| |
| // If true, keys deleted using the DeleteRange() API will be visible to |
| // readers until they are naturally deleted during compaction. This improves |
| // read performance in DBs with many range deletions. |
| // Default: false |
| bool ignore_range_deletions; |
| |
| ReadOptions(); |
| ReadOptions(bool cksum, bool cache); |
| }; |
| |
| // Options that control write operations |
| struct WriteOptions { |
| // If true, the write will be flushed from the operating system |
| // buffer cache (by calling WritableFile::Sync()) before the write |
| // is considered complete. If this flag is true, writes will be |
| // slower. |
| // |
| // If this flag is false, and the machine crashes, some recent |
| // writes may be lost. Note that if it is just the process that |
| // crashes (i.e., the machine does not reboot), no writes will be |
| // lost even if sync==false. |
| // |
| // In other words, a DB write with sync==false has similar |
| // crash semantics as the "write()" system call. A DB write |
| // with sync==true has similar crash semantics to a "write()" |
| // system call followed by "fdatasync()". |
| // |
| // Default: false |
| bool sync; |
| |
| // If true, writes will not first go to the write-ahead log, |
| // and the write may be lost after a crash. |
| bool disableWAL; |
| |
| // If true and if user is trying to write to column families that don't exist |
| // (they were dropped), ignore the write (don't return an error). If there |
| // are multiple writes in a WriteBatch, other writes will succeed. |
| // Default: false |
| bool ignore_missing_column_families; |
| |
| // If true and the write request would need to wait or sleep, it fails |
| // immediately with Status::Incomplete(). |
| bool no_slowdown; |
| |
| // If true, this write request is of lower priority if compaction is |
| // behind. In that case, if no_slowdown is also true, the request will be |
| // cancelled immediately with Status::Incomplete() returned. Otherwise, it |
| // will be slowed down. The slowdown value is determined by RocksDB to |
| // guarantee it introduces minimal impact on high-priority writes. |
| // |
| // Default: false |
| bool low_pri; |
| |
| WriteOptions() |
| : sync(false), |
| disableWAL(false), |
| ignore_missing_column_families(false), |
| no_slowdown(false), |
| low_pri(false) {} |
| }; |
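| |
| // Example (illustrative sketch, not part of the API): a durable write versus |
| // a fast, WAL-less write (db is an open DB handle). |
| // |
| //   rocksdb::WriteOptions durable; |
| //   durable.sync = true; |
| //   db->Put(durable, "key1", "value1"); |
| // |
| //   rocksdb::WriteOptions fast; |
| //   fast.disableWAL = true;  // may be lost if the process crashes |
| //   db->Put(fast, "key2", "value2"); |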
| |
| // Options that control flush operations |
| struct FlushOptions { |
| // If true, the flush will wait until the flush is done. |
| // Default: true |
| bool wait; |
| |
| FlushOptions() : wait(true) {} |
| }; |
| |
| // Create a Logger from provided DBOptions |
| extern Status CreateLoggerFromOptions(const std::string& dbname, |
| const DBOptions& options, |
| std::shared_ptr<Logger>* logger); |
| |
| // CompactionOptions are used in CompactFiles() call. |
| struct CompactionOptions { |
| // Compaction output compression type |
| // Default: snappy |
| CompressionType compression; |
| // Compaction will create files of size `output_file_size_limit`. |
| // Default: MAX, which means that compaction will create a single file |
| uint64_t output_file_size_limit; |
| |
| CompactionOptions() |
| : compression(kSnappyCompression), |
| output_file_size_limit(std::numeric_limits<uint64_t>::max()) {} |
| }; |
| |
| // For level based compaction, we can configure if we want to skip/force |
| // bottommost level compaction. |
| enum class BottommostLevelCompaction { |
| // Skip bottommost level compaction |
| kSkip, |
| // Only compact bottommost level if there is a compaction filter |
| // This is the default option |
| kIfHaveCompactionFilter, |
| // Always compact bottommost level |
| kForce, |
| }; |
| |
| // CompactRangeOptions is used by CompactRange() call. |
| struct CompactRangeOptions { |
| // If true, no other compaction will run at the same time as this |
| // manual compaction |
| bool exclusive_manual_compaction = true; |
| // If true, compacted files will be moved to the minimum level capable |
| // of holding the data, or to the given level (the specified non-negative |
| // target_level). |
| bool change_level = false; |
| // If change_level is true and target_level has a non-negative value, |
| // compacted files will be moved to target_level. |
| int target_level = -1; |
| // Compaction outputs will be placed in options.db_paths[target_path_id]. |
| // Behavior is undefined if target_path_id is out of range. |
| uint32_t target_path_id = 0; |
| // By default level based compaction will only compact the bottommost level |
| // if there is a compaction filter |
| BottommostLevelCompaction bottommost_level_compaction = |
| BottommostLevelCompaction::kIfHaveCompactionFilter; |
| }; |
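| |
| // Example (illustrative sketch, not part of the API): compacting the whole |
| // key space and letting the output settle in the lowest level that can hold |
| // it (db is an open DB handle). |
| // |
| //   rocksdb::CompactRangeOptions cro; |
| //   cro.change_level = true; |
| //   db->CompactRange(cro, nullptr, nullptr); |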
| |
| // IngestExternalFileOptions is used by IngestExternalFile() |
| struct IngestExternalFileOptions { |
| // Can be set to true to move the files instead of copying them. |
| bool move_files = false; |
| // If set to false, an ingested file's keys could appear in existing |
| // snapshots that were created before the file was ingested. |
| bool snapshot_consistency = true; |
| // If set to false, IngestExternalFile() will fail if the file key range |
| // overlaps with existing keys or tombstones in the DB. |
| bool allow_global_seqno = true; |
| // If set to false and the file key range overlaps with the memtable key range |
| // (memtable flush required), IngestExternalFile will fail. |
| bool allow_blocking_flush = true; |
| // Set to true if you would like duplicate keys in the file being ingested |
| // to be skipped rather than overwriting existing data under that key. |
| // Use case: back-filling some historical data into the database without |
| // overwriting an existing newer version of the data. |
| // This option can only be used if the DB has been running |
| // with allow_ingest_behind=true since the dawn of time. |
| // All files will be ingested at the bottommost level with seqno=0. |
| bool ingest_behind = false; |
| }; |
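| |
| // Example (illustrative sketch, not part of the API): ingesting a pre-built |
| // SST file by moving it into the DB. "/tmp/batch.sst" is a placeholder; such |
| // a file would typically be produced with rocksdb::SstFileWriter, and db is |
| // an open DB handle. |
| // |
| //   rocksdb::IngestExternalFileOptions ifo; |
| //   ifo.move_files = true; |
| //   rocksdb::Status s = db->IngestExternalFile({"/tmp/batch.sst"}, ifo); |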
| |
| } // namespace rocksdb |
| |
| #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ |