| // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
| // This source code is licensed under both the GPLv2 (found in the |
| // COPYING file in the root directory) and Apache 2.0 License |
| // (found in the LICENSE.Apache file in the root directory). |
| // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| |
| #pragma once |
| |
| #include <memory> |
| |
| #include "rocksdb/memtablerep.h" |
| #include "rocksdb/universal_compaction.h" |
| |
| namespace rocksdb { |
| |
// Forward declarations. None of these types is defined in this header; only
// their names are introduced here.
class Slice;
class SliceTransform;
class TableFactory;
class TablePropertiesCollectorFactory;

struct Options;

// Opaque enum declaration: the underlying type is fixed to unsigned char, so
// the enum can be used as a complete type without its enumerator list.
enum CompressionType : unsigned char;
| |
// Available compaction styles. Stored in a single char.
enum CompactionStyle : char {
  // Level based compaction style.
  kCompactionStyleLevel = 0,

  // Universal compaction style.
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 1,

  // FIFO compaction style.
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleFIFO = 2,

  // Background compaction is disabled; compaction jobs are submitted via
  // CompactFiles().
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleNone = 3,
};
| |
// For level-based compaction: determines which file of a level is picked to
// be merged into the next level. We suggest trying kMinOverlappingRatio first
// when you tune your database.
enum CompactionPri : char {
  // Slightly prioritize larger files, by size compensated by #deletes.
  kByCompensatedSize = 0,

  // First compact files whose data's latest update time is oldest.
  // Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 1,

  // First compact files whose range hasn't been compacted to the next level
  // for the longest. If your updates are random across the key space,
  // write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 2,

  // First compact files whose ratio between the overlapping size in the next
  // level and their own size is the smallest. In many cases this can optimize
  // write amplification.
  kMinOverlappingRatio = 3,
};
| |
// Options specific to the FIFO compaction style.
struct CompactionOptionsFIFO {
  // Once the total size of all table files reaches this value, the oldest
  // table file is deleted.
  // Default: 1GB
  uint64_t max_table_files_size = 1024ull * 1024ull * 1024ull;

  // Drop files older than TTL. TTL based deletion takes precedence over
  // size based deletion if ttl > 0.
  // A file is deleted if sst_file_creation_time < (current_time - ttl).
  // Unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
  // Default: 0 (disabled)
  uint64_t ttl = 0;

  // If true, try to compact smaller files into larger ones.
  // The minimum number of files to compact follows
  // options.level0_file_num_compaction_trigger, and compaction won't trigger
  // if the average compacted bytes per deleted file is larger than
  // options.write_buffer_size. This protects large files from being
  // compacted again.
  // Default: false
  bool allow_compaction = false;

  // Uses the default documented on each field.
  CompactionOptionsFIFO() {}

  // Sets every field explicitly; `_ttl` may be omitted and is then 0.
  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction,
                        uint64_t _ttl = 0)
      : max_table_files_size(_max_table_files_size),
        ttl(_ttl),
        allow_compaction(_allow_compaction) {}
};
| |
// Compression options for different compression algorithms like Zlib.
struct CompressionOptions {
  // Default: -14
  int window_bits = -14;

  // Default: -1
  int level = -1;

  // Default: 0
  int strategy = 0;

  // Maximum size of the dictionary used to prime the compression library.
  // Currently this dictionary is constructed by sampling the first output
  // file in a subcompaction when the target level is bottommost. The
  // dictionary is loaded into the compression library before
  // compressing/uncompressing each data block of subsequent files in the
  // subcompaction. Effectively, this improves compression ratios when there
  // are repetitions across data blocks.
  // A value of 0 indicates the feature is disabled.
  // Default: 0.
  uint32_t max_dict_bytes = 0;

  // Uses the default documented on each field.
  CompressionOptions() {}

  // Sets every field explicitly. `_max_dict_bytes` is taken as an int and
  // converted to the unsigned field type.
  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes)
      : window_bits(wbits),
        level(_lev),
        strategy(_strategy),
        max_dict_bytes(static_cast<uint32_t>(_max_dict_bytes)) {}
};
| |
// Return status for the inplace update callback.
enum UpdateStatus {
  // Nothing to update.
  UPDATE_FAILED = 0,
  // Value updated inplace.
  UPDATED_INPLACE = 1,
  // No inplace update. Merged value set.
  UPDATED = 2,
};
| |
| |
| struct AdvancedColumnFamilyOptions { |
| // The maximum number of write buffers that are built up in memory. |
| // The default and the minimum number is 2, so that when 1 write buffer |
| // is being flushed to storage, new writes can continue to the other |
| // write buffer. |
| // If max_write_buffer_number > 3, writing will be slowed down to |
| // options.delayed_write_rate if we are writing to the last write buffer |
| // allowed. |
| // |
| // Default: 2 |
| // |
| // Dynamically changeable through SetOptions() API |
| int max_write_buffer_number = 2; |
| |
| // The minimum number of write buffers that will be merged together |
| // before writing to storage. If set to 1, then |
| // all write buffers are flushed to L0 as individual files and this increases |
| // read amplification because a get request has to check in all of these |
| // files. Also, an in-memory merge may result in writing lesser |
| // data to storage if there are duplicate records in each of these |
| // individual write buffers. Default: 1 |
| int min_write_buffer_number_to_merge = 1; |
| |
| // The total maximum number of write buffers to maintain in memory including |
| // copies of buffers that have already been flushed. Unlike |
| // max_write_buffer_number, this parameter does not affect flushing. |
| // This controls the minimum amount of write history that will be available |
| // in memory for conflict checking when Transactions are used. |
| // |
| // When using an OptimisticTransactionDB: |
| // If this value is too low, some transactions may fail at commit time due |
| // to not being able to determine whether there were any write conflicts. |
| // |
| // When using a TransactionDB: |
| // If Transaction::SetSnapshot is used, TransactionDB will read either |
| // in-memory write buffers or SST files to do write-conflict checking. |
| // Increasing this value can reduce the number of reads to SST files |
| // done for conflict detection. |
| // |
| // Setting this value to 0 will cause write buffers to be freed immediately |
| // after they are flushed. |
| // If this value is set to -1, 'max_write_buffer_number' will be used. |
| // |
| // Default: |
| // If using a TransactionDB/OptimisticTransactionDB, the default value will |
| // be set to the value of 'max_write_buffer_number' if it is not explicitly |
| // set by the user. Otherwise, the default is 0. |
| int max_write_buffer_number_to_maintain = 0; |
| |
| // Allows thread-safe inplace updates. If this is true, there is no way to |
| // achieve point-in-time consistency using snapshot or iterator (assuming |
| // concurrent updates). Hence iterator and multi-get will return results |
| // which are not consistent as of any point-in-time. |
| // If inplace_callback function is not set, |
| // Put(key, new_value) will update inplace the existing_value iff |
| // * key exists in current memtable |
| // * new sizeof(new_value) <= sizeof(existing_value) |
| // * existing_value for that key is a put i.e. kTypeValue |
| // If inplace_callback function is set, check doc for inplace_callback. |
| // Default: false. |
| bool inplace_update_support = false; |
| |
| // Number of locks used for inplace update |
| // Default: 10000, if inplace_update_support = true, else 0. |
| // |
| // Dynamically changeable through SetOptions() API |
| size_t inplace_update_num_locks = 10000; |
| |
| // existing_value - pointer to previous value (from both memtable and sst). |
| // nullptr if key doesn't exist |
| // existing_value_size - pointer to size of existing_value). |
| // nullptr if key doesn't exist |
| // delta_value - Delta value to be merged with the existing_value. |
| // Stored in transaction logs. |
| // merged_value - Set when delta is applied on the previous value. |
| |
| // Applicable only when inplace_update_support is true, |
| // this callback function is called at the time of updating the memtable |
| // as part of a Put operation, lets say Put(key, delta_value). It allows the |
| // 'delta_value' specified as part of the Put operation to be merged with |
| // an 'existing_value' of the key in the database. |
| |
| // If the merged value is smaller in size that the 'existing_value', |
| // then this function can update the 'existing_value' buffer inplace and |
| // the corresponding 'existing_value'_size pointer, if it wishes to. |
| // The callback should return UpdateStatus::UPDATED_INPLACE. |
| // In this case. (In this case, the snapshot-semantics of the rocksdb |
| // Iterator is not atomic anymore). |
| |
| // If the merged value is larger in size than the 'existing_value' or the |
| // application does not wish to modify the 'existing_value' buffer inplace, |
| // then the merged value should be returned via *merge_value. It is set by |
| // merging the 'existing_value' and the Put 'delta_value'. The callback should |
| // return UpdateStatus::UPDATED in this case. This merged value will be added |
| // to the memtable. |
| |
| // If merging fails or the application does not wish to take any action, |
| // then the callback should return UpdateStatus::UPDATE_FAILED. |
| |
| // Please remember that the original call from the application is Put(key, |
| // delta_value). So the transaction log (if enabled) will still contain (key, |
| // delta_value). The 'merged_value' is not stored in the transaction log. |
| // Hence the inplace_callback function should be consistent across db reopens. |
| |
| // Default: nullptr |
| UpdateStatus (*inplace_callback)(char* existing_value, |
| uint32_t* existing_value_size, |
| Slice delta_value, |
| std::string* merged_value) = nullptr; |
| |
| // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, |
| // create prefix bloom for memtable with the size of |
| // write_buffer_size * memtable_prefix_bloom_size_ratio. |
| // If it is larger than 0.25, it is santinized to 0.25. |
| // |
| // Default: 0 (disable) |
| // |
| // Dynamically changeable through SetOptions() API |
| double memtable_prefix_bloom_size_ratio = 0.0; |
| |
| // Page size for huge page for the arena used by the memtable. If <=0, it |
| // won't allocate from huge page but from malloc. |
| // Users are responsible to reserve huge pages for it to be allocated. For |
| // example: |
| // sysctl -w vm.nr_hugepages=20 |
| // See linux doc Documentation/vm/hugetlbpage.txt |
| // If there isn't enough free huge page available, it will fall back to |
| // malloc. |
| // |
| // Dynamically changeable through SetOptions() API |
| size_t memtable_huge_page_size = 0; |
| |
| // If non-nullptr, memtable will use the specified function to extract |
| // prefixes for keys, and for each prefix maintain a hint of insert location |
| // to reduce CPU usage for inserting keys with the prefix. Keys out of |
| // domain of the prefix extractor will be insert without using hints. |
| // |
| // Currently only the default skiplist based memtable implements the feature. |
| // All other memtable implementation will ignore the option. It incurs ~250 |
| // additional bytes of memory overhead to store a hint for each prefix. |
| // Also concurrent writes (when allow_concurrent_memtable_write is true) will |
| // ignore the option. |
| // |
| // The option is best suited for workloads where keys will likely to insert |
| // to a location close the last inserted key with the same prefix. |
| // One example could be inserting keys of the form (prefix + timestamp), |
| // and keys of the same prefix always comes in with time order. Another |
| // example would be updating the same key over and over again, in which case |
| // the prefix can be the key itself. |
| // |
| // Default: nullptr (disable) |
| std::shared_ptr<const SliceTransform> |
| memtable_insert_with_hint_prefix_extractor = nullptr; |
| |
| // Control locality of bloom filter probes to improve cache miss rate. |
| // This option only applies to memtable prefix bloom and plaintable |
| // prefix bloom. It essentially limits every bloom checking to one cache line. |
| // This optimization is turned off when set to 0, and positive number to turn |
| // it on. |
| // Default: 0 |
| uint32_t bloom_locality = 0; |
| |
| // size of one block in arena memory allocation. |
| // If <= 0, a proper value is automatically calculated (usually 1/8 of |
| // writer_buffer_size, rounded up to a multiple of 4KB). |
| // |
| // There are two additional restriction of the specified size: |
| // (1) size should be in the range of [4096, 2 << 30] and |
| // (2) be the multiple of the CPU word (which helps with the memory |
| // alignment). |
| // |
| // We'll automatically check and adjust the size number to make sure it |
| // conforms to the restrictions. |
| // |
| // Default: 0 |
| // |
| // Dynamically changeable through SetOptions() API |
| size_t arena_block_size = 0; |
| |
| // Different levels can have different compression policies. There |
| // are cases where most lower levels would like to use quick compression |
| // algorithms while the higher levels (which have more data) use |
| // compression algorithms that have better compression but could |
| // be slower. This array, if non-empty, should have an entry for |
| // each level of the database; these override the value specified in |
| // the previous field 'compression'. |
| // |
| // NOTICE if level_compaction_dynamic_level_bytes=true, |
| // compression_per_level[0] still determines L0, but other elements |
| // of the array are based on base level (the level L0 files are merged |
| // to), and may not match the level users see from info log for metadata. |
| // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] |
| // determines compaction type for level n+i-1. |
| // For example, if we have three 5 levels, and we determine to merge L0 |
| // data to L4 (which means L1..L3 will be empty), then the new files go to |
| // L4 uses compression type compression_per_level[1]. |
| // If now L0 is merged to L2. Data goes to L2 will be compressed |
| // according to compression_per_level[1], L3 using compression_per_level[2] |
| // and L4 using compression_per_level[3]. Compaction for each level can |
| // change when data grows. |
| std::vector<CompressionType> compression_per_level; |
| |
| // Number of levels for this database |
| int num_levels = 7; |
| |
| // Soft limit on number of level-0 files. We start slowing down writes at this |
| // point. A value <0 means that no writing slow down will be triggered by |
| // number of files in level-0. |
| // |
| // Default: 20 |
| // |
| // Dynamically changeable through SetOptions() API |
| int level0_slowdown_writes_trigger = 20; |
| |
| // Maximum number of level-0 files. We stop writes at this point. |
| // |
| // Default: 36 |
| // |
| // Dynamically changeable through SetOptions() API |
| int level0_stop_writes_trigger = 36; |
| |
| // Target file size for compaction. |
| // target_file_size_base is per-file size for level-1. |
| // Target file size for level L can be calculated by |
| // target_file_size_base * (target_file_size_multiplier ^ (L-1)) |
| // For example, if target_file_size_base is 2MB and |
| // target_file_size_multiplier is 10, then each file on level-1 will |
| // be 2MB, and each file on level 2 will be 20MB, |
| // and each file on level-3 will be 200MB. |
| // |
| // Default: 64MB. |
| // |
| // Dynamically changeable through SetOptions() API |
| uint64_t target_file_size_base = 64 * 1048576; |
| |
| // By default target_file_size_multiplier is 1, which means |
| // by default files in different levels will have similar size. |
| // |
| // Dynamically changeable through SetOptions() API |
| int target_file_size_multiplier = 1; |
| |
| // If true, RocksDB will pick target size of each level dynamically. |
| // We will pick a base level b >= 1. L0 will be directly merged into level b, |
| // instead of always into level 1. Level 1 to b-1 need to be empty. |
| // We try to pick b and its target size so that |
| // 1. target size is in the range of |
| // (max_bytes_for_level_base / max_bytes_for_level_multiplier, |
| // max_bytes_for_level_base] |
| // 2. target size of the last level (level num_levels-1) equals to extra size |
| // of the level. |
| // At the same time max_bytes_for_level_multiplier and |
| // max_bytes_for_level_multiplier_additional are still satisfied. |
| // |
| // With this option on, from an empty DB, we make last level the base level, |
| // which means merging L0 data into the last level, until it exceeds |
| // max_bytes_for_level_base. And then we make the second last level to be |
| // base level, to start to merge L0 data to second last level, with its |
| // target size to be 1/max_bytes_for_level_multiplier of the last level's |
| // extra size. After the data accumulates more so that we need to move the |
| // base level to the third last one, and so on. |
| // |
| // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, |
| // and max_bytes_for_level_base=10MB. |
| // Target sizes of level 1 to 5 starts with: |
| // [- - - - 10MB] |
| // with base level is level. Target sizes of level 1 to 4 are not applicable |
| // because they will not be used. |
| // Until the size of Level 5 grows to more than 10MB, say 11MB, we make |
| // base target to level 4 and now the targets looks like: |
| // [- - - 1.1MB 11MB] |
| // While data are accumulated, size targets are tuned based on actual data |
| // of level 5. When level 5 has 50MB of data, the target is like: |
| // [- - - 5MB 50MB] |
| // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep |
| // level 4 to be the base level, its target size needs to be 10.1MB, which |
| // doesn't satisfy the target size range. So now we make level 3 the target |
| // size and the target sizes of the levels look like: |
| // [- - 1.01MB 10.1MB 101MB] |
| // In the same way, while level 5 further grows, all levels' targets grow, |
| // like |
| // [- - 5MB 50MB 500MB] |
| // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the |
| // base level and make levels' target sizes like this: |
| // [- 1.001MB 10.01MB 100.1MB 1001MB] |
| // and go on... |
| // |
| // By doing it, we give max_bytes_for_level_multiplier a priority against |
| // max_bytes_for_level_base, for a more predictable LSM tree shape. It is |
| // useful to limit worse case space amplification. |
| // |
| // max_bytes_for_level_multiplier_additional is ignored with this flag on. |
| // |
| // Turning this feature on or off for an existing DB can cause unexpected |
| // LSM tree structure so it's not recommended. |
| // |
| // NOTE: this option is experimental |
| // |
| // Default: false |
| bool level_compaction_dynamic_level_bytes = false; |
| |
| // Default: 10. |
| // |
| // Dynamically changeable through SetOptions() API |
| double max_bytes_for_level_multiplier = 10; |
| |
| // Different max-size multipliers for different levels. |
| // These are multiplied by max_bytes_for_level_multiplier to arrive |
| // at the max-size of each level. |
| // |
| // Default: 1 |
| // |
| // Dynamically changeable through SetOptions() API |
| std::vector<int> max_bytes_for_level_multiplier_additional = |
| std::vector<int>(num_levels, 1); |
| |
| // We try to limit number of bytes in one compaction to be lower than this |
| // threshold. But it's not guaranteed. |
| // Value 0 will be sanitized. |
| // |
| // Default: result.target_file_size_base * 25 |
| uint64_t max_compaction_bytes = 0; |
| |
| // All writes will be slowed down to at least delayed_write_rate if estimated |
| // bytes needed to be compaction exceed this threshold. |
| // |
| // Default: 64GB |
| uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; |
| |
| // All writes are stopped if estimated bytes needed to be compaction exceed |
| // this threshold. |
| // |
| // Default: 256GB |
| uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; |
| |
| // The compaction style. Default: kCompactionStyleLevel |
| CompactionStyle compaction_style = kCompactionStyleLevel; |
| |
| // If level compaction_style = kCompactionStyleLevel, for each level, |
| // which files are prioritized to be picked to compact. |
| // Default: kByCompensatedSize |
| CompactionPri compaction_pri = kByCompensatedSize; |
| |
| // The options needed to support Universal Style compactions |
| CompactionOptionsUniversal compaction_options_universal; |
| |
| // The options for FIFO compaction style |
| CompactionOptionsFIFO compaction_options_fifo; |
| |
| // An iteration->Next() sequentially skips over keys with the same |
| // user-key unless this option is set. This number specifies the number |
| // of keys (with the same userkey) that will be sequentially |
| // skipped before a reseek is issued. |
| // |
| // Default: 8 |
| // |
| // Dynamically changeable through SetOptions() API |
| uint64_t max_sequential_skip_in_iterations = 8; |
| |
| // This is a factory that provides MemTableRep objects. |
| // Default: a factory that provides a skip-list-based implementation of |
| // MemTableRep. |
| std::shared_ptr<MemTableRepFactory> memtable_factory = |
| std::shared_ptr<SkipListFactory>(new SkipListFactory); |
| |
| // Block-based table related options are moved to BlockBasedTableOptions. |
| // Related options that were originally here but now moved include: |
| // no_block_cache |
| // block_cache |
| // block_cache_compressed |
| // block_size |
| // block_size_deviation |
| // block_restart_interval |
| // filter_policy |
| // whole_key_filtering |
| // If you'd like to customize some of these options, you will need to |
| // use NewBlockBasedTableFactory() to construct a new table factory. |
| |
| // This option allows user to collect their own interested statistics of |
| // the tables. |
| // Default: empty vector -- no user-defined statistics collection will be |
| // performed. |
| typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> |
| TablePropertiesCollectorFactories; |
| TablePropertiesCollectorFactories table_properties_collector_factories; |
| |
| // Maximum number of successive merge operations on a key in the memtable. |
| // |
| // When a merge operation is added to the memtable and the maximum number of |
| // successive merges is reached, the value of the key will be calculated and |
| // inserted into the memtable instead of the merge operation. This will |
| // ensure that there are never more than max_successive_merges merge |
| // operations in the memtable. |
| // |
| // Default: 0 (disabled) |
| // |
| // Dynamically changeable through SetOptions() API |
| size_t max_successive_merges = 0; |
| |
| // This flag specifies that the implementation should optimize the filters |
| // mainly for cases where keys are found rather than also optimize for keys |
| // missed. This would be used in cases where the application knows that |
| // there are very few misses or the performance in the case of misses is not |
| // important. |
| // |
| // For now, this flag allows us to not store filters for the last level i.e |
| // the largest level which contains data of the LSM store. For keys which |
| // are hits, the filters in this level are not useful because we will search |
| // for the data anyway. NOTE: the filters in other levels are still useful |
| // even for key hit because they tell us whether to look in that level or go |
| // to the higher level. |
| // |
| // Default: false |
| bool optimize_filters_for_hits = false; |
| |
| // After writing every SST file, reopen it and read all the keys. |
| // Default: false |
| bool paranoid_file_checks = false; |
| |
| // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM |
| // change (Flush, Compaction, AddFile). These checks are disabled in release |
| // mode, use this option to enable them in release mode as well. |
| // Default: false |
| bool force_consistency_checks = false; |
| |
| // Measure IO stats in compactions and flushes, if true. |
| // Default: false |
| bool report_bg_io_stats = false; |
| |
| // Create ColumnFamilyOptions with default values for all fields |
| AdvancedColumnFamilyOptions(); |
| // Create ColumnFamilyOptions from Options |
| explicit AdvancedColumnFamilyOptions(const Options& options); |
| |
| // ---------------- OPTIONS NOT SUPPORTED ANYMORE ---------------- |
| |
| // NOT SUPPORTED ANYMORE |
| // This does not do anything anymore. |
| int max_mem_compaction_level; |
| |
| // NOT SUPPORTED ANYMORE -- this options is no longer used |
| // Puts are delayed to options.delayed_write_rate when any level has a |
| // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. |
| // |
| // Default: 0 (disabled) |
| // |
| // Dynamically changeable through SetOptions() API |
| double soft_rate_limit = 0.0; |
| |
| // NOT SUPPORTED ANYMORE -- this options is no longer used |
| double hard_rate_limit = 0.0; |
| |
| // NOT SUPPORTED ANYMORE -- this options is no longer used |
| unsigned int rate_limit_delay_max_milliseconds = 100; |
| |
| // NOT SUPPORTED ANYMORE |
| // Does not have any effect. |
| bool purge_redundant_kvs_while_flush = true; |
| }; |
| |
| } // namespace rocksdb |