| // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
| // This source code is licensed under both the GPLv2 (found in the |
| // COPYING file in the root directory) and Apache 2.0 License |
| // (found in the LICENSE.Apache file in the root directory). |
| // |
| // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| // |
| // The representation of a DBImpl consists of a set of Versions. The |
| // newest version is called "current". Older versions may be kept |
| // around to provide a consistent view to live iterators. |
| // |
| // Each Version keeps track of a set of Table files per level. The |
| // entire set of versions is maintained in a VersionSet. |
| // |
| // Version,VersionSet are thread-compatible, but require external |
| // synchronization on all accesses. |
| |
| #pragma once |
| #include <atomic> |
| #include <deque> |
| #include <limits> |
| #include <map> |
| #include <memory> |
| #include <set> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "db/column_family.h" |
| #include "db/compaction.h" |
| #include "db/compaction_picker.h" |
| #include "db/dbformat.h" |
| #include "db/file_indexer.h" |
| #include "db/log_reader.h" |
| #include "db/range_del_aggregator.h" |
| #include "db/table_cache.h" |
| #include "db/version_builder.h" |
| #include "db/version_edit.h" |
| #include "db/write_controller.h" |
| #include "monitoring/instrumented_mutex.h" |
| #include "options/db_options.h" |
| #include "port/port.h" |
| #include "rocksdb/env.h" |
| |
| namespace rocksdb { |
| |
// Forward declarations: these types are only named (by pointer or reference)
// in the declarations below.
namespace log {
class Writer;
}  // namespace log

// Column families, caches and memory.
class ColumnFamilySet;
class MemTable;
class TableCache;
class WriteBufferManager;

// Compaction, iteration and lookup helpers.
class Compaction;
class InternalIterator;
class LogBuffer;
class LookupKey;
class MergeContext;
class MergeIteratorBuilder;

// Declared in full later in this header.
class Version;
class VersionSet;
| |
| // Return the smallest index i such that file_level.files[i]->largest >= key. |
| // Return file_level.num_files if there is no such file. |
| // REQUIRES: "file_level.files" contains a sorted list of |
| // non-overlapping files. |
| extern int FindFile(const InternalKeyComparator& icmp, |
| const LevelFilesBrief& file_level, const Slice& key); |
| |
| // Returns true iff some file in "files" overlaps the user key range |
| // [*smallest,*largest]. |
| // smallest==nullptr represents a key smaller than all keys in the DB. |
| // largest==nullptr represents a key largest than all keys in the DB. |
| // REQUIRES: If disjoint_sorted_files, file_level.files[] |
| // contains disjoint ranges in sorted order. |
| extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, |
| bool disjoint_sorted_files, |
| const LevelFilesBrief& file_level, |
| const Slice* smallest_user_key, |
| const Slice* largest_user_key); |
| |
| // Generate LevelFilesBrief from vector<FdWithKeyRange*> |
| // Would copy smallest_key and largest_key data to sequential memory |
| // arena: Arena used to allocate the memory |
| extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, |
| const std::vector<FileMetaData*>& files, |
| Arena* arena); |
| |
| class VersionStorageInfo { |
| public: |
| VersionStorageInfo(const InternalKeyComparator* internal_comparator, |
| const Comparator* user_comparator, int num_levels, |
| CompactionStyle compaction_style, |
| VersionStorageInfo* src_vstorage, |
| bool _force_consistency_checks); |
| ~VersionStorageInfo(); |
| |
| void Reserve(int level, size_t size) { files_[level].reserve(size); } |
| |
| void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); |
| |
| void SetFinalized(); |
| |
| // Update num_non_empty_levels_. |
| void UpdateNumNonEmptyLevels(); |
| |
| void GenerateFileIndexer() { |
| file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); |
| } |
| |
| // Update the accumulated stats from a file-meta. |
| void UpdateAccumulatedStats(FileMetaData* file_meta); |
| |
| // Decrease the current stat form a to-be-delected file-meta |
| void RemoveCurrentStats(FileMetaData* file_meta); |
| |
| void ComputeCompensatedSizes(); |
| |
| // Updates internal structures that keep track of compaction scores |
| // We use compaction scores to figure out which compaction to do next |
| // REQUIRES: db_mutex held!! |
| // TODO find a better way to pass compaction_options_fifo. |
| void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, |
| const MutableCFOptions& mutable_cf_options); |
| |
| // Estimate est_comp_needed_bytes_ |
| void EstimateCompactionBytesNeeded( |
| const MutableCFOptions& mutable_cf_options); |
| |
| // This computes files_marked_for_compaction_ and is called by |
| // ComputeCompactionScore() |
| void ComputeFilesMarkedForCompaction(); |
| |
| // Generate level_files_brief_ from files_ |
| void GenerateLevelFilesBrief(); |
| // Sort all files for this version based on their file size and |
| // record results in files_by_compaction_pri_. The largest files are listed |
| // first. |
| void UpdateFilesByCompactionPri(CompactionPri compaction_pri); |
| |
| void GenerateLevel0NonOverlapping(); |
| bool level0_non_overlapping() const { |
| return level0_non_overlapping_; |
| } |
| |
| int MaxInputLevel() const; |
| int MaxOutputLevel(bool allow_ingest_behind) const; |
| |
| // Return level number that has idx'th highest score |
| int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } |
| |
| // Return idx'th highest score |
| double CompactionScore(int idx) const { return compaction_score_[idx]; } |
| |
| void GetOverlappingInputs( |
| int level, const InternalKey* begin, // nullptr means before all keys |
| const InternalKey* end, // nullptr means after all keys |
| std::vector<FileMetaData*>* inputs, |
| int hint_index = -1, // index of overlap file |
| int* file_index = nullptr, // return index of overlap file |
| bool expand_range = true) // if set, returns files which overlap the |
| const; // range and overlap each other. If false, |
| // then just files intersecting the range |
| void GetCleanInputsWithinInterval( |
| int level, const InternalKey* begin, // nullptr means before all keys |
| const InternalKey* end, // nullptr means after all keys |
| std::vector<FileMetaData*>* inputs, |
| int hint_index = -1, // index of overlap file |
| int* file_index = nullptr) // return index of overlap file |
| const; |
| |
| void GetOverlappingInputsRangeBinarySearch( |
| int level, // level > 0 |
| const Slice& begin, // nullptr means before all keys |
| const Slice& end, // nullptr means after all keys |
| std::vector<FileMetaData*>* inputs, |
| int hint_index, // index of overlap file |
| int* file_index, // return index of overlap file |
| bool within_interval = false) // if set, force the inputs within interval |
| const; |
| |
| void ExtendFileRangeOverlappingInterval( |
| int level, |
| const Slice& begin, // nullptr means before all keys |
| const Slice& end, // nullptr means after all keys |
| unsigned int index, // start extending from this index |
| int* startIndex, // return the startIndex of input range |
| int* endIndex) // return the endIndex of input range |
| const; |
| |
| void ExtendFileRangeWithinInterval( |
| int level, |
| const Slice& begin, // nullptr means before all keys |
| const Slice& end, // nullptr means after all keys |
| unsigned int index, // start extending from this index |
| int* startIndex, // return the startIndex of input range |
| int* endIndex) // return the endIndex of input range |
| const; |
| |
| // Returns true iff some file in the specified level overlaps |
| // some part of [*smallest_user_key,*largest_user_key]. |
| // smallest_user_key==NULL represents a key smaller than all keys in the DB. |
| // largest_user_key==NULL represents a key largest than all keys in the DB. |
| bool OverlapInLevel(int level, const Slice* smallest_user_key, |
| const Slice* largest_user_key); |
| |
| // Returns true iff the first or last file in inputs contains |
| // an overlapping user key to the file "just outside" of it (i.e. |
| // just after the last file, or just before the first file) |
| // REQUIRES: "*inputs" is a sorted list of non-overlapping files |
| bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs, |
| int level); |
| |
| int num_levels() const { return num_levels_; } |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| int num_non_empty_levels() const { |
| assert(finalized_); |
| return num_non_empty_levels_; |
| } |
| |
| // REQUIRES: This version has been finalized. |
| // (CalculateBaseBytes() is called) |
| // This may or may not return number of level files. It is to keep backward |
| // compatible behavior in universal compaction. |
| int l0_delay_trigger_count() const { return l0_delay_trigger_count_; } |
| |
| void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; } |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| int NumLevelFiles(int level) const { |
| assert(finalized_); |
| return static_cast<int>(files_[level].size()); |
| } |
| |
| // Return the combined file size of all files at the specified level. |
| uint64_t NumLevelBytes(int level) const; |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| const std::vector<FileMetaData*>& LevelFiles(int level) const { |
| return files_[level]; |
| } |
| |
| const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { |
| assert(level < static_cast<int>(level_files_brief_.size())); |
| return level_files_brief_[level]; |
| } |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| const std::vector<int>& FilesByCompactionPri(int level) const { |
| assert(finalized_); |
| return files_by_compaction_pri_[level]; |
| } |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| // REQUIRES: DB mutex held during access |
| const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction() |
| const { |
| assert(finalized_); |
| return files_marked_for_compaction_; |
| } |
| |
| int base_level() const { return base_level_; } |
| |
| // REQUIRES: lock is held |
| // Set the index that is used to offset into files_by_compaction_pri_ to find |
| // the next compaction candidate file. |
| void SetNextCompactionIndex(int level, int index) { |
| next_file_to_compact_by_size_[level] = index; |
| } |
| |
| // REQUIRES: lock is held |
| int NextCompactionIndex(int level) const { |
| return next_file_to_compact_by_size_[level]; |
| } |
| |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| const FileIndexer& file_indexer() const { |
| assert(finalized_); |
| return file_indexer_; |
| } |
| |
| // Only the first few entries of files_by_compaction_pri_ are sorted. |
| // There is no need to sort all the files because it is likely |
| // that on a running system, we need to look at only the first |
| // few largest files because a new version is created every few |
| // seconds/minutes (because of concurrent compactions). |
| static const size_t kNumberFilesToSort = 50; |
| |
| // Return a human-readable short (single-line) summary of the number |
| // of files per level. Uses *scratch as backing store. |
| struct LevelSummaryStorage { |
| char buffer[1000]; |
| }; |
| struct FileSummaryStorage { |
| char buffer[3000]; |
| }; |
| const char* LevelSummary(LevelSummaryStorage* scratch) const; |
| // Return a human-readable short (single-line) summary of files |
| // in a specified level. Uses *scratch as backing store. |
| const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; |
| |
| // Return the maximum overlapping data (in bytes) at next level for any |
| // file at a level >= 1. |
| int64_t MaxNextLevelOverlappingBytes(); |
| |
| // Return a human readable string that describes this version's contents. |
| std::string DebugString(bool hex = false) const; |
| |
| uint64_t GetAverageValueSize() const { |
| if (accumulated_num_non_deletions_ == 0) { |
| return 0; |
| } |
| assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); |
| assert(accumulated_file_size_ > 0); |
| return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * |
| accumulated_file_size_ / |
| (accumulated_raw_key_size_ + accumulated_raw_value_size_); |
| } |
| |
| uint64_t GetEstimatedActiveKeys() const; |
| |
| double GetEstimatedCompressionRatioAtLevel(int level) const; |
| |
| // re-initializes the index that is used to offset into |
| // files_by_compaction_pri_ |
| // to find the next compaction candidate file. |
| void ResetNextCompactionIndex(int level) { |
| next_file_to_compact_by_size_[level] = 0; |
| } |
| |
| const InternalKeyComparator* InternalComparator() { |
| return internal_comparator_; |
| } |
| |
| // Returns maximum total bytes of data on a given level. |
| uint64_t MaxBytesForLevel(int level) const; |
| |
| // Must be called after any change to MutableCFOptions. |
| void CalculateBaseBytes(const ImmutableCFOptions& ioptions, |
| const MutableCFOptions& options); |
| |
| // Returns an estimate of the amount of live data in bytes. |
| uint64_t EstimateLiveDataSize() const; |
| |
| uint64_t estimated_compaction_needed_bytes() const { |
| return estimated_compaction_needed_bytes_; |
| } |
| |
| void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { |
| estimated_compaction_needed_bytes_ = v; |
| } |
| |
| bool force_consistency_checks() const { return force_consistency_checks_; } |
| |
| private: |
| const InternalKeyComparator* internal_comparator_; |
| const Comparator* user_comparator_; |
| int num_levels_; // Number of levels |
| int num_non_empty_levels_; // Number of levels. Any level larger than it |
| // is guaranteed to be empty. |
| // Per-level max bytes |
| std::vector<uint64_t> level_max_bytes_; |
| |
| // A short brief metadata of files per level |
| autovector<rocksdb::LevelFilesBrief> level_files_brief_; |
| FileIndexer file_indexer_; |
| Arena arena_; // Used to allocate space for file_levels_ |
| |
| CompactionStyle compaction_style_; |
| |
| // List of files per level, files in each level are arranged |
| // in increasing order of keys |
| std::vector<FileMetaData*>* files_; |
| |
| // Level that L0 data should be compacted to. All levels < base_level_ should |
| // be empty. -1 if it is not level-compaction so it's not applicable. |
| int base_level_; |
| |
| // A list for the same set of files that are stored in files_, |
| // but files in each level are now sorted based on file |
| // size. The file with the largest size is at the front. |
| // This vector stores the index of the file from files_. |
| std::vector<std::vector<int>> files_by_compaction_pri_; |
| |
| // If true, means that files in L0 have keys with non overlapping ranges |
| bool level0_non_overlapping_; |
| |
| // An index into files_by_compaction_pri_ that specifies the first |
| // file that is not yet compacted |
| std::vector<int> next_file_to_compact_by_size_; |
| |
| // Only the first few entries of files_by_compaction_pri_ are sorted. |
| // There is no need to sort all the files because it is likely |
| // that on a running system, we need to look at only the first |
| // few largest files because a new version is created every few |
| // seconds/minutes (because of concurrent compactions). |
| static const size_t number_of_files_to_sort_ = 50; |
| |
| // This vector contains list of files marked for compaction and also not |
| // currently being compacted. It is protected by DB mutex. It is calculated in |
| // ComputeCompactionScore() |
| autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_; |
| |
| // Level that should be compacted next and its compaction score. |
| // Score < 1 means compaction is not strictly needed. These fields |
| // are initialized by Finalize(). |
| // The most critical level to be compacted is listed first |
| // These are used to pick the best compaction level |
| std::vector<double> compaction_score_; |
| std::vector<int> compaction_level_; |
| int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop |
| // for number of L0 files. |
| |
| // the following are the sampled temporary stats. |
| // the current accumulated size of sampled files. |
| uint64_t accumulated_file_size_; |
| // the current accumulated size of all raw keys based on the sampled files. |
| uint64_t accumulated_raw_key_size_; |
| // the current accumulated size of all raw keys based on the sampled files. |
| uint64_t accumulated_raw_value_size_; |
| // total number of non-deletion entries |
| uint64_t accumulated_num_non_deletions_; |
| // total number of deletion entries |
| uint64_t accumulated_num_deletions_; |
| // current number of non_deletion entries |
| uint64_t current_num_non_deletions_; |
| // current number of delection entries |
| uint64_t current_num_deletions_; |
| // current number of file samples |
| uint64_t current_num_samples_; |
| // Estimated bytes needed to be compacted until all levels' size is down to |
| // target sizes. |
| uint64_t estimated_compaction_needed_bytes_; |
| |
| bool finalized_; |
| |
| // If set to true, we will run consistency checks even if RocksDB |
| // is compiled in release mode |
| bool force_consistency_checks_; |
| |
| friend class Version; |
| friend class VersionSet; |
| // No copying allowed |
| VersionStorageInfo(const VersionStorageInfo&) = delete; |
| void operator=(const VersionStorageInfo&) = delete; |
| }; |
| |
| class Version { |
| public: |
| // Append to *iters a sequence of iterators that will |
| // yield the contents of this Version when merged together. |
| // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| void AddIterators(const ReadOptions&, const EnvOptions& soptions, |
| MergeIteratorBuilder* merger_iter_builder, |
| RangeDelAggregator* range_del_agg); |
| |
| void AddIteratorsForLevel(const ReadOptions&, const EnvOptions& soptions, |
| MergeIteratorBuilder* merger_iter_builder, |
| int level, RangeDelAggregator* range_del_agg); |
| |
| void AddRangeDelIteratorsForLevel( |
| const ReadOptions& read_options, const EnvOptions& soptions, int level, |
| std::vector<InternalIterator*>* range_del_iters); |
| |
| // Lookup the value for key. If found, store it in *val and |
| // return OK. Else return a non-OK status. |
| // Uses *operands to store merge_operator operations to apply later. |
| // |
| // If the ReadOptions.read_tier is set to do a read-only fetch, then |
| // *value_found will be set to false if it cannot be determined whether |
| // this value exists without doing IO. |
| // |
| // If the key is Deleted, *status will be set to NotFound and |
| // *key_exists will be set to true. |
| // If no key was found, *status will be set to NotFound and |
| // *key_exists will be set to false. |
| // If seq is non-null, *seq will be set to the sequence number found |
| // for the key if a key was found. |
| // |
| // REQUIRES: lock is not held |
| void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, |
| Status* status, MergeContext* merge_context, |
| RangeDelAggregator* range_del_agg, bool* value_found = nullptr, |
| bool* key_exists = nullptr, SequenceNumber* seq = nullptr, |
| bool* is_blob = nullptr); |
| |
| // Loads some stats information from files. Call without mutex held. It needs |
| // to be called before applying the version to the version set. |
| void PrepareApply(const MutableCFOptions& mutable_cf_options, |
| bool update_stats); |
| |
| // Reference count management (so Versions do not disappear out from |
| // under live iterators) |
| void Ref(); |
| // Decrease reference count. Delete the object if no reference left |
| // and return true. Otherwise, return false. |
| bool Unref(); |
| |
| // Add all files listed in the current version to *live. |
| void AddLiveFiles(std::vector<FileDescriptor>* live); |
| |
| // Return a human readable string that describes this version's contents. |
| std::string DebugString(bool hex = false, bool print_stats = false) const; |
| |
| // Returns the version nuber of this version |
| uint64_t GetVersionNumber() const { return version_number_; } |
| |
| // REQUIRES: lock is held |
| // On success, "tp" will contains the table properties of the file |
| // specified in "file_meta". If the file name of "file_meta" is |
| // known ahread, passing it by a non-null "fname" can save a |
| // file-name conversion. |
| Status GetTableProperties(std::shared_ptr<const TableProperties>* tp, |
| const FileMetaData* file_meta, |
| const std::string* fname = nullptr) const; |
| |
| // REQUIRES: lock is held |
| // On success, *props will be populated with all SSTables' table properties. |
| // The keys of `props` are the sst file name, the values of `props` are the |
| // tables' propertis, represented as shared_ptr. |
| Status GetPropertiesOfAllTables(TablePropertiesCollection* props); |
| Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); |
| Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, |
| TablePropertiesCollection* props) const; |
| |
| // REQUIRES: lock is held |
| // On success, "tp" will contains the aggregated table property amoug |
| // the table properties of all sst files in this version. |
| Status GetAggregatedTableProperties( |
| std::shared_ptr<const TableProperties>* tp, int level = -1); |
| |
| uint64_t GetEstimatedActiveKeys() { |
| return storage_info_.GetEstimatedActiveKeys(); |
| } |
| |
| size_t GetMemoryUsageByTableReaders(); |
| |
| ColumnFamilyData* cfd() const { return cfd_; } |
| |
| // Return the next Version in the linked list. Used for debug only |
| Version* TEST_Next() const { |
| return next_; |
| } |
| |
| int TEST_refs() const { return refs_; } |
| |
| VersionStorageInfo* storage_info() { return &storage_info_; } |
| |
| VersionSet* version_set() { return vset_; } |
| |
| void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); |
| |
| private: |
| Env* env_; |
| friend class VersionSet; |
| |
| const InternalKeyComparator* internal_comparator() const { |
| return storage_info_.internal_comparator_; |
| } |
| const Comparator* user_comparator() const { |
| return storage_info_.user_comparator_; |
| } |
| |
| bool PrefixMayMatch(const ReadOptions& read_options, |
| InternalIterator* level_iter, |
| const Slice& internal_prefix) const; |
| |
| // Returns true if the filter blocks in the specified level will not be |
| // checked during read operations. In certain cases (trivial move or preload), |
| // the filter block may already be cached, but we still do not access it such |
| // that it eventually expires from the cache. |
| bool IsFilterSkipped(int level, bool is_file_last_in_level = false); |
| |
| // The helper function of UpdateAccumulatedStats, which may fill the missing |
| // fields of file_mata from its associated TableProperties. |
| // Returns true if it does initialize FileMetaData. |
| bool MaybeInitializeFileMetaData(FileMetaData* file_meta); |
| |
| // Update the accumulated stats associated with the current version. |
| // This accumulated stats will be used in compaction. |
| void UpdateAccumulatedStats(bool update_stats); |
| |
| // Sort all files for this version based on their file size and |
| // record results in files_by_compaction_pri_. The largest files are listed |
| // first. |
| void UpdateFilesByCompactionPri(); |
| |
| ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs |
| Logger* info_log_; |
| Statistics* db_statistics_; |
| TableCache* table_cache_; |
| const MergeOperator* merge_operator_; |
| |
| VersionStorageInfo storage_info_; |
| VersionSet* vset_; // VersionSet to which this Version belongs |
| Version* next_; // Next version in linked list |
| Version* prev_; // Previous version in linked list |
| int refs_; // Number of live refs to this version |
| |
| // A version number that uniquely represents this version. This is |
| // used for debugging and logging purposes only. |
| uint64_t version_number_; |
| |
| Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); |
| |
| ~Version(); |
| |
| // No copying allowed |
| Version(const Version&); |
| void operator=(const Version&); |
| }; |
| |
| class VersionSet { |
| public: |
| VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, |
| const EnvOptions& env_options, Cache* table_cache, |
| WriteBufferManager* write_buffer_manager, |
| WriteController* write_controller); |
| ~VersionSet(); |
| |
| // Apply *edit to the current version to form a new descriptor that |
| // is both saved to persistent state and installed as the new |
| // current version. Will release *mu while actually writing to the file. |
| // column_family_options has to be set if edit is column family add |
| // REQUIRES: *mu is held on entry. |
| // REQUIRES: no other thread concurrently calls LogAndApply() |
| Status LogAndApply( |
| ColumnFamilyData* column_family_data, |
| const MutableCFOptions& mutable_cf_options, VersionEdit* edit, |
| InstrumentedMutex* mu, Directory* db_directory = nullptr, |
| bool new_descriptor_log = false, |
| const ColumnFamilyOptions* column_family_options = nullptr) { |
| autovector<VersionEdit*> edit_list; |
| edit_list.push_back(edit); |
| return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu, |
| db_directory, new_descriptor_log, column_family_options); |
| } |
| // The batch version. If edit_list.size() > 1, caller must ensure that |
| // no edit in the list column family add or drop |
| Status LogAndApply( |
| ColumnFamilyData* column_family_data, |
| const MutableCFOptions& mutable_cf_options, |
| const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu, |
| Directory* db_directory = nullptr, bool new_descriptor_log = false, |
| const ColumnFamilyOptions* column_family_options = nullptr); |
| |
| // Recover the last saved descriptor from persistent storage. |
| // If read_only == true, Recover() will not complain if some column families |
| // are not opened |
| Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, |
| bool read_only = false); |
| |
| // Reads a manifest file and returns a list of column families in |
| // column_families. |
| static Status ListColumnFamilies(std::vector<std::string>* column_families, |
| const std::string& dbname, Env* env); |
| |
| #ifndef ROCKSDB_LITE |
| // Try to reduce the number of levels. This call is valid when |
| // only one level from the new max level to the old |
| // max level containing files. |
| // The call is static, since number of levels is immutable during |
| // the lifetime of a RocksDB instance. It reduces number of levels |
| // in a DB by applying changes to manifest. |
| // For example, a db currently has 7 levels [0-6], and a call to |
| // to reduce to 5 [0-4] can only be executed when only one level |
| // among [4-6] contains files. |
| static Status ReduceNumberOfLevels(const std::string& dbname, |
| const Options* options, |
| const EnvOptions& env_options, |
| int new_levels); |
| |
| // printf contents (for debugging) |
| Status DumpManifest(Options& options, std::string& manifestFileName, |
| bool verbose, bool hex = false, bool json = false); |
| |
| #endif // ROCKSDB_LITE |
| |
| // Return the current manifest file number |
| uint64_t manifest_file_number() const { return manifest_file_number_; } |
| |
| uint64_t options_file_number() const { return options_file_number_; } |
| |
| uint64_t pending_manifest_file_number() const { |
| return pending_manifest_file_number_; |
| } |
| |
| uint64_t current_next_file_number() const { return next_file_number_.load(); } |
| |
| // Allocate and return a new file number |
| uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } |
| |
| // Return the last sequence number. |
| uint64_t LastSequence() const { |
| return last_sequence_.load(std::memory_order_acquire); |
| } |
| |
| // Note: memory_order_acquire must be sufficient. |
| uint64_t LastToBeWrittenSequence() const { |
| return last_to_be_written_sequence_.load(std::memory_order_seq_cst); |
| } |
| |
| // Set the last sequence number to s. |
| void SetLastSequence(uint64_t s) { |
| assert(s >= last_sequence_); |
| // Last visible seqeunce must always be less than last written seq |
| assert(!db_options_->concurrent_prepare || |
| s <= last_to_be_written_sequence_); |
| last_sequence_.store(s, std::memory_order_release); |
| } |
| |
| // Note: memory_order_release must be sufficient |
| void SetLastToBeWrittenSequence(uint64_t s) { |
| assert(s >= last_to_be_written_sequence_); |
| last_to_be_written_sequence_.store(s, std::memory_order_seq_cst); |
| } |
| |
| // Note: memory_order_release must be sufficient |
| uint64_t FetchAddLastToBeWrittenSequence(uint64_t s) { |
| return last_to_be_written_sequence_.fetch_add(s, std::memory_order_seq_cst); |
| } |
| |
| // Mark the specified file number as used. |
| // REQUIRED: this is only called during single-threaded recovery |
| void MarkFileNumberUsedDuringRecovery(uint64_t number); |
| |
| // Return the log file number for the log file that is currently |
| // being compacted, or zero if there is no such log file. |
| uint64_t prev_log_number() const { return prev_log_number_; } |
| |
| // Returns the minimum log number such that all |
| // log numbers less than or equal to it can be deleted |
| uint64_t MinLogNumber() const { |
| uint64_t min_log_num = std::numeric_limits<uint64_t>::max(); |
| for (auto cfd : *column_family_set_) { |
| // It's safe to ignore dropped column families here: |
| // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. |
| if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { |
| min_log_num = cfd->GetLogNumber(); |
| } |
| } |
| return min_log_num; |
| } |
| |
| // Create an iterator that reads over the compaction inputs for "*c". |
| // The caller should delete the iterator when no longer needed. |
| InternalIterator* MakeInputIterator(const Compaction* c, |
| RangeDelAggregator* range_del_agg); |
| |
| // Add all files listed in any live version to *live. |
| void AddLiveFiles(std::vector<FileDescriptor>* live_list); |
| |
| // Return the approximate size of data to be scanned for range [start, end) |
| // in levels [start_level, end_level). If end_level == 0 it will search |
| // through all non-empty levels |
| uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, |
| int start_level = 0, int end_level = -1); |
| |
| // Return the size of the current manifest file |
| uint64_t manifest_file_size() const { return manifest_file_size_; } |
| |
| // verify that the files that we started with for a compaction |
| // still exist in the current version and in the same original level. |
| // This ensures that a concurrent compaction did not erroneously |
| // pick the same files to compact. |
| bool VerifyCompactionFileConsistency(Compaction* c); |
| |
| Status GetMetadataForFile(uint64_t number, int* filelevel, |
| FileMetaData** metadata, ColumnFamilyData** cfd); |
| |
| // This function doesn't support leveldb SST filenames |
| void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata); |
| |
| void GetObsoleteFiles(std::vector<FileMetaData*>* files, |
| std::vector<std::string>* manifest_filenames, |
| uint64_t min_pending_output); |
| |
| ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } |
| const EnvOptions& env_options() { return env_options_; } |
| |
| static uint64_t GetNumLiveVersions(Version* dummy_versions); |
| |
| static uint64_t GetTotalSstFilesSize(Version* dummy_versions); |
| |
| private: |
| struct ManifestWriter; |
| |
| friend class Version; |
| friend class DBImpl; |
| |
| struct LogReporter : public log::Reader::Reporter { |
| Status* status; |
| virtual void Corruption(size_t bytes, const Status& s) override { |
| if (this->status->ok()) *this->status = s; |
| } |
| }; |
| |
| // ApproximateSize helper |
| uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, |
| const Slice& start, const Slice& end); |
| |
| uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, |
| const Slice& key); |
| |
| // Save current contents to *log |
| Status WriteSnapshot(log::Writer* log); |
| |
| void AppendVersion(ColumnFamilyData* column_family_data, Version* v); |
| |
| ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
| VersionEdit* edit); |
| |
| std::unique_ptr<ColumnFamilySet> column_family_set_; |
| |
| Env* const env_; |
| const std::string dbname_; |
| const ImmutableDBOptions* const db_options_; |
| std::atomic<uint64_t> next_file_number_; |
| uint64_t manifest_file_number_; |
| uint64_t options_file_number_; |
| uint64_t pending_manifest_file_number_; |
| // The last seq visible to reads |
| std::atomic<uint64_t> last_sequence_; |
| // The last seq with which a writer has written/will write. |
| std::atomic<uint64_t> last_to_be_written_sequence_; |
| uint64_t prev_log_number_; // 0 or backing store for memtable being compacted |
| |
| // Opened lazily |
| unique_ptr<log::Writer> descriptor_log_; |
| |
| // generates a increasing version number for every new version |
| uint64_t current_version_number_; |
| |
| // Queue of writers to the manifest file |
| std::deque<ManifestWriter*> manifest_writers_; |
| |
| // Current size of manifest file |
| uint64_t manifest_file_size_; |
| |
| std::vector<FileMetaData*> obsolete_files_; |
| std::vector<std::string> obsolete_manifests_; |
| |
| // env options for all reads and writes except compactions |
| const EnvOptions& env_options_; |
| |
| // env options used for compactions. This is a copy of |
| // env_options_ but with readaheads set to readahead_compactions_. |
| const EnvOptions env_options_compactions_; |
| |
| // No copying allowed |
| VersionSet(const VersionSet&); |
| void operator=(const VersionSet&); |
| |
| void LogAndApplyCFHelper(VersionEdit* edit); |
| void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, |
| VersionEdit* edit, InstrumentedMutex* mu); |
| }; |
| |
| } // namespace rocksdb |