| // This file is derived from cache.cc in the LevelDB project: |
| // |
| // Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // |
| // ------------------------------------------------------------ |
// This file implements a cache based on the memkind library (http://memkind.github.io/memkind/).
// This library makes it easy to program against persistent memory hardware by exposing an API
// which parallels malloc/free, but allocates from persistent memory instead of DRAM.
//
// We use this API to implement a cache which treats persistent memory or
// non-volatile memory as if it were a larger, cheaper bank of volatile memory. We
// currently make no use of its persistence properties.
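//
// For illustration, here is a minimal sketch of the upstream memkind pmem API
// this file relies on (not compiled as part of this file; see the typedefs
// below for the exact signatures we load via dlsym):
//
//   memkind* kind = nullptr;
//   int err = memkind_create_pmem("/pmem", 16 * 1024 * 1024, &kind);  // 0 on success
//   if (err == 0) {
//     void* p = memkind_malloc(kind, 128);                  // allocate from the pmem pool
//     size_t usable = memkind_malloc_usable_size(kind, p);  // actual usable allocation size
//     memkind_free(kind, p);                                // return the memory to the pool
//     memkind_destroy_kind(kind);                           // tear down the pool
//   }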
| // |
// Currently, we only store key/value data in NVM. All other data structures, such as the
// ShardedLRUCache instances, hash table, etc., are kept in DRAM. The assumption is that
// the ratio of data stored to bookkeeping overhead is quite high.
| |
| #include "kudu/util/nvm_cache.h" |
| |
| #include <dlfcn.h> |
| |
| #include <cstdint> |
| #include <cstring> |
| #include <iostream> |
| #include <memory> |
| #include <mutex> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include <gflags/gflags.h> |
| #include <glog/logging.h> |
| |
| #include "kudu/gutil/atomic_refcount.h" |
| #include "kudu/gutil/atomicops.h" |
| #include "kudu/gutil/bits.h" |
| #include "kudu/gutil/dynamic_annotations.h" |
| #include "kudu/gutil/hash/city.h" |
| #include "kudu/gutil/macros.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/gutil/ref_counted.h" |
| #include "kudu/gutil/strings/substitute.h" |
| #include "kudu/gutil/sysinfo.h" |
| #include "kudu/util/cache.h" |
| #include "kudu/util/cache_metrics.h" |
| #include "kudu/util/flag_tags.h" |
| #include "kudu/util/locks.h" |
| #include "kudu/util/metrics.h" |
| #include "kudu/util/scoped_cleanup.h" |
| #include "kudu/util/slice.h" |
| #include "kudu/util/status.h" |
| #include "kudu/util/test_util_prod.h" |
| |
| #ifndef MEMKIND_PMEM_MIN_SIZE |
| #define MEMKIND_PMEM_MIN_SIZE (1024 * 1024 * 16) // Taken from memkind 1.9.0. |
| #endif |
| |
| struct memkind; |
| |
| // Useful in tests that require accurate cache capacity accounting. |
| DECLARE_bool(cache_force_single_shard); |
| |
| DEFINE_string(nvm_cache_path, "/pmem", |
| "The path at which the NVM cache will try to allocate its memory. " |
| "This can be a tmpfs or ramfs for testing purposes."); |
| |
| DEFINE_bool(nvm_cache_simulate_allocation_failure, false, |
| "If true, the NVM cache will inject failures in calls to memkind_malloc " |
| "for testing."); |
| TAG_FLAG(nvm_cache_simulate_allocation_failure, unsafe); |
| |
DEFINE_double(nvm_cache_usage_ratio, 1.25,
              "A ratio that adjusts the usage accounting of the NVM cache. The charge of "
              "an item in the NVM cache is the result of memkind_malloc_usable_size "
              "multiplied by this ratio.");
| TAG_FLAG(nvm_cache_usage_ratio, advanced); |
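// For example, with the default ratio of 1.25, an entry whose
// memkind_malloc_usable_size is 4096 bytes is charged 4096 * 1.25 = 5120 bytes
// against its shard's capacity.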
| |
| using std::string; |
| using std::unique_ptr; |
| using std::vector; |
| using strings::Substitute; |
| |
| static bool ValidateNVMCacheUsageRatio(const char* flagname, double value) { |
| if (value < 1.0) { |
    LOG(ERROR) << Substitute("$0 must be greater than or equal to 1.0; value $1 is invalid.",
                             flagname, value);
| return false; |
| } |
| if (value < 1.25) { |
    LOG(WARNING) << Substitute("The value of $0 is $1, which is less than the recommended "
                               "value (1.25). Due to memkind fragmentation, an improper "
                               "ratio can cause allocation failures before capacity-based "
                               "evictions are triggered. Consider raising --nvm_cache_usage_ratio.",
                               flagname, value);
| } |
| return true; |
| } |
| DEFINE_validator(nvm_cache_usage_ratio, &ValidateNVMCacheUsageRatio); |
| |
| namespace kudu { |
| |
| namespace { |
| |
| // Taken together, these typedefs and this macro make it easy to call a |
| // memkind function: |
| // |
| // CALL_MEMKIND(memkind_malloc, vmp_, size); |
| typedef int (*memkind_create_pmem)(const char*, size_t, memkind**); |
| typedef int (*memkind_destroy_kind)(memkind*); |
| typedef void* (*memkind_malloc)(memkind*, size_t); |
| typedef size_t (*memkind_malloc_usable_size)(memkind*, void*); |
| typedef void (*memkind_free)(memkind*, void*); |
| #define CALL_MEMKIND(func_name, ...) ((func_name)g_##func_name)(__VA_ARGS__) |
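// For example, CALL_MEMKIND(memkind_malloc, vmp_, size) expands to
//   ((memkind_malloc)g_memkind_malloc)(vmp_, size)
// i.e. the typedef name doubles as the function-pointer cast applied to the
// raw void* symbol loaded by dlsym().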
| |
| // Function pointers into memkind; set by InitMemkindOps(). |
| void* g_memkind_create_pmem; |
| void* g_memkind_destroy_kind; |
| void* g_memkind_malloc; |
| void* g_memkind_malloc_usable_size; |
| void* g_memkind_free; |
| |
| // After InitMemkindOps() is called, true if memkind is available and safe |
| // to use, false otherwise. |
| bool g_memkind_available; |
| |
| std::once_flag g_memkind_ops_flag; |
| |
| // Try to dlsym() a particular symbol from 'handle', storing the result in 'ptr' |
| // if successful. |
| Status TryDlsym(void* handle, const char* sym, void** ptr) { |
| dlerror(); // Need to clear any existing error first. |
| void* ret = dlsym(handle, sym); |
| char* error = dlerror(); |
| if (error) { |
| return Status::NotSupported(Substitute("could not dlsym $0", sym), error); |
| } |
| *ptr = ret; |
| return Status::OK(); |
| } |
| |
| // Try to dlopen() memkind and set up all the function pointers we need from it. |
| // |
| // Note: in terms of protecting ourselves against changes in memkind, we'll |
// notice (and fail) if a symbol is missing, but not if its signature has
| // changed or if there's some subtle behavioral change. A scan of the memkind |
| // repo suggests that backwards compatibility is enforced: symbols are only |
| // added and behavioral changes are effected via the introduction of new symbols. |
| void InitMemkindOps() { |
| g_memkind_available = false; |
| |
| // Use RTLD_NOW so that if any of memkind's dependencies aren't satisfied |
| // (e.g. libnuma is too old and is missing symbols), we'll know up front |
| // instead of during cache operations. |
| void* memkind_lib = dlopen("libmemkind.so.0", RTLD_NOW); |
| if (!memkind_lib) { |
| LOG(WARNING) << "could not dlopen: " << dlerror(); |
| return; |
| } |
| auto cleanup = MakeScopedCleanup([&]() { |
| dlclose(memkind_lib); |
| }); |
| |
| #define DLSYM_OR_RETURN(func_name, handle) do { \ |
| const Status _s = TryDlsym(memkind_lib, func_name, handle); \ |
| if (!_s.ok()) { \ |
| LOG(WARNING) << _s.ToString(); \ |
| return; \ |
| } \ |
| } while (0) |
| |
| DLSYM_OR_RETURN("memkind_create_pmem", &g_memkind_create_pmem); |
| DLSYM_OR_RETURN("memkind_destroy_kind", &g_memkind_destroy_kind); |
| DLSYM_OR_RETURN("memkind_malloc", &g_memkind_malloc); |
| DLSYM_OR_RETURN("memkind_malloc_usable_size", &g_memkind_malloc_usable_size); |
| DLSYM_OR_RETURN("memkind_free", &g_memkind_free); |
| #undef DLSYM_OR_RETURN |
| |
| g_memkind_available = true; |
| |
| // Need to keep the memkind library handle open so our function pointers |
| // remain loaded in memory. |
| cleanup.cancel(); |
| } |
| |
| typedef simple_spinlock MutexType; |
| |
| // LRU cache implementation |
| |
// An entry is a variable-length structure allocated from the memkind (NVM)
// pool. Entries are kept in a circular doubly linked list ordered by access time.
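//
// Reference counting, as implemented below: an entry is inserted with
// refs == 2 (one reference held by the cache itself, one by the handle
// returned to the caller). Release() drops the caller's reference; eviction,
// Erase(), or replacement by a newer entry drops the cache's reference. The
// entry is freed, and its eviction callback invoked, once the count drops to
// zero.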
| struct LRUHandle { |
| Cache::EvictionCallback* eviction_callback; |
| LRUHandle* next_hash; |
| LRUHandle* next; |
| LRUHandle* prev; |
| size_t charge; // TODO(opt): Only allow uint32_t? |
| uint32_t key_length; |
| uint32_t val_length; |
| Atomic32 refs; |
| uint32_t hash; // Hash of key(); used for fast sharding and comparisons |
| uint8_t* kv_data; |
| |
| Slice key() const { |
| return Slice(kv_data, key_length); |
| } |
| |
| Slice value() const { |
| return Slice(&kv_data[key_length], val_length); |
| } |
| |
| uint8_t* val_ptr() { |
| return &kv_data[key_length]; |
| } |
| }; |
| |
| // We provide our own simple hash table since it removes a whole bunch |
| // of porting hacks and is also faster than some of the built-in hash |
| // table implementations in some of the compiler/runtime combinations |
// we have tested. E.g., readrandom speeds up by ~5% over g++
// 4.4.3's builtin hashtable.
| class HandleTable { |
| public: |
| HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } |
| ~HandleTable() { delete[] list_; } |
| |
| LRUHandle* Lookup(const Slice& key, uint32_t hash) { |
| return *FindPointer(key, hash); |
| } |
| |
  // Inserts 'h' into the table. If an entry with the same key already
  // exists, it is unlinked and returned so the caller can release it;
  // otherwise returns nullptr.
  LRUHandle* Insert(LRUHandle* h) {
| LRUHandle** ptr = FindPointer(h->key(), h->hash); |
| LRUHandle* old = *ptr; |
| h->next_hash = (old == nullptr ? nullptr : old->next_hash); |
| *ptr = h; |
| if (old == nullptr) { |
| ++elems_; |
| if (elems_ > length_) { |
| // Since each cache entry is fairly large, we aim for a small |
| // average linked list length (<= 1). |
| Resize(); |
| } |
| } |
| return old; |
| } |
| |
| LRUHandle* Remove(const Slice& key, uint32_t hash) { |
| LRUHandle** ptr = FindPointer(key, hash); |
| LRUHandle* result = *ptr; |
| if (result != nullptr) { |
| *ptr = result->next_hash; |
| --elems_; |
| } |
| return result; |
| } |
| |
| private: |
| // The table consists of an array of buckets where each bucket is |
| // a linked list of cache entries that hash into the bucket. |
| uint32_t length_; |
| uint32_t elems_; |
| LRUHandle** list_; |
| |
| // Return a pointer to slot that points to a cache entry that |
| // matches key/hash. If there is no such cache entry, return a |
| // pointer to the trailing slot in the corresponding linked list. |
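  // Returning the address of the slot (rather than the entry itself) lets
  // Insert() and Remove() splice the chain in place, e.g. '*ptr = h' or
  // '*ptr = (*ptr)->next_hash', without re-walking it.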
| LRUHandle** FindPointer(const Slice& key, uint32_t hash) { |
| LRUHandle** ptr = &list_[hash & (length_ - 1)]; |
| while (*ptr != nullptr && |
| ((*ptr)->hash != hash || key != (*ptr)->key())) { |
| ptr = &(*ptr)->next_hash; |
| } |
| return ptr; |
| } |
| |
| void Resize() { |
| uint32_t new_length = 16; |
| while (new_length < elems_ * 1.5) { |
| new_length *= 2; |
| } |
| LRUHandle** new_list = new LRUHandle*[new_length]; |
| memset(new_list, 0, sizeof(new_list[0]) * new_length); |
| uint32_t count = 0; |
| for (uint32_t i = 0; i < length_; i++) { |
| LRUHandle* h = list_[i]; |
| while (h != nullptr) { |
| LRUHandle* next = h->next_hash; |
| uint32_t hash = h->hash; |
| LRUHandle** ptr = &new_list[hash & (new_length - 1)]; |
| h->next_hash = *ptr; |
| *ptr = h; |
| h = next; |
| count++; |
| } |
| } |
| DCHECK_EQ(elems_, count); |
| delete[] list_; |
| list_ = new_list; |
| length_ = new_length; |
| } |
| }; |
| |
| // A single shard of sharded cache. |
| class NvmLRUCache { |
| public: |
| explicit NvmLRUCache(memkind *vmp); |
| ~NvmLRUCache(); |
| |
  // Separate from the constructor so the caller can easily make an array of NvmLRUCache instances.
| void SetCapacity(size_t capacity) { capacity_ = capacity; } |
| |
| void SetMetrics(CacheMetrics* metrics) { metrics_ = metrics; } |
| |
| Cache::Handle* Insert(LRUHandle* e, Cache::EvictionCallback* eviction_callback); |
| |
| // Like Cache::Lookup, but with an extra "hash" parameter. |
| Cache::Handle* Lookup(const Slice& key, uint32_t hash, bool caching); |
| void Release(Cache::Handle* handle); |
| void Erase(const Slice& key, uint32_t hash); |
| size_t Invalidate(const Cache::InvalidationControl& ctl); |
| void* Allocate(size_t size); |
| |
| private: |
| void NvmLRU_Remove(LRUHandle* e); |
| void NvmLRU_Append(LRUHandle* e); |
  // Decrements the reference count by 1.
  // Returns true if this was the last reference.
| bool Unref(LRUHandle* e); |
| void FreeEntry(LRUHandle* e); |
| |
| // Evict the LRU item in the cache, adding it to the linked list |
| // pointed to by 'to_remove_head'. |
| void EvictOldestUnlocked(LRUHandle** to_remove_head); |
| |
| // Free all of the entries in the linked list that has to_free_head |
| // as its head. |
| void FreeLRUEntries(LRUHandle* to_free_head); |
| |
| // Wrapper around memkind_malloc which injects failures based on a flag. |
| void* MemkindMalloc(size_t size); |
| |
  // Total capacity (in bytes) of this shard; set via SetCapacity() before use.
| size_t capacity_; |
| |
| // mutex_ protects the following state. |
| MutexType mutex_; |
| size_t usage_; |
| |
| // Dummy head of LRU list. |
| // lru.prev is newest entry, lru.next is oldest entry. |
| LRUHandle lru_; |
| |
| HandleTable table_; |
| |
| memkind* vmp_; |
| |
| CacheMetrics* metrics_; |
| }; |
| |
| NvmLRUCache::NvmLRUCache(memkind* vmp) |
| : usage_(0), |
| vmp_(vmp), |
| metrics_(nullptr) { |
| // Make empty circular linked list |
| lru_.next = &lru_; |
| lru_.prev = &lru_; |
| } |
| |
| NvmLRUCache::~NvmLRUCache() { |
| for (LRUHandle* e = lru_.next; e != &lru_; ) { |
| LRUHandle* next = e->next; |
| DCHECK_EQ(e->refs, 1); // Error if caller has an unreleased handle |
| if (Unref(e)) { |
| FreeEntry(e); |
| } |
| e = next; |
| } |
| } |
| |
| void* NvmLRUCache::MemkindMalloc(size_t size) { |
| if (PREDICT_FALSE(FLAGS_nvm_cache_simulate_allocation_failure)) { |
| return nullptr; |
| } |
| return CALL_MEMKIND(memkind_malloc, vmp_, size); |
| } |
| |
| bool NvmLRUCache::Unref(LRUHandle* e) { |
| DCHECK_GT(ANNOTATE_UNPROTECTED_READ(e->refs), 0); |
| return !base::RefCountDec(&e->refs); |
| } |
| |
| void NvmLRUCache::FreeEntry(LRUHandle* e) { |
| DCHECK_EQ(ANNOTATE_UNPROTECTED_READ(e->refs), 0); |
| if (e->eviction_callback) { |
| e->eviction_callback->EvictedEntry(e->key(), e->value()); |
| } |
| if (PREDICT_TRUE(metrics_)) { |
| metrics_->cache_usage->DecrementBy(e->charge); |
| metrics_->evictions->Increment(); |
| } |
| CALL_MEMKIND(memkind_free, vmp_, e); |
| } |
| |
| // Allocate NVM memory. |
| void* NvmLRUCache::Allocate(size_t size) { |
| return MemkindMalloc(size); |
| } |
| |
| void NvmLRUCache::NvmLRU_Remove(LRUHandle* e) { |
| e->next->prev = e->prev; |
| e->prev->next = e->next; |
| usage_ -= e->charge; |
| } |
| |
| void NvmLRUCache::NvmLRU_Append(LRUHandle* e) { |
| // Make "e" newest entry by inserting just before lru_ |
| e->next = &lru_; |
| e->prev = lru_.prev; |
| e->prev->next = e; |
| e->next->prev = e; |
| usage_ += e->charge; |
| } |
| |
| Cache::Handle* NvmLRUCache::Lookup(const Slice& key, uint32_t hash, bool caching) { |
| LRUHandle* e; |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| e = table_.Lookup(key, hash); |
| if (e != nullptr) { |
      // The entry is present: bump its reference count and move it to the
      // most-recently-used end of the recency list.
| base::RefCountInc(&e->refs); |
| NvmLRU_Remove(e); |
| NvmLRU_Append(e); |
| } |
| } |
| |
| // Do the metrics outside of the lock. |
| if (metrics_) { |
| metrics_->lookups->Increment(); |
| bool was_hit = (e != nullptr); |
| if (was_hit) { |
| if (caching) { |
| metrics_->cache_hits_caching->Increment(); |
| } else { |
| metrics_->cache_hits->Increment(); |
| } |
| } else { |
| if (caching) { |
| metrics_->cache_misses_caching->Increment(); |
| } else { |
| metrics_->cache_misses->Increment(); |
| } |
| } |
| } |
| |
| return reinterpret_cast<Cache::Handle*>(e); |
| } |
| |
| void NvmLRUCache::Release(Cache::Handle* handle) { |
| LRUHandle* e = reinterpret_cast<LRUHandle*>(handle); |
| bool last_reference = Unref(e); |
| if (last_reference) { |
| FreeEntry(e); |
| } |
| } |
| |
| void NvmLRUCache::EvictOldestUnlocked(LRUHandle** to_remove_head) { |
| LRUHandle* old = lru_.next; |
| NvmLRU_Remove(old); |
| table_.Remove(old->key(), old->hash); |
| if (Unref(old)) { |
| old->next = *to_remove_head; |
| *to_remove_head = old; |
| } |
| } |
| |
| void NvmLRUCache::FreeLRUEntries(LRUHandle* to_free_head) { |
| while (to_free_head != nullptr) { |
| LRUHandle* next = to_free_head->next; |
| FreeEntry(to_free_head); |
| to_free_head = next; |
| } |
| } |
| |
| Cache::Handle* NvmLRUCache::Insert(LRUHandle* e, |
| Cache::EvictionCallback* eviction_callback) { |
| DCHECK(e); |
| LRUHandle* to_remove_head = nullptr; |
| |
| e->refs = 2; // One from LRUCache, one for the returned handle |
| e->eviction_callback = eviction_callback; |
| if (PREDICT_TRUE(metrics_)) { |
| metrics_->cache_usage->IncrementBy(e->charge); |
| metrics_->inserts->Increment(); |
| } |
| |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| |
| NvmLRU_Append(e); |
| |
| LRUHandle* old = table_.Insert(e); |
| if (old != nullptr) { |
| NvmLRU_Remove(old); |
| if (Unref(old)) { |
| old->next = to_remove_head; |
| to_remove_head = old; |
| } |
| } |
| |
| while (usage_ > capacity_ && lru_.next != &lru_) { |
| EvictOldestUnlocked(&to_remove_head); |
| } |
| } |
| |
  // Free the evicted and replaced entries here, outside of the mutex, for
  // performance reasons.
| FreeLRUEntries(to_remove_head); |
| |
| return reinterpret_cast<Cache::Handle*>(e); |
| } |
| |
| void NvmLRUCache::Erase(const Slice& key, uint32_t hash) { |
| LRUHandle* e; |
| bool last_reference = false; |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| e = table_.Remove(key, hash); |
| if (e != nullptr) { |
| NvmLRU_Remove(e); |
| last_reference = Unref(e); |
| } |
| } |
  // Mutex not held here.
  // last_reference will only be true if e != nullptr.
| if (last_reference) { |
| FreeEntry(e); |
| } |
| } |
| |
| size_t NvmLRUCache::Invalidate(const Cache::InvalidationControl& ctl) { |
| size_t invalid_entry_count = 0; |
| size_t valid_entry_count = 0; |
| LRUHandle* to_remove_head = nullptr; |
| |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| |
    // lru_.next is the oldest entry in the recency list.
| LRUHandle* h = lru_.next; |
| while (h != nullptr && h != &lru_ && |
| ctl.iteration_func(valid_entry_count, invalid_entry_count)) { |
| if (ctl.validity_func(h->key(), h->value())) { |
| // Continue iterating over the list. |
| h = h->next; |
| ++valid_entry_count; |
| continue; |
| } |
| // Copy the handle slated for removal. |
| LRUHandle* h_to_remove = h; |
| // Prepare for next iteration of the cycle. |
| h = h->next; |
| |
| NvmLRU_Remove(h_to_remove); |
| table_.Remove(h_to_remove->key(), h_to_remove->hash); |
| if (Unref(h_to_remove)) { |
| h_to_remove->next = to_remove_head; |
| to_remove_head = h_to_remove; |
| } |
| ++invalid_entry_count; |
| } |
| } |
| |
| FreeLRUEntries(to_remove_head); |
| |
| return invalid_entry_count; |
| } |
| |
| // Determine the number of bits of the hash that should be used to determine |
| // the cache shard. This, in turn, determines the number of shards. |
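// For example, on a 16-CPU machine this yields Log2Ceiling(16) = 4, i.e.
// 1 << 4 = 16 shards; with --cache_force_single_shard it returns 0 (one shard).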
| int DetermineShardBits() { |
| int bits = PREDICT_FALSE(FLAGS_cache_force_single_shard) ? |
| 0 : Bits::Log2Ceiling(base::NumCPUs()); |
| VLOG(1) << "Will use " << (1 << bits) << " shards for LRU cache."; |
| return bits; |
| } |
| |
| class ShardedLRUCache : public Cache { |
| private: |
| unique_ptr<CacheMetrics> metrics_; |
| vector<unique_ptr<NvmLRUCache>> shards_; |
| |
| // Number of bits of hash used to determine the shard. |
| const int shard_bits_; |
| |
| memkind* vmp_; |
| |
| static inline uint32_t HashSlice(const Slice& s) { |
| return util_hash::CityHash64( |
| reinterpret_cast<const char *>(s.data()), s.size()); |
| } |
| |
| uint32_t Shard(uint32_t hash) { |
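    // The top shard_bits_ bits of the hash select the shard; e.g. with
    // 8 shards (shard_bits_ == 3) the hash is shifted right by 29 bits,
    // yielding a shard index in [0, 8).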
    // Widen to uint64 before shifting, or else when there is only one shard
    // (shard_bits_ == 0) we would try to shift a uint32_t by 32 bits, which
    // is undefined behavior.
| return static_cast<uint64_t>(hash) >> (32 - shard_bits_); |
| } |
| |
| public: |
| explicit ShardedLRUCache(size_t capacity, const string& /*id*/, memkind* vmp) |
| : shard_bits_(DetermineShardBits()), |
| vmp_(vmp) { |
| int num_shards = 1 << shard_bits_; |
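    // Round up so that the sum of the per-shard capacities is at least
    // 'capacity'; e.g. a 1 GiB cache split across 16 shards gives each
    // shard 64 MiB.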
| const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; |
| for (int s = 0; s < num_shards; s++) { |
| unique_ptr<NvmLRUCache> shard(new NvmLRUCache(vmp_)); |
| shard->SetCapacity(per_shard); |
| shards_.emplace_back(std::move(shard)); |
| } |
| } |
| |
| virtual ~ShardedLRUCache() { |
| shards_.clear(); |
| // Per the note at the top of this file, our cache is entirely volatile. |
| // Hence, when the cache is destructed, we delete the underlying |
| // memkind pool. |
| CALL_MEMKIND(memkind_destroy_kind, vmp_); |
| } |
| |
| virtual UniqueHandle Insert(UniquePendingHandle handle, |
| Cache::EvictionCallback* eviction_callback) OVERRIDE { |
| LRUHandle* h = reinterpret_cast<LRUHandle*>(DCHECK_NOTNULL(handle.release())); |
| return UniqueHandle( |
| shards_[Shard(h->hash)]->Insert(h, eviction_callback), |
| Cache::HandleDeleter(this)); |
| } |
| virtual UniqueHandle Lookup(const Slice& key, CacheBehavior caching) OVERRIDE { |
| const uint32_t hash = HashSlice(key); |
| return UniqueHandle( |
| shards_[Shard(hash)]->Lookup(key, hash, caching == EXPECT_IN_CACHE), |
| Cache::HandleDeleter(this)); |
| } |
| virtual void Release(Handle* handle) OVERRIDE { |
| LRUHandle* h = reinterpret_cast<LRUHandle*>(handle); |
| shards_[Shard(h->hash)]->Release(handle); |
| } |
| virtual void Erase(const Slice& key) OVERRIDE { |
| const uint32_t hash = HashSlice(key); |
| shards_[Shard(hash)]->Erase(key, hash); |
| } |
| virtual Slice Value(const UniqueHandle& handle) const OVERRIDE { |
| return reinterpret_cast<const LRUHandle*>(handle.get())->value(); |
| } |
| virtual uint8_t* MutableValue(UniquePendingHandle* handle) OVERRIDE { |
| return reinterpret_cast<LRUHandle*>(handle->get())->val_ptr(); |
| } |
| |
| virtual void SetMetrics(unique_ptr<CacheMetrics> metrics, |
| Cache::ExistingMetricsPolicy metrics_policy) OVERRIDE { |
| if (metrics_ && metrics_policy == Cache::ExistingMetricsPolicy::kKeep) { |
| CHECK(IsGTest()) << "Metrics should only be set once per Cache"; |
| return; |
| } |
| metrics_ = std::move(metrics); |
| for (const auto& shard : shards_) { |
| shard->SetMetrics(metrics_.get()); |
| } |
| } |
| virtual UniquePendingHandle Allocate(Slice key, int val_len, int charge) OVERRIDE { |
| int key_len = key.size(); |
| DCHECK_GE(key_len, 0); |
| DCHECK_GE(val_len, 0); |
| |
| // Try allocating from each of the shards -- if memkind is tight, |
| // this can cause eviction, so we might have better luck in different |
| // shards. |
| for (const auto& shard : shards_) { |
| UniquePendingHandle ph(static_cast<PendingHandle*>( |
| shard->Allocate(sizeof(LRUHandle) + key_len + val_len)), |
| Cache::PendingHandleDeleter(this)); |
| if (ph) { |
| LRUHandle* handle = reinterpret_cast<LRUHandle*>(ph.get()); |
| uint8_t* buf = reinterpret_cast<uint8_t*>(ph.get()); |
| handle->kv_data = &buf[sizeof(LRUHandle)]; |
| handle->val_length = val_len; |
| handle->key_length = key_len; |
        // Multiply the result of memkind_malloc_usable_size by a ratio
        // because memkind fragmentation is not otherwise accounted for.
| handle->charge = (charge == kAutomaticCharge) ? |
| CALL_MEMKIND(memkind_malloc_usable_size, vmp_, buf) * FLAGS_nvm_cache_usage_ratio : |
| charge; |
| handle->hash = HashSlice(key); |
| memcpy(handle->kv_data, key.data(), key.size()); |
| return ph; |
| } |
| } |
| // TODO(unknown): increment a metric here on allocation failure. |
| return UniquePendingHandle(nullptr, Cache::PendingHandleDeleter(this)); |
| } |
| |
| virtual void Free(PendingHandle* ph) OVERRIDE { |
| CALL_MEMKIND(memkind_free, vmp_, ph); |
| } |
| |
| size_t Invalidate(const InvalidationControl& ctl) override { |
| size_t invalidated_count = 0; |
| for (const auto& shard: shards_) { |
| invalidated_count += shard->Invalidate(ctl); |
| } |
| return invalidated_count; |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| template<> |
| Cache* NewCache<Cache::EvictionPolicy::LRU, |
| Cache::MemoryType::NVM>(size_t capacity, const std::string& id) { |
| std::call_once(g_memkind_ops_flag, InitMemkindOps); |
| |
| // TODO(adar): we should plumb the failure up the call stack, but at the time |
| // of writing the NVM cache is only usable by the block cache, and its use of |
| // the singleton pattern prevents the surfacing of errors. |
| CHECK(g_memkind_available) << "Memkind not available!"; |
| |
| // memkind_create_pmem() will fail if the capacity is too small, but with |
| // an inscrutable error. So, we'll check ourselves. |
| CHECK_GE(capacity, MEMKIND_PMEM_MIN_SIZE) |
| << "configured capacity " << capacity << " bytes is less than " |
| << "the minimum capacity for an NVM cache: " << MEMKIND_PMEM_MIN_SIZE; |
| |
  // With the default usage ratio of 1.25, this logs that 80% of the
  // configured capacity is effectively available.
  LOG(INFO) << 1 / FLAGS_nvm_cache_usage_ratio * 100
            << "% of the NVM block cache capacity is available for use, after "
            << "accounting for memkind fragmentation.";
| |
| memkind* vmp; |
| int err = CALL_MEMKIND(memkind_create_pmem, FLAGS_nvm_cache_path.c_str(), capacity, &vmp); |
| // If we cannot create the cache pool we should not retry. |
| PLOG_IF(FATAL, err) << "Could not initialize NVM cache library in path " |
| << FLAGS_nvm_cache_path.c_str(); |
| |
| return new ShardedLRUCache(capacity, id, vmp); |
| } |
| |
| bool CanUseNVMCacheForTests() { |
| std::call_once(g_memkind_ops_flag, InitMemkindOps); |
| return g_memkind_available; |
| } |
| |
| } // namespace kudu |