| // This file is derived from cache.cc in the LevelDB project: |
| // |
| // Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // |
| // ------------------------------------------------------------ |
// This file implements a cache based on the memkind library (http://memkind.github.io/memkind/).
// This library makes it easy to program against persistent memory hardware by exposing an API
// which parallels malloc/free, but allocates from persistent memory instead of DRAM.
//
// We use this API to implement a cache which treats persistent memory or
// non-volatile memory as if it were a larger, cheaper bank of volatile memory. We
// currently make no use of its persistence properties.
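//
// For illustration, here is a minimal sketch of the upstream memkind pmem API
// this file relies on (not compiled as part of this file; see the typedefs
// below for the exact signatures we load via dlsym):
//
//   memkind* kind = nullptr;
//   int err = memkind_create_pmem("/pmem", 16 * 1024 * 1024, &kind);  // 0 on success
//   if (err == 0) {
//     void* p = memkind_malloc(kind, 128);                  // allocate from the pmem pool
//     size_t usable = memkind_malloc_usable_size(kind, p);  // actual usable allocation size
//     memkind_free(kind, p);                                // return the memory to the pool
//     memkind_destroy_kind(kind);                           // tear down the pool
//   }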
| // |
// Currently, we only store key/value data in NVM. All other data structures, such as the
// ShardedLRUCache instances, hash table, etc., are kept in DRAM. The assumption is that
// the ratio of data stored to bookkeeping overhead is quite high.
| |
| #include "kudu/util/nvm_cache.h" |
| |
| #include <dlfcn.h> |
| |
| #include <cstdint> |
| #include <cstring> |
| #include <iostream> |
| #include <memory> |
| #include <mutex> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include <gflags/gflags.h> |
| #include <glog/logging.h> |
| |
| #include "kudu/gutil/atomic_refcount.h" |
| #include "kudu/gutil/atomicops.h" |
| #include "kudu/gutil/bits.h" |
| #include "kudu/gutil/dynamic_annotations.h" |
| #include "kudu/gutil/hash/city.h" |
| #include "kudu/gutil/macros.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/gutil/ref_counted.h" |
| #include "kudu/gutil/strings/substitute.h" |
| #include "kudu/gutil/sysinfo.h" |
| #include "kudu/util/cache.h" |
| #include "kudu/util/cache_metrics.h" |
| #include "kudu/util/flag_tags.h" |
| #include "kudu/util/locks.h" |
| #include "kudu/util/metrics.h" |
| #include "kudu/util/scoped_cleanup.h" |
| #include "kudu/util/slice.h" |
| #include "kudu/util/status.h" |
| #include "kudu/util/test_util_prod.h" |
| |
| #ifndef MEMKIND_PMEM_MIN_SIZE |
| #define MEMKIND_PMEM_MIN_SIZE (1024 * 1024 * 16) // Taken from memkind 1.9.0. |
| #endif |
| |
| struct memkind; |
| |
| // Useful in tests that require accurate cache capacity accounting. |
| DECLARE_bool(cache_force_single_shard); |
| |
| DEFINE_string(nvm_cache_path, "/pmem", |
| "The path at which the NVM cache will try to allocate its memory. " |
| "This can be a tmpfs or ramfs for testing purposes."); |
| |
| DEFINE_bool(nvm_cache_simulate_allocation_failure, false, |
| "If true, the NVM cache will inject failures in calls to memkind_malloc " |
| "for testing."); |
| TAG_FLAG(nvm_cache_simulate_allocation_failure, unsafe); |
| |
DEFINE_double(nvm_cache_usage_ratio, 1.25,
              "A ratio that adjusts the usage accounting of the NVM cache. The charge of "
              "an item in the NVM cache is the result of memkind_malloc_usable_size "
              "multiplied by this ratio.");
| TAG_FLAG(nvm_cache_usage_ratio, advanced); |
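// For example, with the default ratio of 1.25, an entry whose
// memkind_malloc_usable_size is 4096 bytes is charged 4096 * 1.25 = 5120 bytes
// against its shard's capacity.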
| |
| using std::string; |
| using std::unique_ptr; |
| using std::vector; |
| using strings::Substitute; |
| |
| static bool ValidateNVMCacheUsageRatio(const char* flagname, double value) { |
| if (value < 1.0) { |
    LOG(ERROR) << Substitute("$0 must be greater than or equal to 1.0; value $1 is invalid.",
                             flagname, value);
| return false; |
| } |
| if (value < 1.25) { |
    LOG(WARNING) << Substitute("The value of $0 is $1, which is less than the recommended "
                               "value (1.25). Due to memkind fragmentation, an improper "
                               "ratio can cause allocation failures before capacity-based "
                               "evictions are triggered. Consider raising --nvm_cache_usage_ratio.",
                               flagname, value);
| } |
| return true; |
| } |
| DEFINE_validator(nvm_cache_usage_ratio, &ValidateNVMCacheUsageRatio); |
| |
| namespace kudu { |
| |
| namespace { |
| |
| // Taken together, these typedefs and this macro make it easy to call a |
| // memkind function: |
| // |
| // CALL_MEMKIND(memkind_malloc, vmp_, size); |
| typedef int (*memkind_create_pmem)(const char*, size_t, memkind**); |
| typedef int (*memkind_destroy_kind)(memkind*); |
| typedef void* (*memkind_malloc)(memkind*, size_t); |
| typedef size_t (*memkind_malloc_usable_size)(memkind*, void*); |
| typedef void (*memkind_free)(memkind*, void*); |
| #define CALL_MEMKIND(func_name, ...) ((func_name)g_##func_name)(__VA_ARGS__) |
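// For example, CALL_MEMKIND(memkind_malloc, vmp_, size) expands to
//   ((memkind_malloc)g_memkind_malloc)(vmp_, size)
// i.e. the typedef name doubles as the function-pointer cast applied to the
// raw void* symbol loaded by dlsym().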
| |
| // Function pointers into memkind; set by InitMemkindOps(). |
| void* g_memkind_create_pmem; |
| void* g_memkind_destroy_kind; |
| void* g_memkind_malloc; |
| void* g_memkind_malloc_usable_size; |
| void* g_memkind_free; |
| |
| // After InitMemkindOps() is called, true if memkind is available and safe |
| // to use, false otherwise. |
| bool g_memkind_available; |
| |
| std::once_flag g_memkind_ops_flag; |
| |
| // Try to dlsym() a particular symbol from 'handle', storing the result in 'ptr' |
| // if successful. |
| Status TryDlsym(void* handle, const char* sym, void** ptr) { |
| dlerror(); // Need to clear any existing error first. |
| void* ret = dlsym(handle, sym); |
| char* error = dlerror(); |
| if (error) { |
| return Status::NotSupported(Substitute("could not dlsym $0", sym), error); |
| } |
| *ptr = ret; |
| return Status::OK(); |
| } |
| |
| // Try to dlopen() memkind and set up all the function pointers we need from it. |
| // |
| // Note: in terms of protecting ourselves against changes in memkind, we'll |
// notice (and fail) if a symbol is missing, but not if its signature has
| // changed or if there's some subtle behavioral change. A scan of the memkind |
| // repo suggests that backwards compatibility is enforced: symbols are only |
| // added and behavioral changes are effected via the introduction of new symbols. |
| void InitMemkindOps() { |
| g_memkind_available = false; |
| |
| // Use RTLD_NOW so that if any of memkind's dependencies aren't satisfied |
| // (e.g. libnuma is too old and is missing symbols), we'll know up front |
| // instead of during cache operations. |
| void* memkind_lib = dlopen("libmemkind.so.0", RTLD_NOW); |
| if (!memkind_lib) { |
| LOG(WARNING) << "could not dlopen: " << dlerror(); |
| return; |
| } |
| auto cleanup = MakeScopedCleanup([&]() { |
| dlclose(memkind_lib); |
| }); |
| |
| #define DLSYM_OR_RETURN(func_name, handle) do { \ |
| const Status _s = TryDlsym(memkind_lib, func_name, handle); \ |
| if (!_s.ok()) { \ |
| LOG(WARNING) << _s.ToString(); \ |
| return; \ |
| } \ |
| } while (0) |
| |
| DLSYM_OR_RETURN("memkind_create_pmem", &g_memkind_create_pmem); |
| DLSYM_OR_RETURN("memkind_destroy_kind", &g_memkind_destroy_kind); |
| DLSYM_OR_RETURN("memkind_malloc", &g_memkind_malloc); |
| DLSYM_OR_RETURN("memkind_malloc_usable_size", &g_memkind_malloc_usable_size); |
| DLSYM_OR_RETURN("memkind_free", &g_memkind_free); |
| #undef DLSYM_OR_RETURN |
| |
| g_memkind_available = true; |
| |
| // Need to keep the memkind library handle open so our function pointers |
| // remain loaded in memory. |
| cleanup.cancel(); |
| } |
| |
| typedef simple_spinlock MutexType; |
| |
| // LRU cache implementation |
| |
// An entry is a variable-length structure allocated from the memkind (NVM)
// pool. Entries are kept in a circular doubly linked list ordered by access time.
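//
// Reference counting, as implemented below: an entry is inserted with
// refs == 2 (one reference held by the cache itself, one by the handle
// returned to the caller). Release() drops the caller's reference; eviction,
// Erase(), or replacement by a newer entry drops the cache's reference. The
// entry is freed, and its eviction callback invoked, once the count drops to
// zero.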
| struct LRUHandle { |
| Cache::EvictionCallback* eviction_callback; |
| LRUHandle* next_hash; |
| LRUHandle* next; |
| LRUHandle* prev; |
| size_t charge; // TODO(opt): Only allow uint32_t? |
| uint32_t key_length; |
| uint32_t val_length; |
| Atomic32 refs; |
| uint32_t hash; // Hash of key(); used for fast sharding and comparisons |
| uint8_t* kv_data; |
| |
| Slice key() const { |
| return Slice(kv_data, key_length); |
| } |
| |
| Slice value() const { |
| return Slice(&kv_data[key_length], val_length); |
| } |
| |
| uint8_t* val_ptr() { |
| return &kv_data[key_length]; |
| } |
| }; |
| |
| // We provide our own simple hash table since it removes a whole bunch |
| // of porting hacks and is also faster than some of the built-in hash |
| // table implementations in some of the compiler/runtime combinations |
// we have tested. E.g., readrandom speeds up by ~5% over g++
// 4.4.3's builtin hashtable.
| class HandleTable { |
| public: |
| HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } |
| ~HandleTable() { delete[] list_; } |
| |
| LRUHandle* Lookup(const Slice& key, uint32_t hash) { |
| return *FindPointer(key, hash); |
| } |
| |
  // Inserts 'h' into the table. If an entry with the same key already
  // exists, it is unlinked and returned so the caller can release it;
  // otherwise returns nullptr.
  LRUHandle* Insert(LRUHandle* h) {
| LRUHandle** ptr = FindPointer(h->key(), h->hash); |
| LRUHandle* old = *ptr; |
| h->next_hash = (old == nullptr ? nullptr : old->next_hash); |
| *ptr = h; |
| if (old == nullptr) { |
| ++elems_; |
| if (elems_ > length_) { |
| // Since each cache entry is fairly large, we aim for a small |
| // average linked list length (<= 1). |
| Resize(); |
| } |
| } |
| return old; |
| } |
| |
| LRUHandle* Remove(const Slice& key, uint32_t hash) { |
| LRUHandle** ptr = FindPointer(key, hash); |
| LRUHandle* result = *ptr; |
| if (result != nullptr) { |
| *ptr = result->next_hash; |
| --elems_; |
| } |
| return result; |
| } |
| |
| private: |
| // The table consists of an array of buckets where each bucket is |
| // a linked list of cache entries that hash into the bucket. |
| uint32_t length_; |
| uint32_t elems_; |
| LRUHandle** list_; |
| |
| // Return a pointer to slot that points to a cache entry that |
| // matches key/hash. If there is no such cache entry, return a |
| // pointer to the trailing slot in the corresponding linked list. |
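  // Returning the address of the slot (rather than the entry itself) lets
  // Insert() and Remove() splice the chain in place, e.g. '*ptr = h' or
  // '*ptr = (*ptr)->next_hash', without re-walking it.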
| LRUHandle** FindPointer(const Slice& key, uint32_t hash) { |
| LRUHandle** ptr = &list_[hash & (length_ - 1)]; |
| while (*ptr != nullptr && |
| ((*ptr)->hash != hash || key != (*ptr)->key())) { |
| ptr = &(*ptr)->next_hash; |
| } |
| return ptr; |
| } |
| |
| void Resize() { |
| uint32_t new_length = 16; |
| while (new_length < elems_ * 1.5) { |
| new_length *= 2; |
| } |
| LRUHandle** new_list = new LRUHandle*[new_length]; |
| memset(new_list, 0, sizeof(new_list[0]) * new_length); |
| uint32_t count = 0; |
| for (uint32_t i = 0; i < length_; i++) { |
| LRUHandle* h = list_[i]; |
| while (h != nullptr) { |
| LRUHandle* next = h->next_hash; |
| uint32_t hash = h->hash; |
| LRUHandle** ptr = &new_list[hash & (new_length - 1)]; |
| h->next_hash = *ptr; |
| *ptr = h; |
| h = next; |
| count++; |
| } |
| } |
| DCHECK_EQ(elems_, count); |
| delete[] list_; |
| list_ = new_list; |
| length_ = new_length; |
| } |
| }; |
| |
| // A single shard of sharded cache. |
| class NvmLRUCache { |
| public: |
| explicit NvmLRUCache(memkind *vmp); |
| ~NvmLRUCache(); |
| |
  // Separate from the constructor so the caller can easily make an array of NvmLRUCache instances.
| void SetCapacity(size_t capacity) { capacity_ = capacity; } |
| |
| void SetMetrics(CacheMetrics* metrics) { metrics_ = metrics; } |
| |
| Cache::Handle* Insert(LRUHandle* e, Cache::EvictionCallback* eviction_callback); |
| |
| // Like Cache::Lookup, but with an extra "hash" parameter. |
| Cache::Handle* Lookup(const Slice& key, uint32_t hash, bool caching); |
| void Release(Cache::Handle* handle); |
| void Erase(const Slice& key, uint32_t hash); |
| size_t Invalidate(const Cache::InvalidationControl& ctl); |
| void* Allocate(size_t size); |
| |
| private: |
| void NvmLRU_Remove(LRUHandle* e); |
| void NvmLRU_Append(LRUHandle* e); |
  // Decrements the reference count by 1.
  // Returns true if this was the last reference.
| bool Unref(LRUHandle* e); |
| void FreeEntry(LRUHandle* e); |
| |
| // Evict the LRU item in the cache, adding it to the linked list |
| // pointed to by 'to_remove_head'. |
| void EvictOldestUnlocked(LRUHandle** to_remove_head); |
| |
| // Free all of the entries in the linked list that has to_free_head |
| // as its head. |
| void FreeLRUEntries(LRUHandle* to_free_head); |
| |
| // Wrapper around memkind_malloc which injects failures based on a flag. |
| void* MemkindMalloc(size_t size); |
| |
  // Total capacity (in bytes) of this shard; set via SetCapacity() before use.
| size_t capacity_; |
| |
| // mutex_ protects the following state. |
| MutexType mutex_; |
| size_t usage_; |
| |
| // Dummy head of LRU list. |
| // lru.prev is newest entry, lru.next is oldest entry. |
| LRUHandle lru_; |
| |
| HandleTable table_; |
| |
| memkind* vmp_; |
| |
| CacheMetrics* metrics_; |
| }; |
| |
| NvmLRUCache::NvmLRUCache(memkind* vmp) |
| : usage_(0), |
| vmp_(vmp), |
| metrics_(nullptr) { |
| // Make empty circular linked list |
| lru_.next = &lru_; |
| lru_.prev = &lru_; |
| } |
| |
| NvmLRUCache::~NvmLRUCache() { |
| for (LRUHandle* e = lru_.next; e != &lru_; ) { |
| LRUHandle* next = e->next; |
| DCHECK_EQ(e->refs, 1); // Error if caller has an unreleased handle |
| if (Unref(e)) { |
| FreeEntry(e); |
| } |
| e = next; |
| } |
| } |
| |
| void* NvmLRUCache::MemkindMalloc(size_t size) { |
| if (PREDICT_FALSE(FLAGS_nvm_cache_simulate_allocation_failure)) { |
| return nullptr; |
| } |
| return CALL_MEMKIND(memkind_malloc, vmp_, size); |
| } |
| |
| bool NvmLRUCache::Unref(LRUHandle* e) { |
| DCHECK_GT(ANNOTATE_UNPROTECTED_READ(e->refs), 0); |
| return !base::RefCountDec(&e->refs); |
| } |
| |
| void NvmLRUCache::FreeEntry(LRUHandle* e) { |
| DCHECK_EQ(ANNOTATE_UNPROTECTED_READ(e->refs), 0); |
| if (e->eviction_callback) { |
| e->eviction_callback->EvictedEntry(e->key(), e->value()); |
| } |
| if (PREDICT_TRUE(metrics_)) { |
| metrics_->cache_usage->DecrementBy(e->charge); |
| metrics_->evictions->Increment(); |
| } |
| CALL_MEMKIND(memkind_free, vmp_, e); |
| } |
| |
| // Allocate NVM memory. |
| void* NvmLRUCache::Allocate(size_t size) { |
| return MemkindMalloc(size); |
| } |
| |
| void NvmLRUCache::NvmLRU_Remove(LRUHandle* e) { |
| e->next->prev = e->prev; |
| e->prev->next = e->next; |
| usage_ -= e->charge; |
| } |
| |
| void NvmLRUCache::NvmLRU_Append(LRUHandle* e) { |
| // Make "e" newest entry by inserting just before lru_ |
| e->next = &lru_; |
| e->prev = lru_.prev; |
| e->prev->next = e; |
| e->next->prev = e; |
| usage_ += e->charge; |
| } |
| |
| Cache::Handle* NvmLRUCache::Lookup(const Slice& key, uint32_t hash, bool caching) { |
| LRUHandle* e; |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| e = table_.Lookup(key, hash); |
| if (e != nullptr) { |
      // The entry is present: bump its reference count and move it to the
      // most-recently-used end of the recency list.
| base::RefCountInc(&e->refs); |
| NvmLRU_Remove(e); |
| NvmLRU_Append(e); |
| } |
| } |
| |
| // Do the metrics outside of the lock. |
| if (metrics_) { |
| metrics_->lookups->Increment(); |
| bool was_hit = (e != nullptr); |
| if (was_hit) { |
| if (caching) { |
| metrics_->cache_hits_caching->Increment(); |
| } else { |
| metrics_->cache_hits->Increment(); |
| } |
| } else { |
| if (caching) { |
| metrics_->cache_misses_caching->Increment(); |
| } else { |
| metrics_->cache_misses->Increment(); |
| } |
| } |
| } |
| |
| return reinterpret_cast<Cache::Handle*>(e); |
| } |
| |
| void NvmLRUCache::Release(Cache::Handle* handle) { |
| LRUHandle* e = reinterpret_cast<LRUHandle*>(handle); |
| bool last_reference = Unref(e); |
| if (last_reference) { |
| FreeEntry(e); |
| } |
| } |
| |
| void NvmLRUCache::EvictOldestUnlocked(LRUHandle** to_remove_head) { |
| LRUHandle* old = lru_.next; |
| NvmLRU_Remove(old); |
| table_.Remove(old->key(), old->hash); |
| if (Unref(old)) { |
| old->next = *to_remove_head; |
| *to_remove_head = old; |
| } |
| } |
| |
| void NvmLRUCache::FreeLRUEntries(LRUHandle* to_free_head) { |
| while (to_free_head != nullptr) { |
| LRUHandle* next = to_free_head->next; |
| FreeEntry(to_free_head); |
| to_free_head = next; |
| } |
| } |
| |
| Cache::Handle* NvmLRUCache::Insert(LRUHandle* e, |
| Cache::EvictionCallback* eviction_callback) { |
| DCHECK(e); |
| LRUHandle* to_remove_head = nullptr; |
| |
| e->refs = 2; // One from LRUCache, one for the returned handle |
| e->eviction_callback = eviction_callback; |
| if (PREDICT_TRUE(metrics_)) { |
| metrics_->cache_usage->IncrementBy(e->charge); |
| metrics_->inserts->Increment(); |
| } |
| |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| |
| NvmLRU_Append(e); |
| |
| LRUHandle* old = table_.Insert(e); |
| if (old != nullptr) { |
| NvmLRU_Remove(old); |
| if (Unref(old)) { |
| old->next = to_remove_head; |
| to_remove_head = old; |
| } |
| } |
| |
| while (usage_ > capacity_ && lru_.next != &lru_) { |
| EvictOldestUnlocked(&to_remove_head); |
| } |
| } |
| |
  // Free the evicted and replaced entries here, outside of the mutex, for
  // performance reasons.
| FreeLRUEntries(to_remove_head); |
| |
| return reinterpret_cast<Cache::Handle*>(e); |
| } |
| |
| void NvmLRUCache::Erase(const Slice& key, uint32_t hash) { |
| LRUHandle* e; |
| bool last_reference = false; |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| e = table_.Remove(key, hash); |
| if (e != nullptr) { |
| NvmLRU_Remove(e); |
| last_reference = Unref(e); |
| } |
| } |
  // Mutex not held here.
  // last_reference will only be true if e != nullptr.
| if (last_reference) { |
| FreeEntry(e); |
| } |
| } |
| |
| size_t NvmLRUCache::Invalidate(const Cache::InvalidationControl& ctl) { |
| size_t invalid_entry_count = 0; |
| size_t valid_entry_count = 0; |
| LRUHandle* to_remove_head = nullptr; |
| |
| { |
| std::lock_guard<MutexType> l(mutex_); |
| |
    // lru_.next is the oldest entry in the recency list.
| LRUHandle* h = lru_.next; |
| while (h != nullptr && h != &lru_ && |
| ctl.iteration_func(valid_entry_count, invalid_entry_count)) { |
| if (ctl.validity_func(h->key(), h->value())) { |
| // Continue iterating over the list. |
| h = h->next; |
| ++valid_entry_count; |
| continue; |
| } |
| // Copy the handle slated for removal. |
| LRUHandle* h_to_remove = h; |
| // Prepare for next iteration of the cycle. |
| h = h->next; |
| |
| NvmLRU_Remove(h_to_remove); |
| table_.Remove(h_to_remove->key(), h_to_remove->hash); |
| if (Unref(h_to_remove)) { |
| h_to_remove->next = to_remove_head; |
| to_remove_head = h_to_remove; |
| } |
| ++invalid_entry_count; |
| } |
| } |
| |
| FreeLRUEntries(to_remove_head); |
| |
| return invalid_entry_count; |
| } |
| |
| // Determine the number of bits of the hash that should be used to determine |
| // the cache shard. This, in turn, determines the number of shards. |
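// For example, on a 16-CPU machine this yields Log2Ceiling(16) = 4, i.e.
// 1 << 4 = 16 shards; with --cache_force_single_shard it returns 0 (one shard).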
| int DetermineShardBits() { |
| int bits = PREDICT_FALSE(FLAGS_cache_force_single_shard) ? |
| 0 : Bits::Log2Ceiling(base::NumCPUs()); |
| VLOG(1) << "Will use " << (1 << bits) << " shards for LRU cache."; |
| return bits; |
| } |
| |
| class ShardedLRUCache : public Cache { |
| private: |
| unique_ptr<CacheMetrics> metrics_; |
| vector<unique_ptr<NvmLRUCache>> shards_; |
| |
| // Number of bits of hash used to determine the shard. |
| const int shard_bits_; |
| |
| memkind* vmp_; |
| |
| static inline uint32_t HashSlice(const Slice& s) { |
| return util_hash::CityHash64( |
| reinterpret_cast<const char *>(s.data()), s.size()); |
| } |
| |
| uint32_t Shard(uint32_t hash) { |
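    // The top shard_bits_ bits of the hash select the shard; e.g. with
    // 8 shards (shard_bits_ == 3) the hash is shifted right by 29 bits,
    // yielding a shard index in [0, 8).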
    // Widen to uint64 before shifting, or else when there is only one shard
    // (shard_bits_ == 0) we would try to shift a uint32_t by 32 bits, which
    // is undefined behavior.
| return static_cast<uint64_t>(hash) >> (32 - shard_bits_); |
| } |
| |
| public: |
| explicit ShardedLRUCache(size_t capacity, const string& /*id*/, memkind* vmp) |
| : shard_bits_(DetermineShardBits()), |
| vmp_(vmp) { |
| int num_shards = 1 << shard_bits_; |
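    // Round up so that the sum of the per-shard capacities is at least
    // 'capacity'; e.g. a 1 GiB cache split across 16 shards gives each
    // shard 64 MiB.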
| const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; |
| for (int s = 0; s < num_shards; s++) { |
| unique_ptr<NvmLRUCache> shard(new NvmLRUCache(vmp_)); |
| shard->SetCapacity(per_shard); |
| shards_.emplace_back(std::move(shard)); |
| } |
| } |
| |
| virtual ~ShardedLRUCache() { |
| shards_.clear(); |
| // Per the note at the top of this file, our cache is entirely volatile. |
| // Hence, when the cache is destructed, we delete the underlying |
| // memkind pool. |
| CALL_MEMKIND(memkind_destroy_kind, vmp_); |
| } |
| |
| virtual UniqueHandle Insert(UniquePendingHandle handle, |
| Cache::EvictionCallback* eviction_callback) OVERRIDE { |
| LRUHandle* h = reinterpret_cast<LRUHandle*>(DCHECK_NOTNULL(handle.release())); |
| return UniqueHandle( |
| shards_[Shard(h->hash)]->Insert(h, eviction_callback), |
| Cache::HandleDeleter(this)); |
| } |
| virtual UniqueHandle Lookup(const Slice& key, CacheBehavior caching) OVERRIDE { |
| const uint32_t hash = HashSlice(key); |
| return UniqueHandle( |
| shards_[Shard(hash)]->Lookup(key, hash, caching == EXPECT_IN_CACHE), |
| Cache::HandleDeleter(this)); |
| } |
| virtual void Release(Handle* handle) OVERRIDE { |
| LRUHandle* h = reinterpret_cast<LRUHandle*>(handle); |
| shards_[Shard(h->hash)]->Release(handle); |
| } |
| virtual void Erase(const Slice& key) OVERRIDE { |
| const uint32_t hash = HashSlice(key); |
| shards_[Shard(hash)]->Erase(key, hash); |
| } |
| virtual Slice Value(const UniqueHandle& handle) const OVERRIDE { |
| return reinterpret_cast<const LRUHandle*>(handle.get())->value(); |
| } |
| virtual uint8_t* MutableValue(UniquePendingHandle* handle) OVERRIDE { |
| return reinterpret_cast<LRUHandle*>(handle->get())->val_ptr(); |
| } |
| |
| virtual void SetMetrics(unique_ptr<CacheMetrics> metrics, |
| Cache::ExistingMetricsPolicy metrics_policy) OVERRIDE { |
| if (metrics_ && metrics_policy == Cache::ExistingMetricsPolicy::kKeep) { |
| CHECK(IsGTest()) << "Metrics should only be set once per Cache"; |
| return; |
| } |
| metrics_ = std::move(metrics); |
| for (const auto& shard : shards_) { |
| shard->SetMetrics(metrics_.get()); |
| } |
| } |
| virtual UniquePendingHandle Allocate(Slice key, int val_len, int charge) OVERRIDE { |
| int key_len = key.size(); |
| DCHECK_GE(key_len, 0); |
| DCHECK_GE(val_len, 0); |
| |
| // Try allocating from each of the shards -- if memkind is tight, |
| // this can cause eviction, so we might have better luck in different |
| // shards. |
| for (const auto& shard : shards_) { |
| UniquePendingHandle ph(static_cast<PendingHandle*>( |
| shard->Allocate(sizeof(LRUHandle) + key_len + val_len)), |
| Cache::PendingHandleDeleter(this)); |
| if (ph) { |
| LRUHandle* handle = reinterpret_cast<LRUHandle*>(ph.get()); |
| uint8_t* buf = reinterpret_cast<uint8_t*>(ph.get()); |
| handle->kv_data = &buf[sizeof(LRUHandle)]; |
| handle->val_length = val_len; |
| handle->key_length = key_len; |
        // Multiply the result of memkind_malloc_usable_size by a ratio
        // because memkind fragmentation is not otherwise accounted for.
| handle->charge = (charge == kAutomaticCharge) ? |
| CALL_MEMKIND(memkind_malloc_usable_size, vmp_, buf) * FLAGS_nvm_cache_usage_ratio : |
| charge; |
| handle->hash = HashSlice(key); |
| memcpy(handle->kv_data, key.data(), key.size()); |
| return ph; |
| } |
| } |
| // TODO(unknown): increment a metric here on allocation failure. |
| return UniquePendingHandle(nullptr, Cache::PendingHandleDeleter(this)); |
| } |
| |
| virtual void Free(PendingHandle* ph) OVERRIDE { |
| CALL_MEMKIND(memkind_free, vmp_, ph); |
| } |
| |
| size_t Invalidate(const InvalidationControl& ctl) override { |
| size_t invalidated_count = 0; |
| for (const auto& shard: shards_) { |
| invalidated_count += shard->Invalidate(ctl); |
| } |
| return invalidated_count; |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| template<> |
| Cache* NewCache<Cache::EvictionPolicy::LRU, |
| Cache::MemoryType::NVM>(size_t capacity, const std::string& id) { |
| std::call_once(g_memkind_ops_flag, InitMemkindOps); |
| |
| // TODO(adar): we should plumb the failure up the call stack, but at the time |
| // of writing the NVM cache is only usable by the block cache, and its use of |
| // the singleton pattern prevents the surfacing of errors. |
| CHECK(g_memkind_available) << "Memkind not available!"; |
| |
| // memkind_create_pmem() will fail if the capacity is too small, but with |
| // an inscrutable error. So, we'll check ourselves. |
| CHECK_GE(capacity, MEMKIND_PMEM_MIN_SIZE) |
| << "configured capacity " << capacity << " bytes is less than " |
| << "the minimum capacity for an NVM cache: " << MEMKIND_PMEM_MIN_SIZE; |
| |
  // With the default usage ratio of 1.25, this logs that 80% of the
  // configured capacity is effectively available.
  LOG(INFO) << 1 / FLAGS_nvm_cache_usage_ratio * 100
            << "% of the NVM block cache capacity is available for use, after "
            << "accounting for memkind fragmentation.";
| |
| memkind* vmp; |
| int err = CALL_MEMKIND(memkind_create_pmem, FLAGS_nvm_cache_path.c_str(), capacity, &vmp); |
| // If we cannot create the cache pool we should not retry. |
| PLOG_IF(FATAL, err) << "Could not initialize NVM cache library in path " |
| << FLAGS_nvm_cache_path.c_str(); |
| |
| return new ShardedLRUCache(capacity, id, vmp); |
| } |
| |
| bool CanUseNVMCacheForTests() { |
| std::call_once(g_memkind_ops_flag, InitMemkindOps); |
| return g_memkind_available; |
| } |
| |
| } // namespace kudu |