blob: 021c758d91785a2dda2f630b9955bef5cdee9f54 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <gtest/gtest_prod.h>
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/cache.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/locks.h"
#include "kudu/util/status.h"
namespace kudu {
class Env;
namespace internal {
template <class FileType>
class BaseDescriptor;
template <class FileType>
class Descriptor;
} // namespace internal
class MetricEntity;
class Thread;
// Cache of open files.
//
// The purpose of this cache is to enforce an upper bound on the maximum number
// of files open at a time. Files opened through the cache may be closed at any
// time, only to be reopened upon next use.
//
// The file cache can be viewed as having two logical parts: the client-facing
// API and the LRU cache.
//
// Client-facing API
// -----------------
// The core of the client-facing API is the cache descriptor. A descriptor
// uniquely identifies an opened file. To a client, a descriptor is just an
// open file interface of the variety defined in util/env.h. Clients open
// descriptors via the OpenExistingFile() cache method.
//
// Descriptors are shared objects; an existing descriptor is handed back to a
// client if a file with the same name is already opened. To facilitate
// descriptor sharing, the file cache maintains a by-file-name descriptor map.
// The values are weak references to the descriptors so that map entries don't
// affect the descriptor lifecycle.
//
// LRU cache
// ---------
// The lower half of the file cache is a standard LRU cache whose keys are file
// names and whose values are pointers to opened file objects allocated on the
// heap. Unlike the descriptor map, this cache has an upper bound on capacity,
// and handles are evicted (and closed) according to an LRU algorithm.
//
// Whenever a descriptor is used by a client in file I/O, its file name is used
// in an LRU cache lookup. If found, the underlying file is still open and the
// file access is performed. Otherwise, the file must have been evicted and
// closed, so it is reopened and reinserted (possibly evicting a different open
// file) before the file access is performed.
//
// Other notes
// -----------
// In a world where files are opened and closed transparently, file deletion
// demands special care if UNIX semantics are to be preserved. When a call to
// DeleteFile() is made to a file with an opened descriptor, the descriptor is
// simply "marked" as to-be-deleted-later. Only when all references to the
// descriptor are dropped is the file actually deleted. If there is no open
// descriptor, the file is deleted immediately.
//
// Every public method in the file cache is thread safe.
template <class FileType>
class FileCache {
public:
// Creates a new file cache.
//
// The 'cache_name' is used to disambiguate amongst other file cache
// instances. The cache will use 'max_open_files' as a soft upper bound on
// the number of files open at any given time.
FileCache(const std::string& cache_name,
Env* env,
int max_open_files,
const scoped_refptr<MetricEntity>& entity);
// Destroys the file cache.
~FileCache();
// Initializes the file cache. Initialization done here may fail.
Status Init();
// Opens an existing file by name through the cache.
//
// The returned 'file' is actually an object called a descriptor. It adheres
// to a file-like interface but interfaces with the cache under the hood to
// reopen a file as needed during file operations.
//
// The descriptor is opened immediately to verify that the on-disk file can
// be opened, but may be closed later if the cache reaches its upper bound on
// the number of open files.
Status OpenExistingFile(const std::string& file_name,
std::shared_ptr<FileType>* file);
// Deletes a file by name through the cache.
//
// If there is an outstanding descriptor for the file, the deletion will be
// deferred until the last referent is dropped. Otherwise, the file is
// deleted immediately.
Status DeleteFile(const std::string& file_name);
// Invalidate the given path in the cache if present. This removes the
// path from the cache, and invalidates any previously-opened descriptors
// associated with this file.
//
// If a file with the same path is opened again, the actual path will be opened from
// disk.
//
// This operation should be used during 'rename-to-replace' patterns, eg:
//
// WriteNewDataTo(tmp_path);
// env->RenameFile(tmp_path, p);
// file_cache->Invalidate(p);
//
// NOTE: if any reader of 'p' holds an open descriptor from the cache
// prior to this operation, that descriptor is invalidated and any
// further operations on that descriptor will result in a CHECK failure.
// Hence this is not safe to use without some external synchronization
// which prevents concurrent access to the same file.
//
// NOTE: this function must not be called concurrently on the same file name
// from multiple threads.
void Invalidate(const std::string& file_name);
// Returns the number of entries in the descriptor map.
//
// Only intended for unit tests.
int NumDescriptorsForTests() const;
// Dumps the contents of the file cache. Intended for debugging.
std::string ToDebugString() const;
private:
friend class internal::BaseDescriptor<FileType>;
template<class FileType2>
FRIEND_TEST(FileCacheTest, TestBasicOperations);
// Looks up a descriptor by file name.
//
// Must be called with 'lock_' held.
Status FindDescriptorUnlocked(
const std::string& file_name,
std::shared_ptr<internal::Descriptor<FileType>>* file);
// Periodically removes expired descriptors from 'descriptors_'.
void RunDescriptorExpiry();
// Interface to the underlying filesystem.
Env* env_;
// Name of the cache.
const std::string cache_name_;
// Invoked whenever a cached file reaches zero references (i.e. it was
// removed from the cache and is no longer in use by any file operations).
std::unique_ptr<Cache::EvictionCallback> eviction_cb_;
// Underlying cache instance. Caches opened files.
std::unique_ptr<Cache> cache_;
// Protects the descriptor map.
mutable simple_spinlock lock_;
// Maps filenames to descriptors.
std::unordered_map<std::string,
std::weak_ptr<internal::Descriptor<FileType>>> descriptors_;
// Calls RunDescriptorExpiry() in a loop until 'running_' isn't set.
scoped_refptr<Thread> descriptor_expiry_thread_;
// Tracks whether or not 'descriptor_expiry_thread_' should be running.
CountDownLatch running_;
DISALLOW_COPY_AND_ASSIGN(FileCache);
};
} // namespace kudu