// blob: c22ee91a2559b18762f3dfef4b63a2d75f456c67 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <gtest/gtest_prod.h>
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/cache.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/env.h"
#include "kudu/util/locks.h"
#include "kudu/util/status.h"
namespace kudu {
namespace internal {
// Forward declarations of the descriptor class templates used by FileCache
// below: BaseDescriptor<> instantiations are befriended by the cache, and
// Descriptor<> is what the cache's by-file-name maps weakly reference.
// Both are parameterized on the kind of file being wrapped.
template <class FileType> class BaseDescriptor;
template <class FileType> class Descriptor;
} // namespace internal
// Forward declarations. Within this header these types are only referred to
// through scoped_refptr<> (the FileCache constructor's 'entity' argument and
// the 'descriptor_expiry_thread_' member).
class MetricEntity;
class Thread;
// Cache of open files.
//
// The purpose of this cache is to enforce an upper bound on the maximum number
// of files open at a time. Files opened through the cache may be closed at any
// time, only to be reopened upon next use.
//
// The file cache can be viewed as having two logical parts: the client-facing
// API and the LRU cache.
//
// Client-facing API
// -----------------
// The core of the client-facing API is the cache descriptor. A descriptor
// uniquely identifies an opened file. To a client, a descriptor is just an
// open file interface of the variety defined in util/env.h. Clients open
// descriptors via the OpenFile() cache methods.
//
// Descriptors are shared objects; an existing descriptor is handed back to a
// client if a file with the same name is already opened. To facilitate
// descriptor sharing, the file cache maintains by-file-name descriptor maps
// (one per file type). The values are weak references to the descriptors so
// that map entries don't affect the descriptor lifecycle.
//
// LRU cache
// ---------
// The lower half of the file cache is a standard LRU cache whose keys are file
// names and whose values are pointers to opened file objects allocated on the
// heap. Unlike the descriptor maps, this cache has an upper bound on capacity,
// and handles are evicted (and closed) according to an LRU algorithm.
//
// Whenever a descriptor is used by a client in file I/O, its file name is used
// in an LRU cache lookup. If found, the underlying file is still open and the
// file access is performed. Otherwise, the file must have been evicted and
// closed, so it is reopened and reinserted (possibly evicting a different open
// file) before the file access is performed.
//
// Other notes
// -----------
// In a world where files are opened and closed transparently, file deletion
// demands special care if UNIX semantics are to be preserved. When a call to
// DeleteFile() is made to a file with an opened descriptor, the descriptor is
// simply "marked" as to-be-deleted-later. Only when all references to the
// descriptor are dropped is the file actually deleted. If there is no open
// descriptor, the file is deleted immediately.
//
// Every public method in the file cache is thread safe.
class FileCache {
public:
// Creates a new file cache.
//
// The 'cache_name' is used to disambiguate amongst other file cache
// instances. The cache will use 'max_open_files' as a soft upper bound on
// the number of files open at any given time.
//
// 'env' is the interface to the underlying filesystem. The cache keeps a raw
// 'Env*' member, so presumably 'env' must outlive the cache (confirm with
// callers).
//
// NOTE(review): 'entity' is presumably the metric entity against which the
// cache's metrics are registered -- confirm in the implementation.
FileCache(const std::string& cache_name,
Env* env,
int max_open_files,
const scoped_refptr<MetricEntity>& entity);
// Destroys the file cache.
~FileCache();
// Initializes the file cache. Initialization done here may fail, in which
// case a non-OK Status is returned.
//
// NOTE(review): presumably must be called once, before any other method --
// confirm in the implementation.
Status Init();
// Opens an existing file by name through the cache.
//
// The returned 'file' is actually an object called a descriptor. It adheres
// to a file-like interface but interfaces with the cache under the hood to
// reopen a file as needed during file operations.
//
// The underlying file is opened immediately to respect 'Mode', but may be
// closed later if the cache reaches its upper bound on the number of open
// files. It is also closed when the descriptor's last reference is dropped.
//
// All file types honor a 'Mode' of MUST_EXIST. Some may honor other modes as
// well, although transparently reopening evicted files will always use
// MUST_EXIST. Different combinations of modes and file types are expressed as
// template specializations; if a file type doesn't support a particular mode,
// there will be a linker error.
//
// 'file_name' names the file to open; 'file' is an out-parameter that
// receives the descriptor.
//
// TODO(adar): The file cache tries to behave as if users were accessing the
// underlying POSIX filesystem directly, but its semantics aren't 100% correct
// when using modes other than MUST_EXIST. For example, the behavior of
// MUST_CREATE and CREATE_OR_OPEN isn't quite right for open files marked for
// deletion. In theory we should "unmark" such a file to indicate that it was
// recreated, and truncate it so it's empty for the second client, but the
// truncation would corrupt the file for the first client. In short, take
// great care when using any mode apart from MUST_EXIST.
template <Env::OpenMode Mode, class FileType>
Status OpenFile(const std::string& file_name,
std::shared_ptr<FileType>* file);
// Deletes a file by name through the cache.
//
// If there is an outstanding descriptor for the file, the deletion will be
// deferred until the last reference is dropped. Otherwise, the file is
// deleted immediately.
Status DeleteFile(const std::string& file_name);
// Invalidates the given path in the cache if present. This removes the
// path from the cache, and invalidates any previously-opened descriptors
// associated with this file.
//
// If a file with the same path is opened again, the actual path will be
// opened from disk.
//
// This operation should be used during 'rename-to-replace' patterns, e.g.:
//
// WriteNewDataTo(tmp_path);
// env->RenameFile(tmp_path, p);
// file_cache->Invalidate(p);
//
// NOTE: if any reader of 'p' holds an open descriptor from the cache
// prior to this operation, that descriptor is invalidated and any
// further operations on that descriptor will result in a CHECK failure.
// Hence this is not safe to use without some external synchronization
// which prevents concurrent access to the same file.
//
// NOTE: this function must not be called concurrently on the same file name
// from multiple threads.
void Invalidate(const std::string& file_name);
// Returns the number of entries in the descriptor maps.
//
// Only intended for unit tests.
size_t NumDescriptorsForTests() const;
// Dumps the contents of the file cache. Intended for debugging.
std::string ToDebugString() const;
private:
// The descriptor implementations are granted access to the cache's private
// members.
friend class internal::BaseDescriptor<RWFile>;
friend class internal::BaseDescriptor<RandomAccessFile>;
// Maps a file name to a weak reference to its descriptor. The references are
// weak so that map entries don't keep descriptors alive.
template <class FileType>
using DescriptorMap = std::unordered_map<std::string,
std::weak_ptr<internal::Descriptor<FileType>>>;
// FRIEND_TEST expands to a 'friend class' declaration, so the template
// header below makes it befriend a class template rather than a plain class.
// NOTE(review): presumably needed because FileCacheTest is a typed
// (templated) test -- confirm against the test file.
template <class FileType>
FRIEND_TEST(FileCacheTest, TestBasicOperations);
// Dumps the descriptor map 'descs'. All output will be prefixed by 'prefix'.
template <class FileType>
static std::string MapToDebugString(const DescriptorMap<FileType>& descs,
const std::string& prefix);
// Removes all expired descriptors from 'descs'.
//
// NOTE(review): "expired" presumably means the map's weak reference can no
// longer be locked -- confirm in the implementation.
template <class FileType>
static void ExpireDescriptorsFromMap(DescriptorMap<FileType>* descs);
// Controls whether FindDescriptorUnlocked() may create a descriptor when the
// lookup misses.
enum class FindMode {
// Only return an existing descriptor from the map; don't create a new one.
DONT_CREATE,
// Create a new descriptor if one did not exist in the map.
CREATE_IF_NOT_EXIST,
};
// Looks up a descriptor by file name in 'descs' or creates a new one (if
// requested via 'mode').
//
// The value of 'created_desc' will be set in accordance with whether a new
// descriptor was created.
//
// NOTE(review): the return value on a DONT_CREATE miss is presumably a null
// pointer -- confirm in the implementation.
//
// Must be called with 'lock_' held.
template <class FileType>
std::shared_ptr<internal::Descriptor<FileType>> FindDescriptorUnlocked(
const std::string& file_name,
FindMode mode,
DescriptorMap<FileType>* descs,
bool* created_desc);
// Periodically removes expired descriptors from the descriptor maps.
void RunDescriptorExpiry();
// Actually opens the file as per OpenFile. Used to encapsulate the bulk of
// OpenFile because C++ prohibits partial specialization of template functions.
//
// NOTE(review): 'created_desc' presumably reports whether a new descriptor
// was created, as in FindDescriptorUnlocked() -- confirm in the
// implementation.
template <class FileType>
Status DoOpenFile(const std::string& file_name,
std::shared_ptr<FileType>* file,
bool* created_desc);
// Status message prefix for files that have already been marked as deleted.
static const char* const kAlreadyDeleted;
// Interface to the underlying filesystem.
Env* env_;
// Name of the cache.
const std::string cache_name_;
// Invoked whenever a cached file reaches zero references (i.e. it was
// removed from the cache and is no longer in use by any file operations).
//
// NOTE(review): declared before 'cache_', so it is destroyed after the cache;
// presumably intentional, so keep this ordering.
std::unique_ptr<Cache::EvictionCallback> eviction_cb_;
// Underlying cache instance. Caches opened files.
std::unique_ptr<Cache> cache_;
// Protects the descriptor maps ('rwf_descs_' and 'raf_descs_').
mutable simple_spinlock lock_;
// Maps filenames to descriptors, one map per file type.
DescriptorMap<RWFile> rwf_descs_;
DescriptorMap<RandomAccessFile> raf_descs_;
// Calls RunDescriptorExpiry() in a loop until 'running_' isn't set.
scoped_refptr<Thread> descriptor_expiry_thread_;
// Tracks whether or not 'descriptor_expiry_thread_' should be running.
CountDownLatch running_;
// The cache is neither copyable nor assignable.
DISALLOW_COPY_AND_ASSIGN(FileCache);
};
} // namespace kudu