// blob: e956ea612c58f96a8ae35896bb99d28e6a6497d8 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_TABLET_TABLET_METADATA_H
#define KUDU_TABLET_TABLET_METADATA_H
#include <boost/optional/optional_fwd.hpp>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "kudu/common/partition.h"
#include "kudu/common/schema.h"
#include "kudu/consensus/opid.pb.h"
#include "kudu/fs/block_id.h"
#include "kudu/fs/fs_manager.h"
#include "kudu/gutil/callback.h"
#include "kudu/gutil/dynamic_annotations.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/tablet/metadata.pb.h"
#include "kudu/util/mutex.h"
#include "kudu/util/status.h"
#include "kudu/util/status_callback.h"
namespace kudu {
namespace tablet {
// Forward declarations; this header only refers to these types by name.
class RowSetMetadata;
class RowSetMetadataUpdate;

// An ordered collection of shared RowSetMetadata handles.
using RowSetMetadataVector = std::vector<std::shared_ptr<RowSetMetadata>>;

// A set of rowset metadata identifiers.
using RowSetMetadataIds = std::unordered_set<int64_t>;

// Defined out of line.
// NOTE(review): presumably a sentinel meaning "no memstore has been made
// durable yet" -- confirm against the definition.
extern const int64 kNoDurableMemStore;
// Tracks the on-disk blocks that make up a single tablet.
//
// The Tablet owns its TabletMetadata. Whenever the Tablet writes new blocks
// to store its data, it calls Flush() so that the updated block list is
// persisted on disk.
//
// On startup, the TSTabletManager loads one TabletMetadata per super block
// found in the tablets/ directory and then instantiates tablets from them.
class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> {
 public:
  // Creates the metadata for a brand-new tablet and writes out its initial
  // superblock using the supplied parameters. The superblock is assumed not
  // to have been written before.
  static Status CreateNew(FsManager* fs_manager,
                          const std::string& tablet_id,
                          const std::string& table_name,
                          const Schema& schema,
                          const PartitionSchema& partition_schema,
                          const Partition& partition,
                          const TabletDataState& initial_tablet_data_state,
                          scoped_refptr<TabletMetadata>* metadata);

  // Loads the metadata of an already-existing tablet from disk.
  static Status Load(FsManager* fs_manager,
                     const std::string& tablet_id,
                     scoped_refptr<TabletMetadata>* metadata);

  // Loads the tablet if it exists, otherwise creates it.
  // When the tablet already existed, its schema is verified to match the
  // provided 'schema'.
  //
  // Mostly useful for tests that instantiate tablets directly.
  static Status LoadOrCreate(FsManager* fs_manager,
                             const std::string& tablet_id,
                             const std::string& table_name,
                             const Schema& schema,
                             const PartitionSchema& partition_schema,
                             const Partition& partition,
                             const TabletDataState& initial_tablet_data_state,
                             scoped_refptr<TabletMetadata>* metadata);

  // NOTE(review): presumably gathers the block ids referenced by 'superblock'
  // into 'block_ids' -- confirm against the definition.
  static void CollectBlockIdPBs(const TabletSuperBlockPB& superblock,
                                std::vector<BlockIdPB>* block_ids);

  // Returns the tablet's id. Debug-checks that the metadata is no longer in
  // the kNotLoadedYet state.
  const std::string& tablet_id() const {
    DCHECK_NE(state_, kNotLoadedYet);
    return tablet_id_;
  }

  // Returns the partition of the tablet.
  const Partition& partition() const {
    return partition_;
  }

  // Returns a copy of the table's id. Debug-checks that the metadata is no
  // longer in the kNotLoadedYet state.
  std::string table_id() const {
    DCHECK_NE(state_, kNotLoadedYet);
    return table_id_;
  }

  std::string table_name() const;

  uint32_t schema_version() const;

  void SetSchema(const Schema& schema, uint32_t version);

  void SetTableName(const std::string& table_name);

  // Returns a reference to the current schema.
  // The referenced object stays valid until this TabletMetadata is
  // destructed, even if the schema is changed in the meantime.
  const Schema& schema() const {
    // 'schema_' is read through an acquire-load of the pointer's word.
    const AtomicWord* slot = reinterpret_cast<const AtomicWord*>(&schema_);
    const auto raw = base::subtle::Acquire_Load(slot);
    return *reinterpret_cast<const Schema*>(raw);
  }

  // Returns the partition schema of the tablet's table.
  const PartitionSchema& partition_schema() const {
    return partition_schema_;
  }

  // Setter and getter for the remote bootstrap / tablet data state.
  void set_tablet_data_state(TabletDataState state);
  TabletDataState tablet_data_state() const;

  // Raises the flush pin count by one. While the count is above zero,
  // Flush() does _not_ write the metadata to disk.
  void PinFlush();

  // Lowers the flush pin count by one. Once the count is zero, the next
  // Flush() writes the metadata to disk; if Flush() was already requested
  // between PinFlush() and this call, Flush() is invoked from within this
  // method.
  Status UnPinFlush();

  Status Flush();

  // Value of 'last_durable_mrs_id' for which UpdateAndFlush() leaves the
  // last durable MRS ID untouched.
  static const int64_t kNoMrsFlushed = -1;

  // Updates the metadata in the following ways:
  // 1. Adds rowsets from 'to_add'.
  // 2. Removes rowsets from 'to_remove'.
  // 3. Adds orphaned blocks from 'to_remove'.
  // 4. Updates the last durable MRS ID from 'last_durable_mrs_id',
  //    unless it is kNoMrsFlushed.
  Status UpdateAndFlush(const RowSetMetadataIds& to_remove,
                        const RowSetMetadataVector& to_add,
                        int64_t last_durable_mrs_id);

  // Adds the blocks referenced by 'block_ids' to 'orphaned_blocks_'.
  //
  // The set is written to the on-disk metadata by any subsequent flush.
  //
  // Blocks leave the set once a call to DeleteOrphanedBlocks() has
  // successfully deleted them.
  void AddOrphanedBlocks(const std::vector<BlockId>& block_ids);

  // Marks the superblock as being in state 'delete_type', syncs it to disk,
  // and then deletes all of the rowsets in this tablet.
  // The metadata (superblock) itself is kept; use DeleteSuperBlock() to
  // remove it.
  //
  // 'delete_type' must be one of TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED.
  // 'last_logged_opid' should hold the last opid in the log, if one is known.
  // When 'last_logged_opid' is not set, the current value of
  // last_logged_opid is left unmodified. This matters for roll-forward of
  // partially-tombstoned tablets during crash recovery.
  //
  // Returns only once all data has been removed.
  Status DeleteTabletData(TabletDataState delete_type,
                          const boost::optional<consensus::OpId>& last_logged_opid);

  // Permanently deletes the superblock from the disk.
  // DeleteTabletData() must have been called first and the tablet data state
  // must be TABLET_DATA_DELETED.
  // Returns Status::InvalidArgument if the list of orphaned blocks is not empty.
  // Returns Status::IllegalState if the tablet data state is not TABLET_DATA_DELETED.
  Status DeleteSuperBlock();

  // Creates a new RowSetMetadata for this tablet.
  // The new rowset is not added to the list of rowsets; use one of the
  // Update() calls for that.
  Status CreateRowSet(std::shared_ptr<RowSetMetadata>* rowset, const Schema& schema);

  // Returns a reference to the current list of rowsets.
  const RowSetMetadataVector& rowsets() const {
    return rowsets_;
  }

  // Returns the FsManager this metadata was constructed with.
  FsManager* fs_manager() const {
    return fs_manager_;
  }

  // Returns the currently recorded last durable MRS ID.
  int64_t last_durable_mrs_id() const {
    return last_durable_mrs_id_;
  }

  // Test-only: overwrites the recorded last durable MRS ID.
  void SetLastDurableMrsIdForTests(int64_t mrs_id) {
    last_durable_mrs_id_ = mrs_id;
  }

  // Installs the callback stored in 'pre_flush_callback_'.
  void SetPreFlushCallback(StatusClosure callback) {
    pre_flush_callback_ = callback;
  }

  // Returns a copy of the opid recorded when the tablet was last tombstoned.
  consensus::OpId tombstone_last_logged_opid() const {
    return tombstone_last_logged_opid_;
  }

  // Loads the currently-flushed superblock from disk into the given protobuf.
  Status ReadSuperBlockFromDisk(TabletSuperBlockPB* superblock) const;

  // Sets *super_block to the serialized form of the current metadata.
  Status ToSuperBlock(TabletSuperBlockPB* super_block) const;

  // Fully replaces a superblock (used for bootstrap).
  Status ReplaceSuperBlock(const TabletSuperBlockPB& pb);

  // ==========================================================================
  // Stuff used by the tests
  // ==========================================================================
  const RowSetMetadata* GetRowSetForTests(int64_t id) const;

  RowSetMetadata* GetRowSetForTests(int64_t id);

 private:
  friend class RefCountedThreadSafe<TabletMetadata>;
  friend class MetadataTest;

  // Private so that deleting a TabletMetadata object directly fails to
  // compile everywhere except in this class and the friends declared above.
  ~TabletMetadata();

  // Constructor for creating a new tablet.
  //
  // TODO: get rid of this many-arg constructor in favor of just passing in a
  // SuperBlock, which already contains all of these fields.
  TabletMetadata(FsManager* fs_manager, std::string tablet_id,
                 std::string table_name, const Schema& schema,
                 PartitionSchema partition_schema, Partition partition,
                 const TabletDataState& tablet_data_state);

  // Constructor for loading an existing tablet.
  TabletMetadata(FsManager* fs_manager, std::string tablet_id);

  void SetSchemaUnlocked(gscoped_ptr<Schema> schema, uint32_t version);

  Status LoadFromDisk();

  // Brings the state of this metadata in line with the given superblock PB.
  Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock);

  Status ReadSuperBlock(TabletSuperBlockPB* pb);

  // Fully replaces the superblock.
  // Requires 'flush_lock_'.
  Status ReplaceSuperBlockUnlocked(const TabletSuperBlockPB& pb);

  // Requires 'data_lock_'.
  Status UpdateUnlocked(const RowSetMetadataIds& to_remove,
                        const RowSetMetadataVector& to_add,
                        int64_t last_durable_mrs_id);

  // Requires 'data_lock_'.
  Status ToSuperBlockUnlocked(TabletSuperBlockPB* super_block,
                              const RowSetMetadataVector& rowsets) const;

  // Requires 'data_lock_'.
  void AddOrphanedBlocksUnlocked(const std::vector<BlockId>& block_ids);

  // Deletes the provided 'blocks' on disk.
  //
  // Every block that is deleted successfully is also removed from the
  // 'orphaned_blocks_' set.
  //
  // Failures are logged, but are not fatal.
  void DeleteOrphanedBlocks(const std::vector<BlockId>& blocks);

  // Returns the standard "T xxx P yyy" log prefix.
  std::string LogPrefix() const;

  // Lifecycle state of this object; tablet_id() and table_id() debug-check
  // that it is not kNotLoadedYet.
  enum State {
    kNotLoadedYet,
    kNotWrittenYet,
    kInitialized
  };
  State state_;

  // Lock protecting the underlying data.
  using LockType = simple_spinlock;
  mutable LockType data_lock_;

  // Lock protecting the flushing of the data to disk.
  // When held together with 'data_lock_', this one must be acquired first.
  mutable Mutex flush_lock_;

  const std::string tablet_id_;
  std::string table_id_;
  Partition partition_;
  FsManager* const fs_manager_;
  RowSetMetadataVector rowsets_;
  base::subtle::Atomic64 next_rowset_idx_;
  int64_t last_durable_mrs_id_;

  // The current schema version. Owned by this class.
  // Held as a raw pointer rather than a gscoped_ptr so that it can be
  // swapped atomically.
  Schema* schema_;
  uint32_t schema_version_;
  std::string table_name_;
  PartitionSchema partition_schema_;

  // Previous values of 'schema_'.
  // They are currently kept alive forever, on the assumption that a given
  // tablet won't see thousands of "alter table" calls. Keeping them alive
  // means callers of schema() need not worry about reference counting or
  // locking.
  std::vector<Schema*> old_schemas_;

  // Protected by 'data_lock_'.
  std::unordered_set<BlockId, BlockIdHash, BlockIdEqual> orphaned_blocks_;

  // The current state of remote bootstrap for the tablet.
  TabletDataState tablet_data_state_;

  // The last opid logged by the tablet before it was most recently
  // tombstoned. Has no meaning for non-tombstoned tablets.
  consensus::OpId tombstone_last_logged_opid_;

  // While this counter is > 0, Flush() does not write any data to disk.
  int32_t num_flush_pins_;

  // Set when Flush() is called while num_flush_pins_ is > 0; if true, the
  // next UnPinFlush will call Flush() again so that the metadata ends up
  // persisted.
  bool needs_flush_;

  // A callback that, if set, is called before this metadata is flushed
  // to disk.
  StatusClosure pre_flush_callback_;

  DISALLOW_COPY_AND_ASSIGN(TabletMetadata);
};
} // namespace tablet
} // namespace kudu
#endif /* KUDU_TABLET_TABLET_METADATA_H */