blob: 47ca3b5bc83da83ca0a0e784a1cfc8d0e51e430b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <map>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <glog/logging.h>
#include <gtest/gtest_prod.h>
#include "kudu/common/iterator.h"
#include "kudu/common/schema.h"
#include "kudu/fs/io_context.h"
#include "kudu/gutil/integral_types.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/threading/thread_collision_warner.h"
#include "kudu/tablet/lock_manager.h"
#include "kudu/tablet/mvcc.h"
#include "kudu/tablet/rowset.h"
#include "kudu/tablet/tablet_mem_trackers.h"
#include "kudu/tablet/tablet_metadata.h"
#include "kudu/tablet/txn_participant.h"
#include "kudu/util/bloom_filter.h"
#include "kudu/util/locks.h"
#include "kudu/util/maintenance_manager.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/rw_semaphore.h"
#include "kudu/util/semaphore.h"
#include "kudu/util/status.h"
namespace kudu {
class AlterTableTest;
class ConstContiguousRow;
class EncodedKey;
class KeyRange;
class MemTracker;
class RowBlock;
class ScanSpec;
class Throttler;
class Timestamp;
struct IterWithBounds;
struct IteratorStats;
namespace consensus {
class OpId;
} // namespace consensus
namespace clock {
class Clock;
} // namespace clock
namespace log {
class LogAnchorRegistry;
} // namespace log
namespace tablet {
class AlterSchemaOpState;
class CompactionPolicy;
class HistoryGcOpts;
class MemRowSet;
class ParticipantOpState;
class RowSetTree;
class RowSetsInCompaction;
class TxnMetadata;
class WriteOpState;
struct RowOp;
struct TabletComponents;
struct TabletMetrics;
struct TxnRowSets;
class Tablet {
public:
typedef std::map<int64_t, int64_t> ReplaySizeMap;
friend class CompactRowSetsOp;
friend class FlushMRSOp;
class CompactionFaultHooks;
class FlushCompactCommonHooks;
class FlushFaultHooks;
class Iterator;
// Create a new tablet.
//
// If 'metric_registry' is non-NULL, then this tablet will create a 'tablet' entity
// within the provided registry. Otherwise, no metrics are collected.
Tablet(scoped_refptr<TabletMetadata> metadata,
clock::Clock* clock,
std::shared_ptr<MemTracker> parent_mem_tracker,
MetricRegistry* metric_registry,
scoped_refptr<log::LogAnchorRegistry> log_anchor_registry);
~Tablet();
// Open the tablet, initializing transactions for 'in_flight_txn_ids', and
// MRSs for 'txn_ids_with_mrs'. The created MRSs will be uncommitted -- it is
// up to the caller to determine whether they should be committed after
// finishing bootstrapping.
//
// Upon completion, the tablet enters the kBootstrapping state.
Status Open(const std::unordered_set<int64_t>& in_flight_txn_ids = std::unordered_set<int64_t>{},
const std::unordered_set<int64_t>& txn_ids_with_mrs = std::unordered_set<int64_t>{});
// Mark that the tablet has finished bootstrapping.
// This transitions from kBootstrapping to kOpen state.
// Returns an error if tablet has been stopped.
Status MarkFinishedBootstrapping();
// Shuts down the tablet, unregistering various components that may attempt
// to point back to it, and changing the lifecycle state to 'kShutdown'.
void Shutdown();
// Stops the tablet from making any progress. Currently-Applying operations
// are terminated early on a best-effort basis, new ops will return
// with Status::Aborted(), tablet maintenance ops will no longer be
// scheduled, and the lifecycle state is set to 'kStopped'.
//
// Currently, the tablet will only be Stopped as the tablet is shutting down.
// In the future, it can be Stopped when it hits a non-recoverable error
// (e.g. a disk error) to immediately prevent further writes.
void Stop();
// Returns whether the tablet has been stopped, i.e. is in either the
// 'kStopped' or 'kShutdown' state.
bool HasBeenStopped() const;
// Decode the Write (insert/mutate) operations from within a user's
// request.
Status DecodeWriteOperations(const Schema* client_schema,
WriteOpState* op_state);
// Acquire locks for each of the operations in the given write op.
// This also sets the row op's RowSetKeyProbe.
Status AcquireRowLocks(WriteOpState* op_state);
// Acquire locks for the given write op. If 'must_acquire' is true,
// then wait until the lock is acquired. Otherwise, return
// 'TXN_LOCKED_ABORT' or 'TXN_LOCKED_RETRY_OP' error if lock
// cannot be acquired.
Status AcquirePartitionLock(WriteOpState* op_state,
LockManager::LockWaitMode wait_mode);
// Acquire a shared lock on the given transaction, to ensure the
// transaction's state doesn't change while the given write is in flight.
Status AcquireTxnLock(int64_t txn_id, WriteOpState* op_state);
// Starts an MVCC op which must have a pre-assigned timestamp.
//
// TODO(todd): rename this to something like "FinishPrepare" or "StartApply", since
// it's not the first thing in an op!
void StartOp(WriteOpState* op_state);
void StartOp(ParticipantOpState* op_state);
// Like the above but actually assigns the timestamp. Only used for tests that
// don't boot a tablet server.
void AssignTimestampAndStartOpForTests(WriteOpState* op_state);
// Signal that the given op is about to Apply.
void StartApplying(WriteOpState* op_state);
void StartApplying(ParticipantOpState* op_state);
// Apply all of the row operations associated with this op.
Status ApplyRowOperations(WriteOpState* op_state) WARN_UNUSED_RESULT;
// Apply a single row operation, which must already be prepared.
// The result is set back into row_op->result.
Status ApplyRowOperation(const fs::IOContext* io_context,
WriteOpState* op_state,
RowOp* row_op,
ProbeStats* stats) WARN_UNUSED_RESULT;
// Begins the transaction, recording its presence in the tablet metadata.
// Upon calling this, 'op_id' will be anchored until the metadata is flushed,
// using 'txn' as the anchor owner.
void BeginTransaction(Txn* txn, const consensus::OpId& op_id);
// Indicates that the transaction has started to commit, recording the
// timestamp used by the MVCC op to demarcate the end of the transaction in
// the tablet metadata. Upon calling this, 'op_id' will be anchored until
// the metadata is flushed, using 'txn' as the anchor owner.
void BeginCommit(Txn* txn, Timestamp mvcc_op_ts, const consensus::OpId& op_id);
// Commits the transaction, recording its commit timestamp in the tablet metadata.
// Upon calling this, 'op_id' will be anchored until the metadata is flushed,
// using 'txn' as the anchor owner.
void CommitTransaction(Txn* txn, Timestamp commit_ts, const consensus::OpId& op_id);
// Merges the uncommitted transaction rowsets associated with the given
// 'txn_id' with the committed rowsets.
void CommitTxnRowSets(int64_t txn_id);
// Aborts the transaction, recording the abort in the tablet metadata.
// Upon calling this, 'op_id' will be anchored until the metadata is flushed,
// using 'txn' as the anchor owner.
void AbortTransaction(Txn* txn, const consensus::OpId& op_id);
// Creates new rowsets for the given transaction.
void CreateTxnRowSets(int64_t txn_id, scoped_refptr<TxnMetadata> txn_meta);
// Create a new row iterator which yields the rows as of the current MVCC
// state of this tablet.
// The returned iterator is not initialized.
Status NewRowIterator(const Schema& projection,
std::unique_ptr<RowwiseIterator>* iter) const;
// Like above, but returns an ordered iterator.
Status NewOrderedRowIterator(const Schema& projection,
std::unique_ptr<RowwiseIterator>* iter) const;
// Create a new row iterator using specific iterator options.
//
// 'opts' contains the options desired from the iterator.
//
// Note: the Schema pointed to by the 'projection' field of the 'opts' struct
// will be copied, so that pointer only needs to remain valid during the call
// to NewRowIterator() and not after that.
// Similarly, the 'io_context' field of the 'opts' struct will be ignored and
// overwritten in the copy of the 'opts' struct used by the returned iterator
// because the iterator constructs and holds the relevant instance of that
// object as a member variable.
Status NewRowIterator(RowIteratorOptions opts,
std::unique_ptr<RowwiseIterator>* iter) const;
// Flush the current MemRowSet for this tablet to disk. This swaps
// in a new (initially empty) MemRowSet in its place.
//
// This doesn't flush any DeltaMemStores for any existing RowSets.
// To do that, call FlushBiggestDMS() for example.
Status Flush();
// Prepares the op context for the alter schema operation.
// An error will be returned if the specified schema is invalid (e.g.
// key mismatch, or missing IDs)
Status CreatePreparedAlterSchema(AlterSchemaOpState *op_state,
const SchemaPtr& schema);
// Apply the Schema of the specified op.
// This operation will trigger a flush on the current MemRowSet.
Status AlterSchema(AlterSchemaOpState* op_state);
// Rewind the schema to an earlier version than is written in the on-disk
// metadata. This is done during bootstrap to roll the schema back to the
// point in time where the logs-to-be-replayed begin, so we can then decode
// the operations in the log with the correct schema.
//
// REQUIRES: state_ == kBootstrapping
Status RewindSchemaForBootstrap(const Schema& new_schema,
int64_t schema_version);
// Prints current RowSet layout, taking a snapshot of the current RowSet interval
// tree. Also prints the log of the compaction algorithm as evaluated
// on the current layout.
void PrintRSLayout(std::ostream* o);
// Flags to change the behavior of compaction.
enum CompactFlag {
COMPACT_NO_FLAGS = 0,
// Force the compaction to include all rowsets, regardless of the
// configured compaction policy. This is currently only used in
// tests.
FORCE_COMPACT_ALL = 1 << 0
};
typedef int CompactFlags;
Status Compact(CompactFlags flags);
// Update the statistics for performing a compaction.
void UpdateCompactionStats(MaintenanceOpStats* stats);
// Returns the exact current size of the MRS, in bytes. A value greater than 0 doesn't imply
// that the MRS has data, only that it has allocated that amount of memory.
// This method takes a read lock on component_lock_ and is thread-safe.
size_t MemRowSetSize() const;
// Returns true if the MRS is empty, else false. Doesn't rely on size and
// actually verifies that the MRS has no elements.
// This method takes a read lock on component_lock_ and is thread-safe.
bool MemRowSetEmpty() const;
// Returns the size in bytes of WALs that would need to be replayed to restore
// the current MRS.
size_t MemRowSetLogReplaySize(const ReplaySizeMap& replay_size_map) const;
// Returns the total on-disk size of this tablet, in bytes.
// Includes the tablet superblock.
size_t OnDiskSize() const;
// Returns the on-disk size of this tablet's data, in bytes.
// Excludes all metadata (both tablet metadata and the metadata of this tablet's rowsets).
size_t OnDiskDataSize() const;
// Get the total size of all the DMS
size_t DeltaMemStoresSize() const;
// Same as MemRowSetEmpty(), but for the DMS.
bool DeltaMemRowSetEmpty() const;
// Flushes the DMS with the highest retention.
Status FlushBestDMS(const ReplaySizeMap &replay_size_map) const;
// Flush only the biggest DMS
Status FlushBiggestDMS();
// Flush all delta memstores. Only used for tests.
Status FlushAllDMSForTests();
// Run a major compaction on all delta stores. Initializes any un-initialized
// redo delta stores. Only used for tests.
Status MajorCompactAllDeltaStoresForTests();
// Finds the RowSet which has the most separate delta files and
// issues a delta compaction.
Status CompactWorstDeltas(RowSet::DeltaCompactionType type);
// Get the highest performance improvement that would come from compacting the delta stores
// of one of the rowsets. If the returned performance improvement is 0, or if 'rs' is NULL,
// then 'rs' isn't set. Callers who already own compact_select_lock_
// can call GetPerfImprovementForBestDeltaCompactUnlocked().
double GetPerfImprovementForBestDeltaCompact(RowSet::DeltaCompactionType type,
std::shared_ptr<RowSet>* rs) const;
// Same as GetPerfImprovementForBestDeltaCompact(), but doesn't take a lock on
// compact_select_lock_.
double GetPerfImprovementForBestDeltaCompactUnlocked(RowSet::DeltaCompactionType type,
std::shared_ptr<RowSet>* rs) const;
// Estimate the number of bytes in ancient undo delta stores. This may be an
// overestimate.
Status EstimateBytesInPotentiallyAncientUndoDeltas(int64_t* bytes);
// Initialize undo delta blocks for up to 'time_budget' amount of time.
// If 'time_budget' is not Initialized() then there is no time limit.
// If this method returns OK, the number of bytes found in ancient undo files
// is returned in the out-param 'bytes_in_ancient_undos'.
Status InitAncientUndoDeltas(MonoDelta time_budget, int64_t* bytes_in_ancient_undos);
// Find and delete all undo delta blocks that have a maximum op timestamp
// prior to the current ancient history mark. If this method returns OK, the
// number of blocks and bytes deleted are returned in the out-parameters.
//
// Returns an error if the metadata update fails. Upon failure, no in-memory
// state is change.
Status DeleteAncientUndoDeltas(int64_t* blocks_deleted = nullptr,
int64_t* bytes_deleted = nullptr);
// Returns the number of bytes potentially used by rowsets that have no live
// rows and are entirely ancient.
//
// These checks may not touch on-disk block data if we can determine from the
// live row count that the rowsets aren't fully deleted, or from the DMS that
// the latest update is not considered ancient. If there is no DMS, looks at
// the newest redo but doesn't initialize it. As such, since we may miss out
// on counting rowsets we haven't initialized yet, this may be an
// underestimate.
Status GetBytesInAncientDeletedRowsets(int64_t* bytes_in_ancient_deleted_rowsets);
// Finds and GCs all fully deleted rowsets that have a maximum op timestamp
// prior to the current ancient history mark.
//
// Returns an error if the metadata update fails. Upon failure, no in-memory
// state is change.
Status DeleteAncientDeletedRowsets();
// Counts the number of deltas in the tablet. Only used for tests.
int64_t CountUndoDeltasForTests() const;
int64_t CountRedoDeltasForTests() const;
// Return the current number of rowsets in the tablet.
size_t num_rowsets() const;
// Attempt to count the total number of rows in the tablet.
// This is not super-efficient since it must iterate over the
// memrowset in the current implementation.
Status CountRows(uint64_t *count) const;
// Count the number of live rows in this tablet.
Status CountLiveRows(uint64_t* count) const;
// Verbosely dump this entire tablet to the logs. This is only
// really useful when debugging unit tests failures where the tablet
// has a very small number of rows.
Status DebugDump(std::vector<std::string> *lines = NULL);
const SchemaPtr schema() const {
return metadata_->schema();
}
// Returns a reference to the key projection of the tablet schema.
// The schema keys are immutable.
const Schema& key_schema() const { return key_schema_; }
// Return the MVCC manager for this tablet.
MvccManager* mvcc_manager() { return &mvcc_; }
// Return the Lock Manager for this tablet.
LockManager* lock_manager() { return &lock_manager_; }
// Return the transaction participant for this tablet.
TxnParticipant* txn_participant() { return &txn_participant_; }
const TabletMetadata *metadata() const { return metadata_.get(); }
TabletMetadata *metadata() { return metadata_.get(); }
scoped_refptr<TabletMetadata> shared_metadata() const { return metadata_; }
void SetCompactionHooksForTests(const std::shared_ptr<CompactionFaultHooks> &hooks);
void SetFlushHooksForTests(const std::shared_ptr<FlushFaultHooks> &hooks);
void SetFlushCompactCommonHooksForTests(
const std::shared_ptr<FlushCompactCommonHooks> &hooks);
// Returns the current MemRowSet id, for tests.
// This method takes a read lock on component_lock_ and is thread-safe.
int32_t CurrentMrsIdForTests() const;
// Runs a major delta major compaction on columns with specified IDs.
// NOTE: RowSet must presently be a DiskRowSet. (Perhaps the API should be
// a shared_ptr API for now?)
//
// Only used in tests.
Status DoMajorDeltaCompaction(const std::vector<ColumnId>& col_ids,
const std::shared_ptr<RowSet>& input_rs,
const fs::IOContext* io_context = nullptr);
// Calculates the ancient history mark and returns true iff tablet history GC
// is enabled, which requires the use of a HybridClock.
// Otherwise, returns false.
bool GetTabletAncientHistoryMark(Timestamp* ancient_history_mark) const WARN_UNUSED_RESULT;
// Calculates history GC options based on properties of the Clock implementation.
HistoryGcOpts GetHistoryGcOpts() const;
// Method used by tests to retrieve all rowsets of this table. This
// will be removed once code for selecting the appropriate RowSet is
// finished and delta files is finished is part of Tablet class.
void GetRowSetsForTests(std::vector<std::shared_ptr<RowSet> >* out);
// Register the maintenance ops associated with this tablet
void RegisterMaintenanceOps(MaintenanceManager* maint_mgr);
// Unregister the maintenance ops associated with this tablet. This will wait
// for all ops to finish before returning.
//
// This method is not thread safe, but is currently only called during
// TabletReplica::Shutdown(), which is single-threaded by design.
void UnregisterMaintenanceOps();
// Cancel the maintenance ops associated with this tablet. This will prevent
// further scheduling of the ops and will not wait for any ops to finish.
//
// This method is thread-safe.
void CancelMaintenanceOps();
const std::string& tablet_id() const { return metadata_->tablet_id(); }
// Return the metrics for this tablet.
// May be NULL in unit tests, etc.
TabletMetrics* metrics() { return metrics_.get(); }
// Return handle to the metric entity of this tablet.
const scoped_refptr<MetricEntity>& GetMetricEntity() const {
return metric_entity_;
}
// Returns a reference to this tablet's memory tracker.
const std::shared_ptr<MemTracker>& mem_tracker() const {
return mem_trackers_.tablet_tracker;
}
// Throttle a RPC with 'bytes' request size.
// Return true if this RPC is allowed.
bool ShouldThrottleAllow(int64_t bytes);
clock::Clock* clock() const { return clock_; }
std::string LogPrefix() const;
// Return false if the tablets need to compact,
// otherwise return true.
bool disable_compaction() const;
// Return the default bloom filter sizing parameters, configured by server flags.
static BloomFilterSizing DefaultBloomSizing();
// Split [start_key, stop_key) into primary key ranges by chunk size.
//
// If column_ids specified, then the size estimate used for 'target_chunk_size'
// should only include these columns. This can be used if a query will
// only scan a certain subset of the columns.
void SplitKeyRange(const EncodedKey* start_key,
const EncodedKey* stop_key,
const std::vector<ColumnId>& column_ids,
uint64 target_chunk_size,
std::vector<KeyRange>* ranges);
// Update the last read operation timestamp.
void UpdateLastReadTime();
// Collect and update recent workload statistics for the tablet.
// Return the current workload score of the tablet.
//
// This method is not thread safe and should only be called from a single thread at once.
double CollectAndUpdateWorkloadStats(MaintenanceOp::PerfImprovementOpType type);
// Returns the best DMS to flush, based on its memory size and retained
// bytes. Also returns the earliest creation time of a DMS seen. Note that
// 'mem_size' and 'replay_size' correspond to the same DMS but
// 'earliest_dms_time' may not.
std::shared_ptr<RowSet> FindBestDMSToFlush(const ReplaySizeMap& replay_size_map,
int64_t* mem_size = nullptr,
int64_t* replay_size = nullptr,
MonoTime* earliest_dms_time = nullptr) const;
private:
friend class kudu::AlterTableTest;
friend class Iterator;
friend class TabletReplicaTest;
friend class TabletReplicaTestBase;
FRIEND_TEST(TestTablet, TestGetReplaySizeForIndex);
FRIEND_TEST(TestTabletStringKey, TestSplitKeyRange);
FRIEND_TEST(TestTabletStringKey, TestSplitKeyRangeWithZeroRowSets);
FRIEND_TEST(TestTabletStringKey, TestSplitKeyRangeWithOneRowSet);
FRIEND_TEST(TestTabletStringKey, TestSplitKeyRangeWithNonOverlappingRowSets);
FRIEND_TEST(TestTabletStringKey, TestSplitKeyRangeWithMinimumValueRowSet);
FRIEND_TEST(TxnParticipantTest, TestFlushMultipleMRSs);
// Lifecycle states that a Tablet can be in. Legal state transitions for a
// Tablet object:
//
// kInitialized -> kBootstrapping -> kOpen -> kStopped -> kShutdown
// | | | ^^^
// | | +--------+||
// | +-----------------------+|
// +----------------------------------------+
enum State {
kInitialized,
kBootstrapping,
kOpen,
kStopped,
kShutdown
};
// Sets the lifecycle state of the tablet. See the definition of State for
// the valid transitions.
//
// Must be called while 'state_lock_' is held.
void set_state_unlocked(State s) {
DCHECK(state_lock_.is_locked());
switch (s) {
case kBootstrapping:
DCHECK_EQ(kInitialized, state_);
break;
case kOpen:
DCHECK_EQ(kBootstrapping, state_);
break;
case kStopped:
DCHECK(state_ == kInitialized ||
state_ == kBootstrapping ||
state_ == kOpen);
break;
case kShutdown:
DCHECK(state_ == kStopped ||
state_ == kShutdown);
break;
default:
LOG(DFATAL) << "Illegal state transition!";
}
state_ = s;
}
// Returns an error if the tablet is in the 'kStopped' or 'kShutdown' state.
// Sets *cur_state to the current state, which may be useful for later assertions.
Status CheckHasNotBeenStopped(State* cur_state = nullptr) const;
Status FlushUnlocked();
// Validate the contents of 'op' and return a bad Status if it is invalid.
static Status ValidateOp(const RowOp& op);
// Validate 'op' as in 'ValidateOp()' above. If it is invalid, marks the op as failed
// and returns false. If valid, marks the op as valid and returns true.
static bool ValidateOpOrMarkFailed(RowOp* op);
// Validate the given insert/upsert operation.
static Status ValidateInsertOrUpsertUnlocked(const RowOp& op);
// Validate the given update/delete operation.
static Status ValidateMutateUnlocked(const RowOp& op);
// Perform an INSERT, INSERT_IGNORE, UPSERT, or UPSERT_IGNORE operation, assuming that the op is
// already in a prepared state. This state ensures that:
// - the row lock is acquired
// - the tablet components have been acquired
// - the operation has been decoded
Status InsertOrUpsertUnlocked(const fs::IOContext* io_context,
WriteOpState *op_state,
RowOp* op,
ProbeStats* stats);
// Same as above, but for UPDATE, UPDATE_IGNORE, DELETE, or DELETE_IGNORE operations.
Status MutateRowUnlocked(const fs::IOContext* io_context,
WriteOpState *op_state,
RowOp* op,
ProbeStats* stats);
// In the case of an UPSERT against a duplicate row, converts the UPSERT
// into an internal UPDATE operation and performs it.
Status ApplyUpsertAsUpdate(const fs::IOContext* io_context,
WriteOpState *op_state,
RowOp* upsert,
RowSet* rowset,
ProbeStats* stats);
// Return the list of RowSets that need to be consulted when processing the
// given insertion or mutation.
static std::vector<RowSet*> FindRowSetsToCheck(const RowOp* op,
const TabletComponents* comps);
// For each of the operations in 'op_state', check for the presence of their
// row keys in the RowSets in the current RowSetTree (as determined by the op's
// captured TabletComponents).
Status BulkCheckPresence(const fs::IOContext* io_context,
WriteOpState* op_state) WARN_UNUSED_RESULT;
// Capture a set of iterators which, together, reflect all of the data in the tablet.
//
// These iterators are not true snapshot iterators, but they are safe against
// concurrent modification. They will include all data that was present at the time
// of creation, and potentially newer data.
//
// The returned iterators are not Init()ed.
// The pointer fields of 'opts' must remain valid and unchanged for the
// lifetime of the returned iterators.
Status CaptureConsistentIterators(const RowIteratorOptions& opts,
const ScanSpec* spec,
std::vector<IterWithBounds>* iters) const;
Status PickRowSetsToCompact(RowSetsInCompaction *picked,
CompactFlags flags) const;
// Performs a merge compaction or a flush.
Status DoMergeCompactionOrFlush(const RowSetsInCompaction &input,
int64_t mrs_being_flushed,
const std::vector<TxnInfoBeingFlushed>& txns_being_flushed);
// Handle the case in which a compaction or flush yielded no output rows.
// In this case, we just need to remove the rowsets in 'rowsets' from the
// metadata and flush it.
Status HandleEmptyCompactionOrFlush(const RowSetVector& rowsets,
int mrs_being_flushed,
const std::vector<TxnInfoBeingFlushed>& txns_being_flushed);
// Updates the average rowset height metric. Acquires the tablet's
// compact_select_lock_.
void UpdateAverageRowsetHeight();
Status FlushMetadata(const RowSetVector& to_remove,
const RowSetMetadataVector& to_add,
int64_t mrs_being_flushed,
const std::vector<TxnInfoBeingFlushed>& txns_being_flushed);
static void ModifyRowSetTree(const RowSetTree& old_tree,
const RowSetVector& rowsets_to_remove,
const RowSetVector& rowsets_to_add,
RowSetTree* new_tree);
// Swap out a set of rowsets, atomically replacing them with the new rowset
// under the lock.
void AtomicSwapRowSets(const RowSetVector &to_remove,
const RowSetVector &to_add);
// Same as the above, but without taking the lock. This should only be used
// in cases where the lock is already held.
void AtomicSwapRowSetsUnlocked(const RowSetVector &to_remove,
const RowSetVector &to_add);
void GetComponents(scoped_refptr<TabletComponents>* comps) const {
shared_lock<rw_spinlock> l(component_lock_);
*comps = CHECK_NOTNULL(components_.get());
}
void GetComponentsOrNull(scoped_refptr<TabletComponents>* comps) const {
shared_lock<rw_spinlock> l(component_lock_);
*comps = components_;
}
// Create a new MemRowSet, replacing the current committed one(s).
// 'old_mrss' will be populated to the current committed MemRowSet(s) set
// before the replacement. If any MemRowSet is not empty it will be added to
// the 'compaction' input and the MemRowSets' compaction locks will be taken
// to prevent the inclusion in any concurrent compactions.
Status ReplaceMemRowSetsUnlocked(RowSetsInCompaction* compaction,
std::vector<std::shared_ptr<MemRowSet>>* old_mrss);
// Convert the specified read client schema (without IDs) to a server schema (with IDs)
// This method is used by NewRowIterator().
Status GetMappedReadProjection(const Schema& projection,
Schema *mapped_projection) const;
Status CheckRowInTablet(const ConstContiguousRow& row) const;
// Helper method to find how many bytes need to be replayed to restore in-memory
// state from this index.
static int64_t GetReplaySizeForIndex(int64_t min_log_index,
const ReplaySizeMap& size_map);
// The elapsed time, in seconds, since the last read operation on this tablet, or since this
// Tablet object was created on current tserver if it hasn't been read since then.
uint64_t LastReadElapsedSeconds() const;
// Same as LastReadElapsedSeconds(), but for write operation.
uint64_t LastWriteElapsedSeconds() const;
// Test-only lock that synchronizes access to AssignTimestampAndStartOpForTests().
// Tests that use LocalTabletWriter take this lock to synchronize timestamp assignment,
// op start, and safe time adjustment.
// NOTE: Should not be taken on non-test paths.
mutable simple_spinlock test_start_op_lock_;
// Lock protecting schema_ and key_schema_.
//
// Writers take this lock in shared mode before decoding and projecting
// their requests. They hold the lock until after APPLY.
//
// Readers take this lock in shared mode only long enough to copy the
// current schema into the iterator, after which all projection is taken
// care of based on that copy.
//
// On an AlterSchema, this is taken in exclusive mode during Prepare() and
// released after the schema change has been applied.
mutable rw_semaphore schema_lock_;
const Schema key_schema_;
scoped_refptr<TabletMetadata> metadata_;
// Lock protecting access to the 'components_' member (i.e the rowsets in the tablet)
//
// Shared mode:
// - Writers take this in shared mode at the same time as they obtain an MVCC timestamp
// and capture a reference to components_. This ensures that we can use the MVCC timestamp
// to determine which writers are writing to which components during compaction.
// - Readers take this in shared mode while capturing their iterators. This ensures that
// they see a consistent view when racing against flush/compact.
//
// Exclusive mode:
// - Flushes/compactions take this lock in order to lock out concurrent updates when
// swapping in a new memrowset.
//
// NOTE: callers should avoid taking this lock for a long time, even in shared mode.
// This is because the lock has some concept of fairness -- if, while a long reader
// is active, a writer comes along, then all future short readers will be blocked.
mutable rw_spinlock component_lock_;
// The components of the tablet whose base data has been committed. These
// should always be read or swapped under the component_lock.
scoped_refptr<TabletComponents> components_;
// Uncommitted transaction state.
std::unordered_map<int64_t, scoped_refptr<TxnRowSets>> uncommitted_rowsets_by_txn_id_;
scoped_refptr<log::LogAnchorRegistry> log_anchor_registry_;
TabletMemTrackers mem_trackers_;
scoped_refptr<MetricEntity> metric_entity_;
std::unique_ptr<TabletMetrics> metrics_;
std::unique_ptr<Throttler> throttler_;
int64_t next_mrs_id_;
// A pointer to the server's clock.
clock::Clock* clock_;
MvccManager mvcc_;
LockManager lock_manager_;
// Maintains the set of in-flight transactions, and any WAL anchors
// associated with them.
// NOTE: the participant may retain MVCC ops, so define it after the
// MvccManager, to ensure those ops get destructed before the MvccManager.
// The same goes for locks and the LockManager.
TxnParticipant txn_participant_;
std::unique_ptr<CompactionPolicy> compaction_policy_;
// Lock protecting the selection of rowsets for compaction.
// Only one thread may run the compaction selection algorithm at a time
// so that they don't both try to select the same rowset.
mutable std::mutex compact_select_lock_;
// We take this lock when flushing the tablet's rowsets in Tablet::Flush. We
// don't want to have two flushes in progress at once, in case the one which
// started earlier completes after the one started later.
mutable Semaphore rowsets_flush_sem_;
// Lock protecting access to mutate 'state_' and all access to 'maintenance_ops_'.
// If taken with any other locks, this must be taken last, i.e. no locks can
// be acquired while holding this lock.
mutable simple_spinlock state_lock_;
// Protected by state_lock_ for transitions, but may be read without holding a lock.
std::atomic<State> state_;
// Fake lock used to ensure calls to RegisterMaintenanceOps and
// UnregisterMaintenanceOps don't overlap. This serves to ensure that only
// one thread is updating the maintenance op list at a time.
DFAKE_MUTEX(maintenance_registration_fake_lock_);
// Fault hooks. In production code, these will always be NULL.
std::shared_ptr<CompactionFaultHooks> compaction_hooks_;
std::shared_ptr<FlushFaultHooks> flush_hooks_;
std::shared_ptr<FlushCompactCommonHooks> common_hooks_;
std::vector<MaintenanceOp*> maintenance_ops_;
// Lock protecting access to 'last_write_time_' and 'last_read_time_'.
mutable rw_spinlock last_rw_time_lock_;
MonoTime last_read_time_;
MonoTime last_write_time_;
// NOTE: it's important that this is the first member to be destructed. This
// ensures we do not attempt to collect metrics while calling the destructor.
FunctionGaugeDetacher metric_detacher_;
MonoTime last_update_workload_stats_time_;
int64_t last_scans_started_;
int64_t last_rows_mutated_;
double last_read_score_;
double last_write_score_;
DISALLOW_COPY_AND_ASSIGN(Tablet);
};
// Hooks used in test code to inject faults or other code into interesting
// parts of the compaction code.
class Tablet::CompactionFaultHooks {
public:
virtual Status PostSelectIterators() { return Status::OK(); }
virtual ~CompactionFaultHooks() {}
};
class Tablet::FlushCompactCommonHooks {
public:
virtual Status PostTakeMvccSnapshot() { return Status::OK(); }
virtual Status PostWriteSnapshot() { return Status::OK(); }
virtual Status PostSwapInDuplicatingRowSet() { return Status::OK(); }
virtual Status PostReupdateMissedDeltas() { return Status::OK(); }
virtual Status PostSwapNewRowSet() { return Status::OK(); }
virtual ~FlushCompactCommonHooks() {}
};
// Hooks used in test code to inject faults or other code into interesting
// parts of the Flush() code.
class Tablet::FlushFaultHooks {
public:
virtual Status PostSwapNewMemRowSet() { return Status::OK(); }
virtual ~FlushFaultHooks() {}
};
class Tablet::Iterator : public RowwiseIterator {
public:
virtual ~Iterator();
virtual Status Init(ScanSpec *spec) OVERRIDE;
virtual bool HasNext() const OVERRIDE;
virtual Status NextBlock(RowBlock *dst) OVERRIDE;
std::string ToString() const OVERRIDE;
const Schema &schema() const OVERRIDE;
virtual void GetIteratorStats(std::vector<IteratorStats>* stats) const OVERRIDE;
private:
friend class Tablet;
DISALLOW_COPY_AND_ASSIGN(Iterator);
// Instantiate iterator with given projection and options.
//
// Note: the Schema pointed to by the 'projection' field of the 'opts' struct
// will be copied into projection_, so that pointer only needs to remain
// valid during the call to the constructor and not after that.
// Similarly, the 'io_context' field of the 'opts' struct will be ignored and
// overwritten in the copy of the 'opts' struct used by this class because
// this class constructs and holds the relevant instance of that object as a
// member variable.
Iterator(const Tablet* tablet,
RowIteratorOptions opts);
const Tablet* tablet_;
fs::IOContext io_context_;
Schema projection_;
RowIteratorOptions opts_;
std::unique_ptr<RowwiseIterator> iter_;
};
// Structure which represents the components of the tablet's storage.
// This structure is immutable -- an op can grab it and be sure that it won't
// change.
struct TabletComponents : public RefCountedThreadSafe<TabletComponents> {
TabletComponents(std::shared_ptr<MemRowSet> mrs,
std::vector<std::shared_ptr<MemRowSet>> txn_mrss,
std::shared_ptr<RowSetTree> rs_tree);
// The "main" MemRowSet that catches inserts to the tablet that are not a
// part of any transaction.
const std::shared_ptr<MemRowSet> memrowset;
// MemRowSets whose insertion heads were inserted as a part of a transaction.
const std::vector<std::shared_ptr<MemRowSet>> txn_memrowsets;
// The persisted RowSets that comprise the rows of a tablet.
const std::shared_ptr<RowSetTree> rowsets;
};
// Encapsulates data inserted as a part of a transaction that has not yet been
// committed.
// TODO(awong): when we support flushing transactional MRSs before committing,
// track uncommitted disk rowsets here.
struct TxnRowSets : public RefCountedThreadSafe<TxnRowSets> {
explicit TxnRowSets(std::shared_ptr<MemRowSet> mrs)
: memrowset(std::move(mrs)) {}
const std::shared_ptr<MemRowSet> memrowset;
};
} // namespace tablet
} // namespace kudu