blob: 347376e70259a82006434c84f66caa082f7e990c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <atomic>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <gtest/gtest_prod.h>
#include "kudu/common/timestamp.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/util/locks.h"
#include "kudu/util/status.h"
namespace kudu {
class CountDownLatch;
class MonoTime;
namespace tablet {
class MvccManager;
// A snapshot of the current MVCC state, which can determine whether
// a transaction ID should be considered visible.
class MvccSnapshot {
public:
MvccSnapshot();
// Create a snapshot with the current state of the given manager
explicit MvccSnapshot(const MvccManager &manager);
// Create a snapshot at a specific Timestamp.
//
// This snapshot considers all transactions with lower timestamps to
// be committed, and those with higher timestamps to be uncommitted.
explicit MvccSnapshot(const Timestamp& timestamp);
// Create a snapshot which considers all transactions as committed.
// This is mostly useful in test contexts.
static MvccSnapshot CreateSnapshotIncludingAllTransactions();
// Creates a snapshot which considers no transactions committed.
static MvccSnapshot CreateSnapshotIncludingNoTransactions();
// Return true if the given transaction ID should be considered committed
// in this snapshot.
inline bool IsCommitted(const Timestamp& timestamp) const {
// Inline the most likely path, in which our watermarks determine
// whether a transaction is committed.
if (PREDICT_TRUE(timestamp < all_committed_before_)) {
return true;
}
if (PREDICT_TRUE(timestamp >= none_committed_at_or_after_)) {
return false;
}
// Out-of-line the unlikely case which involves more complex (loopy) code.
return IsCommittedFallback(timestamp);
}
// Returns true if this snapshot may have any committed transactions with ID
// equal to or higher than the provided 'timestamp'.
// This is mostly useful to avoid scanning REDO deltas in certain cases.
// If MayHaveCommittedTransactionsAtOrAfter(delta_stats.min) returns true
// it means that there might be transactions that need to be applied in the
// context of this snapshot; otherwise no scanning is necessary.
bool MayHaveCommittedTransactionsAtOrAfter(const Timestamp& timestamp) const;
// Returns true if this snapshot may have any uncommitted transactions with ID
// equal to or lower than the provided 'timestamp'.
// This is mostly useful to avoid scanning UNDO deltas in certain cases.
// If MayHaveUncommittedTransactionsAtOrBefore(delta_stats.max) returns false it
// means that all UNDO delta transactions are committed in the context of this
// snapshot and no scanning is necessary; otherwise there might be some
// transactions that need to be undone.
bool MayHaveUncommittedTransactionsAtOrBefore(const Timestamp& timestamp) const;
// Return a string representation of the set of committed transactions
// in this snapshot, suitable for debug printouts.
std::string ToString() const;
// Return true if the snapshot is considered 'clean'. A clean snapshot is one
// which is determined only by a timestamp -- the snapshot considers all
// transactions with timestamps less than some timestamp to be committed,
// and all other transactions to be uncommitted.
bool is_clean() const {
return committed_timestamps_.empty();
}
// Consider the given list of timestamps to be committed in this snapshot,
// even if they weren't when the snapshot was constructed.
// This is used in the flush path, where the set of commits going into a
// flushed file may not be a consistent snapshot from the MVCC point of view,
// yet we need to construct a scanner that accurately represents that set.
void AddCommittedTimestamps(const std::vector<Timestamp>& timestamps);
private:
friend class MvccManager;
FRIEND_TEST(MvccTest, TestMayHaveCommittedTransactionsAtOrAfter);
FRIEND_TEST(MvccTest, TestMayHaveUncommittedTransactionsBefore);
FRIEND_TEST(MvccTest, TestWaitUntilAllCommitted_SnapAtTimestampWithInFlights);
FRIEND_TEST(MvccTest, TestCorrectInitWithNoTxns);
bool IsCommittedFallback(const Timestamp& timestamp) const;
void AddCommittedTimestamp(Timestamp timestamp);
// Summary rule:
// A transaction T is committed if and only if:
// T < all_committed_before_ or
// or committed_timestamps_.contains(T)
//
// In ASCII form, where 'C' represents a committed transaction,
// and 'U' represents an uncommitted one:
//
// CCCCCCCCCCCCCCCCCUUUUUCUUUCU
// | \___\___ committed_timestamps_
// |
// \- all_committed_before_
// A transaction ID below which all transactions have been committed.
// For any timestamp X, if X < all_committed_timestamp_, then X is committed.
Timestamp all_committed_before_;
// A transaction ID at or beyond which no transactions have been committed.
// For any timestamp X, if X >= none_committed_after_, then X is uncommitted.
// This is equivalent to max(committed_timestamps_) + 1, but since
// that vector is unsorted, we cache it.
Timestamp none_committed_at_or_after_;
// The set of transactions higher than all_committed_before_timestamp_ which
// are committed in this snapshot.
// It might seem like using an unordered_set<> or a set<> would be faster here,
// but in practice, this list tends to be stay pretty small, and is only
// rarely consulted (most data will be culled by 'all_committed_before_'
// or none_committed_at_or_after_. So, using the compact vector structure fits
// the whole thing on one or two cache lines, and it ends up going faster.
std::vector<Timestamp::val_type> committed_timestamps_;
};
// Coordinator of MVCC transactions. Threads wishing to make updates use
// the MvccManager to obtain a unique timestamp, usually through the ScopedTransaction
// class defined below.
//
// MVCC is used to defer updates until commit time, and allow iterators to
// operate on a snapshot which contains only committed transactions.
//
// There are two valid paths for a transaction:
//
// 1) StartTransaction() -> StartApplyingTransaction() -> CommitTransaction()
// or
// 2) StartTransaction() -> AbortTransaction()
//
// When a transaction is ready to start making changes to in-memory data,
// it should transition to APPLYING state by calling StartApplyingTransaction().
// At this point, the transaction should apply its in-memory operations and
// must commit in a bounded amount of time (i.e it should not wait on external
// input such as an RPC from another host).
//
// NOTE: we do not support "rollback" of in-memory edits. Thus, once we call
// StartApplyingTransaction(), the transaction _must_ commit.
//
// See: docs/design_docs/repeatable-reads.md for more information on some of the concepts in
// this class like "clean" and "safe" time.
class MvccManager {
public:
MvccManager();
// Returns an error if the current snapshot has not been adjusted past its
// initial state. While in this state, it is unsafe for the MvccManager to
// serve information about already-applied transactions.
Status CheckIsSafeTimeInitialized() const;
// Begins a new transaction, which is assigned the provided timestamp.
//
// Requires that 'timestamp' is not committed.
// Requires that 'timestamp' is greater than 'safe_time'.
void StartTransaction(Timestamp timestamp);
// Mark that the transaction with the given timestamp is starting to apply
// its writes to in-memory stores. This must be called before CommitTransaction().
// If this is called, then AbortTransaction(timestamp) must never be called.
void StartApplyingTransaction(Timestamp timestamp);
// Abort the given transaction.
//
// If the transaction is not currently in-flight, this will trigger an
// assertion error. It is an error to abort the same transaction more
// than once.
//
// This makes sure that the transaction with 'timestamp' is removed from
// the in-flight set.
//
// The transaction must not have been marked as 'APPLYING' by calling
// StartApplyingTransaction(), or else this logs a FATAL error.
void AbortTransaction(Timestamp timestamp);
// Commit the given transaction.
//
// If the transaction is not currently in-flight, this will trigger an
// assertion error. It is an error to commit the same transaction more
// than once.
//
// The transaction must already have been marked as 'APPLYING' by calling
// StartApplyingTransaction(), or else this logs a FATAL error.
void CommitTransaction(Timestamp timestamp);
// Adjusts the safe time so that the MvccManager can trim state, provided
// 'safe_time' is higher than the current safe time.
//
// This must only be called when there is a guarantee that there won't be
// any more transactions with timestamps equal to or lower than 'safe_time'.
//
// TODO(dralves) Until leader leases is implemented this should only be called
// with the timestamps of consensus committed transactions, not with the safe
// time received from the leader (which can go back without leader leases).
void AdjustSafeTime(Timestamp safe_time);
// Take a snapshot of the current MVCC state, which indicates which
// transactions have been committed at the time of this call.
void TakeSnapshot(MvccSnapshot *snapshot) const;
// Take a snapshot of the MVCC state at 'timestamp' (i.e which includes
// all transactions which have a lower timestamp)
//
// If there are any in-flight transactions at a lower timestamp, waits for
// them to complete before returning.
//
// If 'timestamp' was marked safe before the call to this method (e.g. by TimeManager)
// then the returned snapshot is repeatable.
Status WaitForSnapshotWithAllCommitted(Timestamp timestamp,
MvccSnapshot* snapshot,
const MonoTime& deadline) const WARN_UNUSED_RESULT;
// Wait for all operations that are currently APPLYING to commit.
//
// NOTE: this does _not_ guarantee that no transactions are APPLYING upon
// return -- just that those that were APPLYING at call time are finished
// upon return.
//
// Returns Status::Aborted() if MVCC closed while waiting.
Status WaitForApplyingTransactionsToCommit() const WARN_UNUSED_RESULT;
bool AreAllTransactionsCommitted(Timestamp ts) const;
// Return the number of transactions in flight..
int CountTransactionsInFlight() const;
// Returns the earliest possible timestamp for an uncommitted transaction.
// All timestamps before this one are guaranteed to be committed.
Timestamp GetCleanTimestamp() const;
// Return the timestamps of all transactions which are currently 'APPLYING'
// (i.e. those which have started to apply their operations to in-memory data
// structures). Other transactions may have reserved their timestamps via
// StartTransaction() but not yet been applied.
//
// These transactions are guaranteed to eventually Commit() -- i.e. they will
// never Abort().
void GetApplyingTransactionsTimestamps(std::vector<Timestamp>* timestamps) const;
// Closes the MVCC manager. New transactions will not start, in-flight
// transactions will exit early on a best-effort basis, and waiting threads
// will return Status::Aborted().
void Close();
// Returns whether MVCC is open and available to manage transactions.
bool is_open() const {
return open_.load();
}
~MvccManager();
private:
friend class MvccTest;
FRIEND_TEST(MvccTest, TestAreAllTransactionsCommitted);
FRIEND_TEST(MvccTest, TestTxnAbort);
FRIEND_TEST(MvccTest, TestAutomaticCleanTimeMoveToSafeTimeOnCommit);
FRIEND_TEST(MvccTest, TestWaitForApplyingTransactionsToCommit);
FRIEND_TEST(MvccTest, TestWaitForCleanSnapshot_SnapAfterSafeTimeWithInFlights);
FRIEND_TEST(MvccTest, TestDontWaitAfterClose);
enum TxnState {
RESERVED,
APPLYING
};
bool InitTransactionUnlocked(const Timestamp& timestamp);
// TODO(dralves) ponder merging these since the new ALL_COMMITTED path no longer
// waits for the clean timestamp.
enum WaitFor {
ALL_COMMITTED,
NONE_APPLYING
};
struct WaitingState {
Timestamp timestamp;
CountDownLatch* latch;
WaitFor wait_for;
};
// Returns an error if the MVCC manager is closed.
Status CheckOpen() const;
// Returns true if all transactions before the given timestamp are committed.
//
// If 'ts' is not in the past, it's still possible that new transactions could
// start with a lower timestamp after this returns.
bool AreAllTransactionsCommittedUnlocked(Timestamp ts) const;
// Return true if there is any APPLYING operation with a timestamp
// less than or equal to 'ts'.
bool AnyApplyingAtOrBeforeUnlocked(Timestamp ts) const;
// Waits until all transactions before the given time are committed.
Status WaitUntil(WaitFor wait_for, Timestamp ts,
const MonoTime& deadline) const WARN_UNUSED_RESULT;
// Return true if the condition that the given waiter is waiting on has
// been achieved.
bool IsDoneWaitingUnlocked(const WaitingState& waiter) const;
// Commits the given transaction.
// Sets *was_earliest to true if this was the earliest in-flight transaction.
void CommitTransactionUnlocked(Timestamp timestamp,
bool* was_earliest_in_flight);
// Remove the timestamp 'ts' from the in-flight map.
// FATALs if the ts is not in the in-flight map.
// Returns its state.
TxnState RemoveInFlightAndGetStateUnlocked(Timestamp ts);
// Adjusts the clean time, i.e. the timestamp such that all transactions with
// lower timestamps are committed or aborted, based on which transactions are
// currently in flight and on what is the latest value of 'safe_time_'.
void AdjustCleanTime();
// Advances the earliest in-flight timestamp, based on which transactions are
// currently in-flight. Usually called when the previous earliest transaction
// commits or aborts.
void AdvanceEarliestInFlightTimestamp();
int GetNumWaitersForTests() const {
std::lock_guard<simple_spinlock> l(lock_);
return waiters_.size();
}
typedef simple_spinlock LockType;
mutable LockType lock_;
MvccSnapshot cur_snap_;
// The set of timestamps corresponding to currently in-flight transactions.
typedef std::unordered_map<Timestamp::val_type, TxnState> InFlightMap;
InFlightMap timestamps_in_flight_;
// A transaction timestamp below which all transactions are either committed or in-flight,
// meaning no new transactions will be started with a timestamp that is equal
// to or lower than this one.
Timestamp safe_time_;
// The minimum timestamp in timestamps_in_flight_, or Timestamp::kMax
// if that set is empty. This is cached in order to avoid having to iterate
// over timestamps_in_flight_ on every commit.
Timestamp earliest_in_flight_;
mutable std::vector<WaitingState*> waiters_;
std::atomic<bool> open_;
DISALLOW_COPY_AND_ASSIGN(MvccManager);
};
// A scoped handle to a running transaction.
// When this object goes out of scope, the transaction is automatically
// committed.
class ScopedTransaction {
public:
// Create a new transaction from the given MvccManager.
//
// When this transaction is committed it will use MvccManager::CommitTransaction().
ScopedTransaction(MvccManager* manager, Timestamp timestamp);
// Commit the transaction referenced by this scoped object, if it hasn't
// already been committed.
~ScopedTransaction();
Timestamp timestamp() const {
return timestamp_;
}
// Mark that this transaction is about to begin applying its modifications to
// in-memory stores.
//
// This must be called before Commit(). Abort() may not be called after this
// method.
void StartApplying();
// Commit the in-flight transaction.
//
// Requires that StartApplying() has been called.
void Commit();
// Abort the in-flight transaction.
//
// Requires that StartApplying() has NOT been called.
void Abort();
private:
bool done_;
MvccManager * const manager_;
const Timestamp timestamp_;
DISALLOW_COPY_AND_ASSIGN(ScopedTransaction);
};
} // namespace tablet
} // namespace kudu