// blob: 3bacffa0d318cfd7c3dff908b4412c09d6e0184b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <gtest/gtest_prod.h>
#include "kudu/common/common.pb.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet_replica.h"
#include "kudu/tserver/tablet_copy_client.h"
#include "kudu/tserver/tablet_replica_lookup.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/tserver/tserver_admin.pb.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/locks.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/rw_mutex.h"
#include "kudu/util/status.h"
namespace boost {
template <class T>
class optional;
}
namespace kudu {
class FsManager;
class NodeInstancePB;
class Partition;
class PartitionSchema;
class Schema;
class ThreadPool;
namespace consensus {
class ConsensusMetadataManager;
class OpId;
class StartTabletCopyRequestPB;
} // namespace consensus
namespace master {
class ReportedTabletPB;
class TabletReportPB;
} // namespace master
namespace tablet {
class TabletMetadata;
}
namespace tserver {
class TabletServer;
// Associates a tablet id (key) with the reason string (value) describing the
// state transition currently in progress for that tablet.
using TransitionInProgressMap = std::unordered_map<std::string, std::string>;
// Associates a dimension label (key) with the number of tablets (value)
// counted for that dimension.
using TabletNumByDimensionMap = std::unordered_map<std::string, int32_t>;
class TransitionInProgressDeleter;
// Keeps track of the tablets hosted on the tablet server side.
//
// Implements the tserver::TabletReplicaLookupIf interface (see the methods
// marked 'override' below).
//
// TODO(todd): will also be responsible for keeping the local metadata about which
// tablets are hosted on this server persistent on disk, as well as re-opening all
// the tablets at startup, etc.
class TSTabletManager : public tserver::TabletReplicaLookupIf {
public:
// Construct the tablet manager.
explicit TSTabletManager(TabletServer* server);
virtual ~TSTabletManager();
// Load all tablet metadata blocks from disk, and open their respective tablets.
// Upon return of this method all existing tablets are registered, but
// the bootstrap is performed asynchronously.
Status Init();
// Waits for all the bootstraps to complete.
// Returns Status::OK if all tablets bootstrapped successfully. If
// the bootstrap of any tablet failed returns the failure reason for
// the first tablet whose bootstrap failed.
Status WaitForAllBootstrapsToFinish();
// Shut down all of the tablets, gracefully flushing before shutdown.
void Shutdown();
// Create a new tablet and register it with the tablet manager. The new tablet
// is persisted on disk and opened before this method returns.
//
// If 'replica' is non-NULL, the newly created tablet will be returned.
//
// If another tablet already exists with this ID, logs a DFATAL
// and returns a bad Status.
Status CreateNewTablet(const std::string& table_id,
const std::string& tablet_id,
const Partition& partition,
const std::string& table_name,
const Schema& schema,
const PartitionSchema& partition_schema,
consensus::RaftConfigPB config,
boost::optional<TableExtraConfigPB> extra_config,
boost::optional<std::string> dimension_label,
boost::optional<TableTypePB> table_type,
scoped_refptr<tablet::TabletReplica>* replica);
// Delete the specified tablet asynchronously with callback 'cb'.
// - If the async task cannot be started, 'cb' will be called with
// Status::ServiceUnavailable and TabletServerErrorPB::THROTTLED.
// - 'delete_type' must be one of TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED.
// - 'cas_config_index' is optionally specified to enable an
// atomic DeleteTablet operation that only occurs if the latest committed
// Raft config change op has an opid_index equal to or less than the specified
// value. If not, the callback is called with a non-OK Status and error code
// CAS_FAILED.
void DeleteTabletAsync(const std::string& tablet_id,
tablet::TabletDataState delete_type,
const boost::optional<int64_t>& cas_config_index,
const std::function<void(const Status&, TabletServerErrorPB::Code)>& cb);
// Delete the specified tablet synchronously.
// See DeleteTabletAsync() for more information.
//
// 'error_code' is optional (defaults to nullptr).
// NOTE(review): presumably it receives the tablet server error code on
// failure when non-null; confirm against the definition.
Status DeleteTablet(const std::string& tablet_id,
tablet::TabletDataState delete_type,
const boost::optional<int64_t>& cas_config_index,
TabletServerErrorPB::Code* error_code = nullptr);
// Lookup the given tablet replica by its ID.
// Returns true if the tablet is found successfully.
bool LookupTablet(const std::string& tablet_id,
scoped_refptr<tablet::TabletReplica>* replica) const;
// Same as LookupTablet() but doesn't acquire the shared lock.
bool LookupTabletUnlocked(const std::string& tablet_id,
scoped_refptr<tablet::TabletReplica>* replica) const;
// Override of the tserver::TabletReplicaLookupIf method: Status-returning
// lookup of the replica with the given 'tablet_id'.
virtual Status GetTabletReplica(const std::string& tablet_id,
scoped_refptr<tablet::TabletReplica>* replica) const
override;
// Override of the tserver::TabletReplicaLookupIf method.
// NOTE(review): presumably returns the instance descriptor of the hosting
// tablet server; confirm against the definition.
virtual const NodeInstancePB& NodeInstance() const override;
// Initiate tablet copy of the specified tablet on the tablet_copy_pool_.
// See the StartTabletCopy() RPC declaration in consensus.proto for details.
// 'cb' is guaranteed to be invoked as a callback.
virtual void StartTabletCopy(
const consensus::StartTabletCopyRequestPB* req,
std::function<void(const Status&, TabletServerErrorPB::Code)> cb) override;
// Synchronously run the tablet copy procedure.
void RunTabletCopy(
const consensus::StartTabletCopyRequestPB* req,
std::function<void(const Status&, TabletServerErrorPB::Code)> cb);
// Adds updated tablet information to 'report'.
void PopulateFullTabletReport(master::TabletReportPB* report) const;
// Adds updated tablet information to 'report'. Only tablets in 'tablet_ids'
// are included.
void PopulateIncrementalTabletReport(master::TabletReportPB* report,
const std::vector<std::string>& tablet_ids) const;
// Get all of the tablets currently hosted on this server.
virtual void GetTabletReplicas(
std::vector<scoped_refptr<tablet::TabletReplica> >* replicas) const override;
// Marks tablet with 'tablet_id' as dirty so that it'll be included in the
// next round of master heartbeats.
//
// Dirtying events typically include state changes outside of the control of
// TSTabletManager, such as consensus role changes.
void MarkTabletDirty(const std::string& tablet_id, const std::string& reason);
// Marks tablets as dirty in batch.
void MarkTabletsDirty(const std::vector<std::string>& tablet_ids, const std::string& reason);
// Return the number of tablets in RUNNING or BOOTSTRAPPING state.
int GetNumLiveTablets() const;
// Get the number of tablets in RUNNING or BOOTSTRAPPING state in each dimension.
TabletNumByDimensionMap GetNumLiveTabletsByDimension() const;
// NOTE(review): undocumented in the original; presumably runs log garbage
// collection for every tablet replica and returns the resulting status --
// confirm against the definition.
Status RunAllLogGC();
// Delete the tablet using the specified delete_type as the final metadata
// state. Deletes the on-disk data, metadata, as well as all WAL segments.
//
// If set, 'last_logged_opid' will be persisted in the
// 'tombstone_last_logged_opid' field in the tablet metadata. Otherwise, if
// 'last_logged_opid' is equal to boost::none, the tablet metadata will
// retain its previous value of 'tombstone_last_logged_opid', if any.
static Status DeleteTabletData(
const scoped_refptr<tablet::TabletMetadata>& meta,
const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
tablet::TabletDataState delete_type,
boost::optional<consensus::OpId> last_logged_opid);
// Synchronously makes the specified tablet unavailable for further I/O and
// schedules its asynchronous shutdown.
void FailTabletAndScheduleShutdown(const std::string& tablet_id);
// Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
void FailTabletsInDataDir(const std::string& uuid);
// Refresh the cached counts of tablet states, if the cache is old enough,
// and return the count for tablet state 'st'.
int RefreshTabletStateCacheAndReturnCount(tablet::TabletStatePB st);
// Wait a period up to 'timeout' for there to be no tablet state transitions
// registered with the tablet manager.
// This method is for use in tests only. See KUDU-2444.
Status WaitForNoTransitionsForTests(const MonoDelta& timeout) const;
// Update the tablet statistics if necessary.
void UpdateTabletStatsIfNecessary();
private:
// Tests that need access to private members.
FRIEND_TEST(LeadershipChangeReportingTest, TestReportStatsDuringLeadershipChange);
FRIEND_TEST(TsTabletManagerTest, TestPersistBlocks);
FRIEND_TEST(TsTabletManagerTest, TestTabletStatsReports);
FRIEND_TEST(TsTabletManagerITest, TestTableStats);
// Flag specified when registering a TabletReplica.
// See RegisterTablet() for the semantics of each value.
enum RegisterTabletReplicaMode {
NEW_REPLICA,
REPLACEMENT_REPLICA
};
// Standard log prefix, given a tablet id.
static std::string LogPrefix(const std::string& tablet_id, FsManager *fs_manager);
// Convenience overload that forwards to the static LogPrefix() using this
// manager's fs_manager_.
std::string LogPrefix(const std::string& tablet_id) const {
return LogPrefix(tablet_id, fs_manager_);
}
// Returns Status::OK() iff state_ == MANAGER_RUNNING.
Status CheckRunningUnlocked(TabletServerErrorPB::Code* error_code) const;
// Registers the start of a tablet state transition by inserting the tablet
// id and reason string into the transition_in_progress_ map.
// 'reason' is a string included in the Status return when there is
// contention indicating why the tablet is currently already transitioning.
// Returns IllegalState if the tablet is already "locked" for a state
// transition by some other operation.
// On success, returns OK and populates 'deleter' with an object that removes
// the map entry on destruction.
Status StartTabletStateTransitionUnlocked(const std::string& tablet_id,
const std::string& reason,
scoped_refptr<TransitionInProgressDeleter>* deleter);
// Marks the replica indicated by 'tablet_id' as being in a transitional
// state. Returns an error status if the replica is already in a transitional
// state.
Status BeginReplicaStateTransition(const std::string& tablet_id,
const std::string& reason,
scoped_refptr<tablet::TabletReplica>* replica,
scoped_refptr<TransitionInProgressDeleter>* deleter,
TabletServerErrorPB::Code* error_code);
// Open a tablet meta from the local file system by loading its superblock.
Status OpenTabletMeta(const std::string& tablet_id,
scoped_refptr<tablet::TabletMetadata>* metadata);
// Open a tablet whose metadata has already been loaded/created.
// This method does not return anything as it can be run asynchronously.
// Upon completion of this method the tablet should be initialized and running.
// If something wrong happened on bootstrap/initialization the relevant error
// will be set on TabletReplica along with the state set to FAILED.
//
// The tablet must be registered and an entry corresponding to this tablet
// must be put into the transition_in_progress_ map before calling this
// method. A TransitionInProgressDeleter must be passed as 'deleter' into
// this method in order to remove that transition-in-progress entry when
// opening the tablet is complete (in either a success or a failure case).
void OpenTablet(const scoped_refptr<tablet::TabletReplica>& replica,
const scoped_refptr<TransitionInProgressDeleter>& deleter);
// Open a tablet whose metadata has already been loaded.
void BootstrapAndInitTablet(const scoped_refptr<tablet::TabletMetadata>& meta,
scoped_refptr<tablet::TabletReplica>* replica);
// Add the tablet to the tablet map.
// 'mode' specifies whether to expect an existing tablet to exist in the map.
// If mode == NEW_REPLICA but a tablet with the same name is already registered,
// or if mode == REPLACEMENT_REPLICA but a tablet with the same name is not
// registered, a FATAL message is logged, causing a process crash.
// Calls to this method are expected to be externally synchronized, typically
// using the transition_in_progress_ map.
void RegisterTablet(const std::string& tablet_id,
const scoped_refptr<tablet::TabletReplica>& replica,
RegisterTabletReplicaMode mode);
// Create and register a new TabletReplica, given tablet metadata.
// Calls RegisterTablet() with the given 'mode' parameter after constructing
// the TabletReplica object. See RegisterTablet() for details about the
// semantics of 'mode' and the locking requirements.
Status CreateAndRegisterTabletReplica(scoped_refptr<tablet::TabletMetadata> meta,
RegisterTabletReplicaMode mode,
scoped_refptr<tablet::TabletReplica>* replica_out);
// Helper to generate the report for a single tablet.
void CreateReportedTabletPB(const scoped_refptr<tablet::TabletReplica>& replica,
master::ReportedTabletPB* reported_tablet) const;
// Handle the case on startup where we find a tablet that is not in
// TABLET_DATA_READY state. Generally, we tombstone the replica.
Status HandleNonReadyTabletOnStartup(const scoped_refptr<tablet::TabletMetadata>& meta);
// Return Status::IllegalState if leader_term < last_logged_term.
// Helper function for use with tablet copy.
Status CheckLeaderTermNotLower(const std::string& tablet_id,
int64_t leader_term,
int64_t last_logged_term);
// Returns the current manager state, read while holding lock_ in shared
// (reader) mode.
TSTabletManagerStatePB state() const {
shared_lock<RWMutex> l(lock_);
return state_;
}
// Initializes the RaftPeerPB for the local peer.
// Guaranteed to include both uuid and last_seen_addr fields.
// Crashes with an invariant check if the RPC server is not currently in a
// running state.
void InitLocalRaftPeerPB();
// A task to check for the staleness of transactions registered with
// corresponding transaction status tablets (if any).
void TxnStalenessTrackerTask();
// Just for tests.
void SetNextUpdateTimeForTests();
// Filesystem manager; passed to the static LogPrefix() when building log
// prefixes.
FsManager* const fs_manager_;
// NOTE(review): presumably the consensus metadata manager shared with the
// tablet replicas created here; confirm against the constructor.
const scoped_refptr<consensus::ConsensusMetadataManager> cmeta_manager_;
// The tablet server passed to the constructor.
// NOTE(review): ownership/lifetime is not visible from this header.
TabletServer* server_;
// NOTE(review): presumably the RaftPeerPB populated by InitLocalRaftPeerPB();
// confirm against the definition.
consensus::RaftPeerPB local_peer_pb_;
typedef std::unordered_map<std::string, scoped_refptr<tablet::TabletReplica> > TabletMap;
// Lock protecting tablet_map_, dirty_tablets_, state_,
// transition_in_progress_, perm_deleted_tablet_ids_,
// tablet_state_counts_, and last_walked_.
//
// NOTE(review): no member named 'dirty_tablets_' is declared in this class;
// that entry in the list above may be stale.
mutable RWMutex lock_;
// A latch to notify the task running on the txn_status_manager_pool_ on
// shutdown.
//
// TODO(aserbin): instead of using CountDownLatch, extend ConditionVariable
// to be able to work with RWMutex and use lock_ from above
// to create one to notify the task on shutdown
CountDownLatch shutdown_latch_;
// Map from tablet ID to tablet replica.
TabletMap tablet_map_;
// Permanently deleted tablet ids. If a tablet is removed with status
// TABLET_DATA_DELETED then it is added to this set (until the next process
// restart).
std::unordered_set<std::string> perm_deleted_tablet_ids_;
// Map of tablet ids -> reason strings where the keys are tablets whose
// bootstrap, creation, or deletion is in-progress.
TransitionInProgressMap transition_in_progress_;
// NOTE(review): presumably the registry that tablet metrics are registered
// with; ownership is not visible from this header.
MetricRegistry* metric_registry_;
// Metrics for tablet copy client operations.
// NOTE(review): how these are wired into tablet copy sessions is defined
// outside this header.
TabletCopyClientMetrics tablet_copy_metrics_;
// Timestamp indicating the last time tablet_map_ was walked to count
// tablet states.
MonoTime last_walked_ = MonoTime::Min();
// Holds cached tablet states from tablet_map_.
std::map<tablet::TabletStatePB, int> tablet_state_counts_;
// Current state of the manager. Protected by lock_; read through state().
TSTabletManagerStatePB state_;
// Thread pool used to run tablet copy operations.
std::unique_ptr<ThreadPool> tablet_copy_pool_;
// Thread pool used to open the tablets async, whether bootstrap is required or not.
std::unique_ptr<ThreadPool> open_tablet_pool_;
// Thread pool used to delete tablets asynchronously.
std::unique_ptr<ThreadPool> delete_tablet_pool_;
// Thread pool to run TxnStatusManager tasks. As of now, this pool is
// to run a long-running single periodic task to abort stale transactions
// registered with corresponding transaction status tablets.
std::unique_ptr<ThreadPool> txn_status_manager_pool_;
// Ensures that we only update stats from a single thread at a time.
mutable rw_spinlock lock_update_;
// NOTE(review): presumably the earliest time at which
// UpdateTabletStatsIfNecessary() performs its next update; see also
// SetNextUpdateTimeForTests(). Confirm against the definitions.
MonoTime next_update_time_;
// NOTE: it's important that this is the first member to be destructed. This
// ensures we do not attempt to collect metrics while calling the destructor.
// Non-static data members are destroyed in reverse order of declaration, so
// this must remain the last data member declared in the class.
FunctionGaugeDetacher metric_detacher_;
DISALLOW_COPY_AND_ASSIGN(TSTabletManager);
};
// Helper to delete the transition-in-progress entry from the corresponding map
// when tablet bootstrap, create, and delete operations complete.
//
// Instances are reference-counted (RefCountedThreadSafe) and are handed out
// via scoped_refptr, e.g. by
// TSTabletManager::StartTabletStateTransitionUnlocked().
class TransitionInProgressDeleter : public RefCountedThreadSafe<TransitionInProgressDeleter> {
public:
// 'map' is the transition-in-progress map holding 'entry', and 'lock' is the
// RWMutex associated with that map. Both are stored as raw pointers.
// NOTE(review): both presumably must outlive this object; confirm at the
// call sites.
TransitionInProgressDeleter(TransitionInProgressMap* map, RWMutex* lock,
std::string entry);
// NOTE(review): presumably removes the entry from the map ahead of
// destruction, with is_destroyed_ preventing a second removal in the
// destructor; confirm against the definition.
void Destroy();
private:
// The destructor is private; RefCountedThreadSafe is befriended so that the
// reference-counting base can destroy the object.
friend class RefCountedThreadSafe<TransitionInProgressDeleter>;
~TransitionInProgressDeleter();
// The map from which the entry is removed.
TransitionInProgressMap* const in_progress_;
// The lock passed to the constructor alongside the map.
RWMutex* const lock_;
// The key (tablet id) of the map entry managed by this object.
const std::string entry_;
// NOTE(review): presumably set once the entry has been removed; it has no
// in-class initializer, so it must be initialized by the constructor.
bool is_destroyed_;
};
} // namespace tserver
} // namespace kudu