| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| #ifndef KUDU_MASTER_CATALOG_MANAGER_H |
| #define KUDU_MASTER_CATALOG_MANAGER_H |
| |
| #include <boost/optional/optional_fwd.hpp> |
| #include <map> |
| #include <set> |
| #include <string> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <vector> |
| |
| #include "kudu/common/partition.h" |
| #include "kudu/gutil/macros.h" |
| #include "kudu/gutil/ref_counted.h" |
| #include "kudu/master/master.pb.h" |
| #include "kudu/master/ts_manager.h" |
| #include "kudu/server/monitored_task.h" |
| #include "kudu/tserver/tablet_peer_lookup.h" |
| #include "kudu/util/cow_object.h" |
| #include "kudu/util/locks.h" |
| #include "kudu/util/monotime.h" |
| #include "kudu/util/oid_generator.h" |
| #include "kudu/util/promise.h" |
| #include "kudu/util/random.h" |
| #include "kudu/util/rw_mutex.h" |
| #include "kudu/util/status.h" |
| |
| namespace kudu { |
| |
| class Schema; |
| class ThreadPool; |
| class CreateTableStressTest_TestConcurrentCreateTableAndReloadMetadata_Test; |
| |
| namespace rpc { |
| class RpcContext; |
| } // namespace rpc |
| |
| namespace master { |
| |
| class CatalogManagerBgTasks; |
| class Master; |
| class SysCatalogTable; |
| class TableInfo; |
| class TSDescriptor; |
| |
| struct DeferredAssignmentActions; |
| |
| // The data related to a tablet which is persisted on disk. |
| // This portion of TableInfo is managed via CowObject. |
| // It wraps the underlying protobuf to add useful accessors. |
| struct PersistentTabletInfo { |
| bool is_running() const { |
| return pb.state() == SysTabletsEntryPB::RUNNING; |
| } |
| |
| bool is_deleted() const { |
| return pb.state() == SysTabletsEntryPB::REPLACED || |
| pb.state() == SysTabletsEntryPB::DELETED; |
| } |
| |
| // Helper to set the state of the tablet with a custom message. |
| // Requires that the caller has prepared this object for write. |
| // The change will only be visible after Commit(). |
| void set_state(SysTabletsEntryPB::State state, const std::string& msg); |
| |
| SysTabletsEntryPB pb; |
| }; |
| |
| // The information about a single tablet which exists in the cluster, |
| // |
| // This object uses copy-on-write for the portions of data which are persisted |
| // on disk. This allows the mutated data to be staged and written to disk |
| // while readers continue to access the previous version. These portions |
| // of data are in PersistentTableInfo above, and typically accessed using |
| // TabletMetadataLock. For example: |
| // |
| // TabletInfo* table = ...; |
| // TabletMetadataLock l(tablet, TableMetadataLock::READ); |
| // if (l.data().is_running()) { ... } |
| // |
| // The non-persistent information about the tablet is protected by an internal |
| // spin-lock. |
| // |
| // The object is owned/managed by the CatalogManager, and exposed for testing. |
| class TabletInfo : public RefCountedThreadSafe<TabletInfo> { |
| public: |
| typedef PersistentTabletInfo cow_state; |
| |
| TabletInfo(const scoped_refptr<TableInfo>& table, std::string tablet_id); |
| |
| const std::string& tablet_id() const { return tablet_id_; } |
| const scoped_refptr<TableInfo>& table() const { return table_; } |
| |
| // Access the persistent metadata. Typically you should use |
| // TabletMetadataLock to gain access to this data. |
| const CowObject<PersistentTabletInfo>& metadata() const { return metadata_; } |
| CowObject<PersistentTabletInfo>* mutable_metadata() { return &metadata_; } |
| |
| // Accessors for the last time create tablet RPCs were sent for this tablet. |
| void set_last_create_tablet_time(const MonoTime& ts); |
| MonoTime last_create_tablet_time() const; |
| |
| // Accessors for the last reported schema version |
| bool set_reported_schema_version(uint32_t version); |
| uint32_t reported_schema_version() const; |
| |
| // No synchronization needed. |
| std::string ToString() const; |
| |
| private: |
| friend class RefCountedThreadSafe<TabletInfo>; |
| ~TabletInfo(); |
| |
| const std::string tablet_id_; |
| const scoped_refptr<TableInfo> table_; |
| |
| CowObject<PersistentTabletInfo> metadata_; |
| |
| // Lock protecting the below mutable fields. |
| // This doesn't protect metadata_ (the on-disk portion). |
| mutable simple_spinlock lock_; |
| |
| // The last time the master sent create tablet RPCs for the tablet. |
| MonoTime last_create_tablet_time_; |
| |
| // Reported schema version (in-memory only). |
| uint32_t reported_schema_version_; |
| |
| DISALLOW_COPY_AND_ASSIGN(TabletInfo); |
| }; |
| |
| // The data related to a table which is persisted on disk. |
| // This portion of TableInfo is managed via CowObject. |
| // It wraps the underlying protobuf to add useful accessors. |
| struct PersistentTableInfo { |
| bool is_deleted() const { |
| return pb.state() == SysTablesEntryPB::REMOVED; |
| } |
| |
| bool is_running() const { |
| return pb.state() == SysTablesEntryPB::RUNNING || |
| pb.state() == SysTablesEntryPB::ALTERING; |
| } |
| |
| // Return the table's name. |
| const std::string& name() const { |
| return pb.name(); |
| } |
| |
| // Helper to set the state of the tablet with a custom message. |
| void set_state(SysTablesEntryPB::State state, const std::string& msg); |
| |
| SysTablesEntryPB pb; |
| }; |
| |
| // The information about a table, including its state and tablets. |
| // |
| // This object uses copy-on-write techniques similarly to TabletInfo. |
| // Please see the TabletInfo class doc above for more information. |
| // |
| // The non-persistent information about the table is protected by an internal |
| // spin-lock. |
| class TableInfo : public RefCountedThreadSafe<TableInfo> { |
| public: |
| typedef PersistentTableInfo cow_state; |
| typedef std::map<std::string, TabletInfo*> TabletInfoMap; |
| |
| explicit TableInfo(std::string table_id); |
| |
| std::string ToString() const; |
| |
| // Return the table's ID. Does not require synchronization. |
| const std::string& id() const { return table_id_; } |
| |
| // Add a tablet to this table. |
| void AddTablet(TabletInfo *tablet); |
| // Add multiple tablets to this table. |
| void AddTablets(const std::vector<TabletInfo*>& tablets); |
| |
| // Atomically add and remove multiple tablets from this table. |
| void AddRemoveTablets(const vector<scoped_refptr<TabletInfo>>& tablets_to_add, |
| const vector<scoped_refptr<TabletInfo>>& tablets_to_drop); |
| |
| // Return true if tablet with 'partition_key_start' has been |
| // removed from 'tablet_map_' below. |
| bool RemoveTablet(const std::string& partition_key_start); |
| |
| // This only returns tablets which are in RUNNING state. |
| void GetTabletsInRange(const GetTableLocationsRequestPB* req, |
| std::vector<scoped_refptr<TabletInfo> > *ret) const; |
| |
| // Adds all tablets to the vector in partition key sorted order. |
| void GetAllTablets(std::vector<scoped_refptr<TabletInfo> > *ret) const; |
| |
| // Access the persistent metadata. Typically you should use |
| // TableMetadataLock to gain access to this data. |
| const CowObject<PersistentTableInfo>& metadata() const { return metadata_; } |
| CowObject<PersistentTableInfo>* mutable_metadata() { return &metadata_; } |
| |
| // Returns true if the table creation is in-progress |
| bool IsCreateInProgress() const; |
| |
| // Returns true if an "Alter" operation is in-progress |
| bool IsAlterInProgress(uint32_t version) const; |
| |
| void AddTask(MonitoredTask *task); |
| void RemoveTask(MonitoredTask *task); |
| void AbortTasks(); |
| void WaitTasksCompletion(); |
| |
| // Allow for showing outstanding tasks in the master UI. |
| void GetTaskList(std::vector<scoped_refptr<MonitoredTask> > *tasks); |
| |
| // Returns a snapshot copy of the table info's tablet map. |
| TabletInfoMap tablet_map() const { |
| std::lock_guard<simple_spinlock> l(lock_); |
| return tablet_map_; |
| } |
| |
| // Returns the number of tablets. |
| int num_tablets() const { |
| std::lock_guard<simple_spinlock> l(lock_); |
| return tablet_map_.size(); |
| } |
| |
| private: |
| friend class RefCountedThreadSafe<TableInfo>; |
| ~TableInfo(); |
| |
| void AddTabletUnlocked(TabletInfo* tablet); |
| |
| const std::string table_id_; |
| |
| // Sorted index of tablet start partition-keys to TabletInfo. |
| // The TabletInfo objects are owned by the CatalogManager. |
| TabletInfoMap tablet_map_; |
| |
| // Protects tablet_map_ and pending_tasks_ |
| mutable simple_spinlock lock_; |
| |
| CowObject<PersistentTableInfo> metadata_; |
| |
| // List of pending tasks (e.g. create/alter tablet requests) |
| std::unordered_set<MonitoredTask*> pending_tasks_; |
| |
| DISALLOW_COPY_AND_ASSIGN(TableInfo); |
| }; |
| |
| // Helper to manage locking on the persistent metadata of TabletInfo or TableInfo. |
| template<class MetadataClass> |
| class MetadataLock : public CowLock<typename MetadataClass::cow_state> { |
| public: |
| typedef CowLock<typename MetadataClass::cow_state> super; |
| MetadataLock(MetadataClass* info, typename super::LockMode mode) |
| : super(DCHECK_NOTNULL(info)->mutable_metadata(), mode) { |
| } |
| MetadataLock(const MetadataClass* info, typename super::LockMode mode) |
| : super(&(DCHECK_NOTNULL(info))->metadata(), mode) { |
| } |
| }; |
| |
| typedef MetadataLock<TabletInfo> TabletMetadataLock; |
| typedef MetadataLock<TableInfo> TableMetadataLock; |
| |
| // The component of the master which tracks the state and location |
| // of tables/tablets in the cluster. |
| // |
| // This is the master-side counterpart of TSTabletManager, which tracks |
| // the state of each tablet on a given tablet-server. |
| // |
| // Thread-safe. |
| class CatalogManager : public tserver::TabletPeerLookupIf { |
| public: |
| |
| // Scoped "shared lock" to serialize master leader elections. |
| // |
| // While in scope, blocks the catalog manager in the event that it becomes |
| // the leader of its Raft configuration and needs to reload its persistent |
| // metadata. Once destroyed, the catalog manager is unblocked. |
| // |
| // Usage: |
| // |
| // void MasterServiceImpl::CreateTable(const CreateTableRequestPB* req, |
| // CreateTableResponsePB* resp, |
| // rpc::RpcContext* rpc) { |
| // CatalogManager::ScopedLeaderSharedLock l(server_->catalog_manager()); |
| // if (!l.CheckIsInitializedAndIsLeaderOrRespond(resp, rpc)) { |
| // return; |
| // } |
| // |
| // Status s = server_->catalog_manager()->CreateTable(req, resp, rpc); |
| // CheckRespErrorOrSetUnknown(s, resp); |
| // rpc->RespondSuccess(); |
| // } |
| // |
| class ScopedLeaderSharedLock { |
| public: |
| // Creates a new shared lock, acquiring the catalog manager's leader_lock_ |
| // for reading in the process. The lock is released when this object is |
| // destroyed. |
| // |
| // 'catalog' must outlive this object. |
| explicit ScopedLeaderSharedLock(CatalogManager* catalog); |
| |
| // General status of the catalog manager. If not OK (e.g. the catalog |
| // manager is still being initialized), all operations are illegal and |
| // leader_status() should not be trusted. |
| const Status& catalog_status() const { return catalog_status_; } |
| |
| // Leadership status of the catalog manager. If not OK, the catalog |
| // manager is not the leader, but some operations may still be legal. |
| const Status& leader_status() const { |
| DCHECK(catalog_status_.ok()); |
| return leader_status_; |
| } |
| |
| // First non-OK status of the catalog manager, adhering to the checking |
| // order specified above. |
| const Status& first_failed_status() const { |
| if (!catalog_status_.ok()) { |
| return catalog_status_; |
| } |
| return leader_status_; |
| } |
| |
| // Check that the catalog manager is initialized. It may or may not be the |
| // leader of its Raft configuration. |
| // |
| // If not initialized, writes the corresponding error to 'resp', |
| // responds to 'rpc', and returns false. |
| template<typename RespClass> |
| bool CheckIsInitializedOrRespond(RespClass* resp, rpc::RpcContext* rpc); |
| |
| // Check that the catalog manager is initialized and that it is the leader |
| // of its Raft configuration. Initialization status takes precedence over |
| // leadership status. |
| // |
| // If not initialized or if not the leader, writes the corresponding error |
| // to 'resp', responds to 'rpc', and returns false. |
| template<typename RespClass> |
| bool CheckIsInitializedAndIsLeaderOrRespond(RespClass* resp, rpc::RpcContext* rpc); |
| |
| private: |
| CatalogManager* catalog_; |
| shared_lock<RWMutex> leader_shared_lock_; |
| Status catalog_status_; |
| Status leader_status_; |
| |
| DISALLOW_COPY_AND_ASSIGN(ScopedLeaderSharedLock); |
| }; |
| |
| // Temporarily forces the catalog manager to be a follower. Only for tests! |
| class ScopedLeaderDisablerForTests { |
| public: |
| |
| explicit ScopedLeaderDisablerForTests(CatalogManager* catalog) |
| : catalog_(catalog), |
| old_leader_ready_term_(catalog->leader_ready_term_) { |
| catalog_->leader_ready_term_ = -1; |
| } |
| |
| ~ScopedLeaderDisablerForTests() { |
| catalog_->leader_ready_term_ = old_leader_ready_term_; |
| } |
| |
| private: |
| CatalogManager* catalog_; |
| int64_t old_leader_ready_term_; |
| |
| DISALLOW_COPY_AND_ASSIGN(ScopedLeaderDisablerForTests); |
| }; |
| |
| explicit CatalogManager(Master *master); |
| virtual ~CatalogManager(); |
| |
| Status Init(bool is_first_run); |
| |
| void Shutdown(); |
| Status CheckOnline() const; |
| |
| // Create a new Table with the specified attributes |
| // |
| // The RPC context is provided for logging/tracing purposes, |
| // but this function does not itself respond to the RPC. |
| Status CreateTable(const CreateTableRequestPB* req, |
| CreateTableResponsePB* resp, |
| rpc::RpcContext* rpc); |
| |
| // Get the information about an in-progress create operation |
| Status IsCreateTableDone(const IsCreateTableDoneRequestPB* req, |
| IsCreateTableDoneResponsePB* resp); |
| |
| // Delete the specified table |
| // |
| // The RPC context is provided for logging/tracing purposes, |
| // but this function does not itself respond to the RPC. |
| Status DeleteTable(const DeleteTableRequestPB* req, |
| DeleteTableResponsePB* resp, |
| rpc::RpcContext* rpc); |
| |
| // Alter the specified table |
| // |
| // The RPC context is provided for logging/tracing purposes, |
| // but this function does not itself respond to the RPC. |
| Status AlterTable(const AlterTableRequestPB* req, |
| AlterTableResponsePB* resp, |
| rpc::RpcContext* rpc); |
| |
| // Get the information about an in-progress alter operation |
| // |
| // The RPC context is provided for logging/tracing purposes, |
| // but this function does not itself respond to the RPC. |
| Status IsAlterTableDone(const IsAlterTableDoneRequestPB* req, |
| IsAlterTableDoneResponsePB* resp, |
| rpc::RpcContext* rpc); |
| |
| // Get the information about the specified table |
| Status GetTableSchema(const GetTableSchemaRequestPB* req, |
| GetTableSchemaResponsePB* resp); |
| |
| // List all the running tables |
| Status ListTables(const ListTablesRequestPB* req, |
| ListTablesResponsePB* resp); |
| |
| // Lookup the tablets contained in the partition range of the request. |
| // Returns an error if any of the tablets are not running. |
| Status GetTableLocations(const GetTableLocationsRequestPB* req, |
| GetTableLocationsResponsePB* resp); |
| |
| // Look up the locations of the given tablet. The locations |
| // vector is overwritten (not appended to). |
| // If the tablet is not found, returns Status::NotFound. |
| // If the tablet is not running, returns Status::ServiceUnavailable. |
| // Otherwise, returns Status::OK and puts the result in 'locs_pb'. |
| // This only returns tablets which are in RUNNING state. |
| Status GetTabletLocations(const std::string& tablet_id, |
| TabletLocationsPB* locs_pb); |
| |
| // Handle a tablet report from the given tablet server. |
| // |
| // The RPC context is provided for logging/tracing purposes, |
| // but this function does not itself respond to the RPC. |
| Status ProcessTabletReport(TSDescriptor* ts_desc, |
| const TabletReportPB& report, |
| TabletReportUpdatesPB *report_update, |
| rpc::RpcContext* rpc); |
| |
| SysCatalogTable* sys_catalog() { return sys_catalog_.get(); } |
| |
| // Dump all of the current state about tables and tablets to the |
| // given output stream. This is verbose, meant for debugging. |
| void DumpState(std::ostream* out) const; |
| |
| // Retrieve a table by ID, or null if no such table exists. May fail if the |
| // catalog manager is not yet running. Caller must hold leader_lock_. |
| // |
| // NOTE: This should only be used by tests or web-ui |
| Status GetTableInfo(const std::string& table_id, scoped_refptr<TableInfo> *table); |
| |
| // Retrieve all known tables, even those that are not running. May fail if |
| // the catalog manager is not yet running. Caller must hold leader_lock_. |
| // |
| // NOTE: This should only be used by tests or web-ui |
| Status GetAllTables(std::vector<scoped_refptr<TableInfo>>* tables); |
| |
| // Check if a table exists by name, setting 'exist' appropriately. May fail |
| // if the catalog manager is not yet running. Caller must hold leader_lock_. |
| // |
| // NOTE: This should only be used by tests |
| Status TableNameExists(const std::string& table_name, bool* exists); |
| |
| // Let the catalog manager know that the the given tablet server successfully |
| // deleted the specified tablet. |
| void NotifyTabletDeleteSuccess(const std::string& permanent_uuid, const std::string& tablet_id); |
| |
| // Used by ConsensusService to retrieve the TabletPeer for a system |
| // table specified by 'tablet_id'. |
| // |
| // See also: TabletPeerLookupIf, ConsensusServiceImpl. |
| virtual Status GetTabletPeer(const std::string& tablet_id, |
| scoped_refptr<tablet::TabletPeer>* tablet_peer) const OVERRIDE; |
| |
| virtual const NodeInstancePB& NodeInstance() const OVERRIDE; |
| |
| bool IsInitialized() const; |
| |
| virtual Status StartRemoteBootstrap( |
| const consensus::StartRemoteBootstrapRequestPB& req, |
| boost::optional<kudu::tserver::TabletServerErrorPB::Code>* error_code) OVERRIDE; |
| |
| // Returns this CatalogManager's role in a consensus configuration. CatalogManager |
| // must be initialized before calling this method. |
| consensus::RaftPeerPB::Role Role() const; |
| |
| private: |
| // These tests call ElectedAsLeaderCb() directly. |
| FRIEND_TEST(MasterTest, TestShutdownDuringTableVisit); |
| FRIEND_TEST(MasterTest, TestGetTableLocationsDuringRepeatedTableVisit); |
| |
| // This test calls VisitTablesAndTablets() directly. |
| FRIEND_TEST(kudu::CreateTableStressTest, TestConcurrentCreateTableAndReloadMetadata); |
| |
| friend class TableLoader; |
| friend class TabletLoader; |
| |
| typedef std::unordered_map<std::string, scoped_refptr<TableInfo>> TableInfoMap; |
| typedef std::unordered_map<std::string, scoped_refptr<TabletInfo>> TabletInfoMap; |
| |
| // Called by SysCatalog::SysCatalogStateChanged when this node |
| // becomes the leader of a consensus configuration. Executes VisitTablesAndTabletsTask |
| // via 'worker_pool_'. |
| Status ElectedAsLeaderCb(); |
| |
| // Loops and sleeps until one of the following conditions occurs: |
| // 1. The current node is the leader master in the current term |
| // and at least one op from the current term is committed. Returns OK. |
| // 2. The current node is not the leader master. |
| // Returns IllegalState. |
| // 3. The provided timeout expires. Returns TimedOut. |
| // |
| // This method is intended to ensure that all operations replicated by |
| // previous masters are committed and visible to the local node before |
| // reading that data, to ensure consistency across failovers. |
| Status WaitUntilCaughtUpAsLeader(const MonoDelta& timeout); |
| |
| // Performs several checks before calling VisitTablesAndTablets to actually |
| // reload table/tablet metadata into memory. |
| void VisitTablesAndTabletsTask(); |
| |
| // Clears out the existing metadata ('table_names_map_', 'table_ids_map_', |
| // and 'tablet_map_'), loads tables metadata into memory and if successful |
| // loads the tablets metadata. |
| Status VisitTablesAndTablets(); |
| |
| // Helper for initializing 'sys_catalog_'. After calling this |
| // method, the caller should call WaitUntilRunning() on sys_catalog_ |
| // WITHOUT holding 'lock_' to wait for consensus to start for |
| // sys_catalog_. |
| // |
| // This method is thread-safe. |
| Status InitSysCatalogAsync(bool is_first_run); |
| |
| // Helper for creating the initial TableInfo state |
| // Leaves the table "write locked" with the new info in the |
| // "dirty" state field. |
| TableInfo* CreateTableInfo(const CreateTableRequestPB& req, |
| const Schema& schema, |
| const PartitionSchema& partition_schema); |
| |
| // Helper for creating the initial TabletInfo state. |
| // Leaves the tablet "write locked" with the new info in the |
| // "dirty" state field. |
| scoped_refptr<TabletInfo> CreateTabletInfo(TableInfo* table, |
| const PartitionPB& partition); |
| |
| // Builds the TabletLocationsPB for a tablet based on the provided TabletInfo. |
| // Populates locs_pb and returns true on success. |
| // Returns Status::ServiceUnavailable if tablet is not running. |
| Status BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet, |
| TabletLocationsPB* locs_pb); |
| |
| Status FindTable(const TableIdentifierPB& table_identifier, |
| scoped_refptr<TableInfo>* table_info); |
| |
| // Handle one of the tablets in a tablet reported. |
| // Requires that the lock is already held. |
| Status HandleReportedTablet(TSDescriptor* ts_desc, |
| const ReportedTabletPB& report, |
| ReportedTabletUpdatesPB *report_updates); |
| |
| Status HandleRaftConfigChanged(const ReportedTabletPB& report, |
| const scoped_refptr<TabletInfo>& tablet, |
| TabletMetadataLock* tablet_lock, |
| TableMetadataLock* table_lock); |
| |
| // Extract the set of tablets that must be processed because not running yet. |
| void ExtractTabletsToProcess(std::vector<scoped_refptr<TabletInfo>>* tablets_to_process); |
| |
| Status ApplyAlterSchemaSteps(const SysTablesEntryPB& current_pb, |
| std::vector<AlterTableRequestPB::Step> steps, |
| Schema* new_schema, |
| ColumnId* next_col_id); |
| |
| Status ApplyAlterPartitioningSteps(const TableMetadataLock& l, |
| TableInfo* table, |
| const Schema& client_schema, |
| std::vector<AlterTableRequestPB::Step> steps, |
| std::vector<scoped_refptr<TabletInfo>>* tablets_to_add, |
| std::vector<scoped_refptr<TabletInfo>>* tablets_to_drop); |
| |
| // Task that takes care of the tablet assignments/creations. |
| // Loops through the "not created" tablets and sends a CreateTablet() request. |
| Status ProcessPendingAssignments(const std::vector<scoped_refptr<TabletInfo> >& tablets); |
| |
| // Given 'two_choices', which should be a vector of exactly two elements, select which |
| // one is the better choice for a new replica. |
| std::shared_ptr<TSDescriptor> PickBetterReplicaLocation(const TSDescriptorVector& two_choices); |
| |
| // Select a tablet server from 'ts_descs' on which to place a new replica. |
| // Any tablet servers in 'excluded' are not considered. |
| // REQUIRES: 'ts_descs' must include at least one non-excluded server. |
| std::shared_ptr<TSDescriptor> SelectReplica( |
| const TSDescriptorVector& ts_descs, |
| const std::set<std::shared_ptr<TSDescriptor>>& excluded); |
| |
| // Select N Replicas from online tablet servers (as specified by |
| // 'ts_descs') for the specified tablet and populate the consensus configuration |
| // object. If 'ts_descs' does not specify enough online tablet |
| // servers to select the N replicas, return Status::InvalidArgument. |
| // |
| // This method is called by "ProcessPendingAssignments()". |
| Status SelectReplicasForTablet(const TSDescriptorVector& ts_descs, TabletInfo* tablet); |
| |
| // Select N Replicas from the online tablet servers |
| // and populate the consensus configuration object. |
| // |
| // This method is called by "SelectReplicasForTablet". |
| void SelectReplicas(const TSDescriptorVector& ts_descs, |
| int nreplicas, |
| consensus::RaftConfigPB *config); |
| |
| void HandleAssignPreparingTablet(TabletInfo* tablet, |
| DeferredAssignmentActions* deferred); |
| |
| // Assign tablets and send CreateTablet RPCs to tablet servers. |
| // The out param 'new_tablets' should have any newly-created TabletInfo |
| // objects appended to it. |
| void HandleAssignCreatingTablet(TabletInfo* tablet, |
| DeferredAssignmentActions* deferred, |
| std::vector<scoped_refptr<TabletInfo> >* new_tablets); |
| |
| Status HandleTabletSchemaVersionReport(TabletInfo *tablet, |
| uint32_t version); |
| |
| // Send the "create tablet request" to all peers of a particular tablet. |
| //. |
| // The creation is async, and at the moment there is no error checking on the |
| // caller side. We rely on the assignment timeout. If we don't see the tablet |
| // after the timeout, we regenerate a new one and proceed with a new |
| // assignment/creation. |
| // |
| // This method is part of the "ProcessPendingAssignments()" |
| // |
| // This must be called after persisting the tablet state as |
| // CREATING to ensure coherent state after Master failover. |
| // |
| // The tablet lock must be acquired for reading before making this call. |
| void SendCreateTabletRequest(const scoped_refptr<TabletInfo>& tablet, |
| const TabletMetadataLock& tablet_lock); |
| |
| // Send the "alter table request" to all tablets of the specified table. |
| void SendAlterTableRequest(const scoped_refptr<TableInfo>& table); |
| |
| // Start the background task to send the AlterTable() RPC to the leader for this |
| // tablet. |
| void SendAlterTabletRequest(const scoped_refptr<TabletInfo>& tablet); |
| |
| // Send the "delete tablet request" to all replicas of all tablets of the |
| // specified table. |
| void SendDeleteTableRequest(const scoped_refptr<TableInfo>& table, |
| const std::string& deletion_msg); |
| |
| // Send the "delete tablet request" to all replicas of the specified tablet. |
| // |
| // The tablet lock must be acquired for reading before making this call. |
| void SendDeleteTabletRequest(const scoped_refptr<TabletInfo>& tablet, |
| const TabletMetadataLock& tablet_lock, |
| const std::string& deletion_msg); |
| |
| // Send the "delete tablet request" to a particular replica (i.e. TS and |
| // tablet combination). The specified 'reason' will be logged on the TS. |
| void SendDeleteReplicaRequest(const std::string& tablet_id, |
| tablet::TabletDataState delete_type, |
| const boost::optional<int64_t>& cas_config_opid_index_less_or_equal, |
| const scoped_refptr<TableInfo>& table, |
| const std::string& ts_uuid, |
| const std::string& reason); |
| |
| // Start a task to change the config to add an additional voter because the |
| // specified tablet is under-replicated. |
| void SendAddServerRequest(const scoped_refptr<TabletInfo>& tablet, |
| const consensus::ConsensusStatePB& cstate); |
| |
| std::string GenerateId() { return oid_generator_.Next(); } |
| |
| // Conventional "T xxx P yyy: " prefix for logging. |
| std::string LogPrefix() const; |
| |
| // Aborts all tasks belonging to 'tables' and waits for them to finish. |
| void AbortAndWaitForAllTasks(const std::vector<scoped_refptr<TableInfo>>& tables); |
| |
| // TODO: the maps are a little wasteful of RAM, since the TableInfo/TabletInfo |
| // objects have a copy of the string key. But STL doesn't make it |
| // easy to make a "gettable set". |
| |
| // Lock protecting the various maps and sets below. |
| typedef rw_spinlock LockType; |
| mutable LockType lock_; |
| |
| // Table maps: table-id -> TableInfo and table-name -> TableInfo |
| TableInfoMap table_ids_map_; |
| TableInfoMap table_names_map_; |
| |
| // Tablet maps: tablet-id -> TabletInfo |
| TabletInfoMap tablet_map_; |
| |
| // Names of tables that are currently reserved by CreateTable() or |
| // AlterTable(). |
| // |
| // As a rule, operations that add new table names should do so as follows: |
| // 1. Acquire lock_. |
| // 2. Ensure table_names_map_ does not contain the new name. |
| // 3. Ensure reserved_table_names_ does not contain the new name. |
| // 4. Add the new name to reserved_table_names_. |
| // 5. Release lock_. |
| // 6. Perform the operation. |
| // 7. If it succeeded, add the name to table_names_map_ with lock_ held. |
| // 8. Remove the new name from reserved_table_names_ with lock_ held. |
| std::unordered_set<std::string> reserved_table_names_; |
| |
| Master *master_; |
| ObjectIdGenerator oid_generator_; |
| |
| // Random number generator used for selecting replica locations. |
| ThreadSafeRandom rng_; |
| |
| gscoped_ptr<SysCatalogTable> sys_catalog_; |
| |
| // Background thread, used to execute the catalog manager tasks |
| // like the assignment and cleaner |
| friend class CatalogManagerBgTasks; |
| gscoped_ptr<CatalogManagerBgTasks> background_tasks_; |
| |
| enum State { |
| kConstructed, |
| kStarting, |
| kRunning, |
| kClosing |
| }; |
| |
| // Lock protecting state_, leader_ready_term_ |
| mutable simple_spinlock state_lock_; |
| State state_; |
| |
| // Singleton pool that serializes invocations of ElectedAsLeaderCb(). |
| gscoped_ptr<ThreadPool> leader_election_pool_; |
| |
| // This field is updated when a node becomes leader master, |
| // waits for all outstanding uncommitted metadata (table and tablet metadata) |
| // in the sys catalog to commit, and then reads that metadata into in-memory |
| // data structures. This is used to "fence" client and tablet server requests |
| // that depend on the in-memory state until this master can respond |
| // correctly. |
| int64_t leader_ready_term_; |
| |
| // Lock used to fence operations and leader elections. All logical operations |
| // (i.e. create table, alter table, etc.) should acquire this lock for |
| // reading. Following an election where this master is elected leader, it |
| // should acquire this lock for writing before reloading the metadata. |
| // |
| // Readers should not acquire this lock directly; use ScopedLeadershipLock |
| // instead. |
| // |
| // Always acquire this lock before state_lock_. |
| RWMutex leader_lock_; |
| |
| // Async operations are accessing some private methods |
| // (TODO: this stuff should be deferred and done in the background thread) |
| friend class AsyncAlterTable; |
| |
| DISALLOW_COPY_AND_ASSIGN(CatalogManager); |
| }; |
| |
| } // namespace master |
| } // namespace kudu |
| #endif /* KUDU_MASTER_CATALOG_MANAGER_H */ |