blob: a8adcc75304e546c98707ee01d9e28fed2ba3b8e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <list>
#include <string>
#include <unordered_map>
#include <vector>

#include "gen-cpp/StatestoreService_types.h"
#include "util/network-util.h"
namespace impala {
/// Forward declaration of the backend descriptor type that identifies an executor in the
/// ExecutorBlacklist interface declared in this header.
/// NOTE(review): ExecutorBlacklist::Entry stores a TBackendDescriptor by value, which
/// requires the complete type rather than just this forward declaration; presumably one
/// of the headers included above provides the full definition - confirm.
class TBackendDescriptor;
/// Class to maintain a local blacklist of executors.
///
/// Executors can be added to the blacklist with Blacklist() and removed with
/// FindAndRemove(). Users must periodically call Maintenance(), which updates the
/// blacklist according to some timeouts.
///
/// There is a timeout for executors on the blacklist, defined as the amount of time the
/// statestore takes to consider an executor down after it stops responding to heartbeats,
/// plus some padding. When a blacklisted executor has passed this timeout and
/// Maintenance() is called, the executor is put on 'probation'. This allows us to track
/// executors that have been repeatedly blacklisted in a short amount of time. Executors
/// are only removed from the blacklist and put on probation during Maintenance().
///
/// There is a timeout for probation, which is much longer than the blacklist timeout.
/// When an executor has been on probation for longer than this timeout and Maintenance()
/// is called, the executor will be taken off probation and completely removed from
/// 'executor_list_'. If an executor that was on probation is re-blacklisted, its timeout
/// for getting back off the blacklist or probation is multiplied by the number of times
/// it has been re-blacklisted since the last time it was neither blacklisted nor on
/// probation.
///
/// FindAndRemove() removes executors whether they are blacklisted or on probation. If the
/// executor was previously blacklisted, it is not put on probation. This can be used when
/// the cluster membership is updated by the statestore to fully remove an executor that
/// is no longer part of the cluster membership.
///
/// This class is not thread-safe.
class ExecutorBlacklist {
 public:
  /// Represents the blacklisting states an executor can be in.
  enum State { NOT_BLACKLISTED, BLACKLISTED, ON_PROBATION };

  /// Returns true if blacklisting is enabled.
  static bool BlacklistingEnabled();

  /// Adds an executor to the blacklist, if it is not already blacklisted. If the executor
  /// was on probation, updates its entry in 'executor_list_' accordingly. Should only be
  /// called if BlacklistingEnabled() is true.
  void Blacklist(const TBackendDescriptor& be_desc);

  /// Removes an executor from the blacklist or probation, if it is in 'executor_list_'.
  /// Does not put blacklisted executors on probation. Returns the executor's state prior
  /// to this call. Will return 'BLACKLISTED' for an executor that has passed the
  /// blacklist timeout if Maintenance() wasn't called since the timeout.
  State FindAndRemove(const TBackendDescriptor& be_desc);

  /// Returns true if there are executors that have passed the blacklist timeout and can
  /// be removed from the blacklist. Note that this does not consider executors that can
  /// be removed from probation, even though Maintenance() also handles those executors.
  /// This is to avoid unnecessary updates to the cluster membership, since removing
  /// executors from probation doesn't actually affect the current membership.
  bool NeedsMaintenance() const;

  /// Performs maintenance on the blacklist - if an executor has been blacklisted for
  /// longer than the blacklist timeout it will be removed from the blacklist, put on
  /// probation, and its descriptor will be returned in 'probation_list'. If an executor
  /// has been on probation for longer than the probation timeout, it will be taken off
  /// probation.
  void Maintenance(std::list<TBackendDescriptor>* probation_list);

  /// Returns true if 'be_desc' is blacklisted.
  bool IsBlacklisted(const TBackendDescriptor& be_desc) const;

  /// Returns a space-separated string of the addresses of executors that are currently
  /// blacklisted.
  std::string BlacklistToString() const;

  /// Returns a string representation of the blacklist for use in debugging.
  std::string DebugString() const;

 private:
  /// Info about an executor that is either blacklisted or on probation.
  struct Entry {
    /// Creates an entry for 'be_desc' that starts out in the BLACKLISTED state with a
    /// consecutive-blacklisting count of one. 'blacklist_time_ms' is recorded as the
    /// time at which the executor was blacklisted.
    Entry(const TBackendDescriptor& be_desc, int64_t blacklist_time_ms)
      : be_desc(be_desc),
        blacklist_time_ms(blacklist_time_ms),
        state(State::BLACKLISTED),
        num_consecutive_blacklistings(1) {}

    /// Descriptor of the executor this entry tracks.
    TBackendDescriptor be_desc;

    /// The UnixMillis() of the last time this executor was blacklisted.
    int64_t blacklist_time_ms;

    /// Whether the executor is blacklisted or on probation.
    State state;

    /// Number of times that this executor has been blacklisted since the last time it was
    /// off probation.
    int32_t num_consecutive_blacklistings;
  };

  /// Returns the base blacklist timeout in ms. This should be multiplied by
  /// 'num_consecutive_blacklistings' for a particular executor when checking if it has
  /// passed the timeout.
  int64_t GetBlacklistTimeoutMs() const;

  /// Predicate that returns true if the port number in 'be_desc' matches that in
  /// 'existing.be_desc'. Assumes that they have the same 'ip_address'. Used to do find()
  /// on the values of 'executor_list_'.
  static bool eqBePort(const TBackendDescriptor& be_desc, const Entry& existing);

  /// Map from node ip address to a list of executors for that node that have been
  /// blacklisted. Note that in normal operation there will be a single impalad per node
  /// and all executor lists will be length one. Contains both executors that are
  /// blacklisted and ones that are on probation.
  std::unordered_map<IpAddr, std::vector<Entry>> executor_list_;

  /// The amount to multiply the blacklist timeout by for the probation timeout.
  static const int32_t PROBATION_TIMEOUT_MULTIPLIER;

  /// Multiplier for extra padding to add to the blacklist timeout.
  static const float BLACKLIST_TIMEOUT_PADDING;
};
} // namespace impala