blob: 7115503ba5d8cce20665d7051002b480f197c5d8 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*/
package org.apache.storm.scheduler.blacklist.strategies;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.storm.DaemonConfig;
import org.apache.storm.scheduler.Cluster;
import org.apache.storm.scheduler.SupervisorDetails;
import org.apache.storm.scheduler.Topologies;
import org.apache.storm.scheduler.TopologyDetails;
import org.apache.storm.scheduler.blacklist.reporters.IReporter;
import org.apache.storm.scheduler.blacklist.reporters.LogReporter;
import org.apache.storm.utils.ObjectReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The default strategy used for blacklisting hosts.
*
* <p>A supervisor is blacklisted once it appears in at least {@code toleranceCount} of the failure records handed to
* {@code getBlacklist}. Every blacklisted supervisor carries a countdown that {@code resumeFromBlacklist} decrements;
* {@code getBlacklist} may additionally release supervisors early when the cluster is short of slots.
*/
public class DefaultBlacklistStrategy implements IBlacklistStrategy {
// Passed to ObjectReader.getInt as the default for DaemonConfig.BLACKLIST_SCHEDULER_RESUME_TIME.
// NOTE(review): presumably seconds, since it is divided by nimbusMonitorFreqSecs - confirm.
public static final int DEFAULT_BLACKLIST_SCHEDULER_RESUME_TIME = 1800;
// Passed to ObjectReader.getInt as the default for DaemonConfig.BLACKLIST_SCHEDULER_TOLERANCE_COUNT.
public static final int DEFAULT_BLACKLIST_SCHEDULER_TOLERANCE_COUNT = 3;
private static final Logger LOG = LoggerFactory.getLogger(DefaultBlacklistStrategy.class);
// Notified through reportBlacklist(...) whenever a supervisor is newly added to the blacklist.
private IReporter reporter;
// Minimum number of failure records a supervisor must appear in before it is blacklisted.
private int toleranceCount;
// Divided by nimbusMonitorFreqSecs to obtain the initial countdown stored for a newly blacklisted supervisor.
private int resumeTime;
// Read from DaemonConfig.NIMBUS_MONITOR_FREQ_SECS in prepare(); used as the divisor for resumeTime.
private int nimbusMonitorFreqSecs;
// Supervisor id -> remaining countdown; resumeFromBlacklist() decrements the value on every call.
// A TreeMap, so iteration is ordered by supervisor id.
private TreeMap<String, Integer> blacklist;
/**
* Reads the blacklist settings from the given configuration, creates the reporter and starts with an empty blacklist.
*
* @param conf the configuration map holding the {@link DaemonConfig} blacklist scheduler settings
*/
@Override
public void prepare(Map<String, Object> conf) {
    final Object toleranceSetting = conf.get(DaemonConfig.BLACKLIST_SCHEDULER_TOLERANCE_COUNT);
    this.toleranceCount = ObjectReader.getInt(toleranceSetting, DEFAULT_BLACKLIST_SCHEDULER_TOLERANCE_COUNT);

    final Object resumeSetting = conf.get(DaemonConfig.BLACKLIST_SCHEDULER_RESUME_TIME);
    this.resumeTime = ObjectReader.getInt(resumeSetting, DEFAULT_BLACKLIST_SCHEDULER_RESUME_TIME);

    // The reporter implementation is configurable; LogReporter is used when nothing is configured.
    final Object reporterSetting = conf.get(DaemonConfig.BLACKLIST_SCHEDULER_REPORTER);
    final String reporterClassName = ObjectReader.getString(reporterSetting, LogReporter.class.getName());
    this.reporter = (IReporter) initializeInstance(reporterClassName, "blacklist reporter");

    // Unlike the settings above, no default is supplied for the monitor frequency.
    final Object monitorFreqSetting = conf.get(DaemonConfig.NIMBUS_MONITOR_FREQ_SECS);
    this.nimbusMonitorFreqSecs = ObjectReader.getInt(monitorFreqSetting);

    this.blacklist = new TreeMap<>();
}
/**
* Updates the blacklist from the given failure records and returns the supervisors that are currently blacklisted.
*
* <p>A supervisor that appears as a key in at least {@code toleranceCount} of the records, and is not blacklisted yet,
* is reported and added with a countdown of {@code resumeTime / nimbusMonitorFreqSecs} rounds (at least one).
* Afterwards {@link #releaseBlacklistWhenNeeded} chooses supervisors to release; those are handed to the cluster as
* grey-listed supervisors and removed from the blacklist.
*
* @param supervisorsWithFailures failure records; only the keys (supervisor ids) of each map are read here
* @param cluster the cluster used to decide on releases and to receive the grey-listed supervisors
* @param topologies not used by this implementation
* @return the key set of the internal blacklist map; this is a live view, not a copy
*/
@Override
public Set<String> getBlacklist(List<Map<String, Set<Integer>>> supervisorsWithFailures, Cluster cluster, Topologies topologies) {
    // Count in how many failure records each supervisor appears.
    Map<String, Integer> countMap = new HashMap<>();
    for (Map<String, Set<Integer>> item : supervisorsWithFailures) {
        for (String supervisor : item.keySet()) {
            countMap.merge(supervisor, 1, Integer::sum);
        }
    }

    for (Map.Entry<String, Integer> entry : countMap.entrySet()) {
        String supervisor = entry.getKey();
        int count = entry.getValue();
        // Only supervisors that are not blacklisted yet get a fresh countdown.
        if (count >= toleranceCount && !blacklist.containsKey(supervisor)) {
            LOG.debug("Added supervisor {} to blacklist", supervisor);
            LOG.debug("supervisorsWithFailures : {}", supervisorsWithFailures);
            reporter.reportBlacklist(supervisor, supervisorsWithFailures);
            // Integer division truncates to 0 when resumeTime < nimbusMonitorFreqSecs and would throw for a
            // frequency of 0. A countdown below 1 can never be reached by decrementing, so keep at least one round.
            int resumeRounds = nimbusMonitorFreqSecs > 0 ? resumeTime / nimbusMonitorFreqSecs : 0;
            blacklist.put(supervisor, Math.max(1, resumeRounds));
        }
    }

    Set<String> toRelease = releaseBlacklistWhenNeeded(cluster, new ArrayList<>(blacklist.keySet()));
    // After having computed the final blacklist,
    // the nodes which are released due to resource shortage will be put to the "greylist".
    // The null check stays because releaseBlacklistWhenNeeded is protected and may be overridden.
    if (toRelease != null) {
        LOG.debug("Releasing {} nodes because of low resources", toRelease.size());
        cluster.setGreyListedSupervisors(toRelease);
        for (String key : toRelease) {
            blacklist.remove(key);
        }
    }
    return blacklist.keySet();
}
/**
* Advances the countdown of every blacklisted supervisor by one round and removes those whose countdown has run out.
*
* <p>A supervisor is removed as soon as its decremented countdown is zero or below. Testing for exactly zero would
* skip entries stored with a value of 0 or less (for example when the resume time is shorter than the monitor
* frequency, so the integer division yields 0) and leave them blacklisted indefinitely.
*/
@Override
public void resumeFromBlacklist() {
    // Collect first and remove afterwards: removing from the map while iterating its entry set is not allowed.
    List<String> readyToRemove = new ArrayList<>();
    for (Map.Entry<String, Integer> entry : blacklist.entrySet()) {
        int countUntilResume = entry.getValue() - 1;
        if (countUntilResume <= 0) {
            readyToRemove.add(entry.getKey());
        } else {
            // Update in place through the entry instead of calling put() on the map being iterated.
            entry.setValue(countUntilResume);
        }
    }
    for (String key : readyToRemove) {
        blacklist.remove(key);
        LOG.info("Supervisor {} has been blacklisted more than resume period. Removed from blacklist.", key);
    }
}
/**
* Decide when/if to release blacklisted hosts.
*
* <p>Compares the slots still needed by topologies awaiting scheduling with the slots available outside the
* blacklist. If there is a shortage, blacklisted supervisors are released host by host, in the iteration order of
* the map returned by {@link #createHostToSupervisorMap}, until the shortage is covered. All supervisors of a host
* are released together; supervisors unknown to the cluster are skipped.
*
* @param cluster the current state of the cluster.
* @param blacklistedNodeIds the ids of the currently blacklisted supervisors.
* @return the set of supervisor ids to be released; empty if nothing has to be released.
*/
protected Set<String> releaseBlacklistWhenNeeded(Cluster cluster, final List<String> blacklistedNodeIds) {
    final Set<String> released = new HashSet<>();
    if (blacklistedNodeIds.isEmpty()) {
        return released;
    }

    final int freeSlots = cluster.getNonBlacklistedAvailableSlots(blacklistedNodeIds).size();
    int wantedSlots = 0;
    for (TopologyDetails topology : cluster.needsSchedulingTopologies()) {
        // Requested workers minus the workers that already have a slot.
        wantedSlots += topology.getNumWorkers() - cluster.getAssignedNumWorkers(topology);
    }

    final Map<String, SupervisorDetails> knownSupervisors = cluster.getSupervisors();
    int missingSlots = wantedSlots - freeSlots;
    LOG.debug("Need {} slots.", wantedSlots);
    LOG.debug("Available {} slots.", freeSlots);
    LOG.debug("Shortage {} slots.", missingSlots);
    if (missingSlots <= 0) {
        return released;
    }

    LOG.info("Need {} slots more. Releasing some blacklisted nodes to cover it.", missingSlots);
    final Map<String, Set<String>> idsByHost = createHostToSupervisorMap(blacklistedNodeIds, cluster);
    for (Set<String> idsOnHost : idsByHost.values()) {
        // Release every supervisor of this host before checking whether the shortage is covered.
        for (String id : idsOnHost) {
            final SupervisorDetails details = knownSupervisors.get(id);
            if (details == null) {
                continue;
            }
            final int freedSlots = cluster.getAvailablePorts(details).size();
            released.add(id);
            missingSlots -= freedSlots;
            LOG.debug("Releasing {} with {} slots leaving {} slots to go", id,
                freedSlots, missingSlots);
        }
        if (missingSlots <= 0) {
            // Enough slots have been freed.
            break;
        }
    }
    return released;
}
/**
* Creates an instance of the named class through its no-argument constructor.
*
* <p>Uses {@code getDeclaredConstructor().newInstance()} instead of the deprecated {@code Class.newInstance()}, which
* rethrows any exception raised by the constructor, including checked ones, without wrapping it. With this form a
* failing constructor surfaces as an {@code InvocationTargetException} and is handled like the other reflection errors.
*
* @param className the fully qualified name of the class to instantiate
* @param representation a human readable description of what is being created, used in error logs only
* @return the new instance
* @throws RuntimeException wrapping the reflection error if the class cannot be found or instantiated
*/
private Object initializeInstance(String className, String representation) {
    try {
        return Class.forName(className).getDeclaredConstructor().newInstance();
    } catch (ClassNotFoundException e) {
        LOG.error("Can't find {} for name {}", representation, className);
        throw new RuntimeException(e);
    } catch (InstantiationException e) {
        LOG.error("Throw InstantiationException {} for name {}", representation, className);
        throw new RuntimeException(e);
    } catch (IllegalAccessException e) {
        LOG.error("Throw IllegalAccessException {} for name {}", representation, className);
        throw new RuntimeException(e);
    } catch (ReflectiveOperationException e) {
        // Remaining cases: no no-argument constructor (NoSuchMethodException) or the constructor itself threw
        // (InvocationTargetException).
        LOG.error("Can't instantiate {} for name {}", representation, className);
        throw new RuntimeException(e);
    }
}
/**
* Groups the given supervisor ids by the host the cluster reports for them.
*
* <p>Supervisors for which the cluster returns no host are left out. The result is a {@link TreeMap}, so hosts
* iterate in their natural string order.
*
* @param blacklistedNodeIds the supervisor ids to group
* @param cluster the cluster used to look up the host of each supervisor
* @return a map from host name to the ids of the given supervisors on that host
*/
protected Map<String, Set<String>> createHostToSupervisorMap(final List<String> blacklistedNodeIds, Cluster cluster) {
    final Map<String, Set<String>> idsByHost = new TreeMap<>();
    for (String id : blacklistedNodeIds) {
        final String host = cluster.getHost(id);
        if (host == null) {
            continue;
        }
        idsByHost.computeIfAbsent(host, unused -> new HashSet<>()).add(id);
    }
    return idsByHost;
}
}