blob: 2d4d2ba7d6469093624354b7a3f919f8bcfb1a11 [file] [log] [blame]
package org.apache.helix.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.HelixDefinedState;
import org.apache.helix.HelixException;
import org.apache.helix.PropertyKey;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.CurrentState;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.LiveInstance;
import org.apache.helix.model.StateModelDefinition;
import org.apache.helix.task.TaskConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableSet;
/**
* Utility class for validating Helix properties
* Warning: each method validates one single property of instance individually and independently.
* One validation wouldn't depend on the results of other validations
*/
public class InstanceValidationUtil {
private static final Logger _logger = LoggerFactory.getLogger(InstanceValidationUtil.class);
public static Set<String> UNHEALTHY_STATES =
ImmutableSet.of(HelixDefinedState.DROPPED.name(), HelixDefinedState.ERROR.name());
private InstanceValidationUtil() {
}
/**
* Method to check if the instance is enabled by configuration
* @param dataAccessor
* @param instanceName
* @return
*/
public static boolean isEnabled(HelixDataAccessor dataAccessor, String instanceName) {
PropertyKey.Builder propertyKeyBuilder = dataAccessor.keyBuilder();
InstanceConfig instanceConfig = dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instanceName));
ClusterConfig clusterConfig = dataAccessor.getProperty(propertyKeyBuilder.clusterConfig());
// TODO deprecate instance level config checks once migrated the enable status to cluster config only
if (instanceConfig == null || clusterConfig == null) {
throw new HelixException("InstanceConfig or ClusterConfig is NULL");
}
boolean enabledInInstanceConfig = instanceConfig.getInstanceEnabled();
Map<String, String> disabledInstances = clusterConfig.getDisabledInstances();
boolean enabledInClusterConfig =
disabledInstances == null || !disabledInstances.keySet().contains(instanceName);
return enabledInClusterConfig && enabledInInstanceConfig;
}
/**
* Method to check if the instance is up and running by configuration
* @param dataAccessor
* @param instanceName
* @return
*/
public static boolean isAlive(HelixDataAccessor dataAccessor, String instanceName) {
LiveInstance liveInstance = dataAccessor.getProperty(dataAccessor.keyBuilder().liveInstance(instanceName));
return liveInstance != null;
}
/**
* Method to check if the instance is assigned at least 1 resource, not in a idle state;
* Independent of the instance alive/enabled status
* @param dataAccessor
* @param clusterId
* @param instanceName
* @return
*/
public static boolean hasResourceAssigned(HelixDataAccessor dataAccessor, String clusterId,
String instanceName) {
PropertyKey.Builder propertyKeyBuilder = dataAccessor.keyBuilder();
LiveInstance liveInstance = dataAccessor.getProperty(propertyKeyBuilder.liveInstance(instanceName));
if (liveInstance != null) {
String sessionId = liveInstance.getSessionId();
List<String> resourceNames = dataAccessor.getChildNames(propertyKeyBuilder.currentStates(instanceName, sessionId));
for (String resourceName : resourceNames) {
PropertyKey currentStateKey = propertyKeyBuilder.currentState(instanceName, sessionId, resourceName);
CurrentState currentState = dataAccessor.getProperty(currentStateKey);
if (currentState != null && currentState.getPartitionStateMap().size() > 0) {
return true;
}
}
}
_logger.warn(String.format("The instance %s is not active", instanceName));
return false;
}
/**
* Method to check if the instance has any disabled partition assigned
* @param dataAccessor
* @param clusterId
* @param instanceName
* @return
*/
public static boolean hasDisabledPartitions(HelixDataAccessor dataAccessor, String clusterId,
String instanceName) {
PropertyKey propertyKey = dataAccessor.keyBuilder().instanceConfig(instanceName);
InstanceConfig instanceConfig = dataAccessor.getProperty(propertyKey);
if (instanceConfig != null) {
return !instanceConfig.getDisabledPartitionsMap().isEmpty();
}
throw new HelixException("Fail to get instance config for " + instanceName);
}
/**
* Method to check if the instance has valid configuration
* @param dataAccessor
* @param clusterId
* @param instanceName
* @return
*/
public static boolean hasValidConfig(HelixDataAccessor dataAccessor, String clusterId,
String instanceName) {
PropertyKey propertyKey = dataAccessor.keyBuilder().instanceConfig(instanceName);
InstanceConfig instanceConfig = dataAccessor.getProperty(propertyKey);
return instanceConfig != null && instanceConfig.isValid();
}
/**
* Method to check if the instance has error partitions
* @param dataAccessor
* @param clusterId
* @param instanceName
* @return
*/
public static boolean hasErrorPartitions(HelixDataAccessor dataAccessor, String clusterId,
String instanceName) {
PropertyKey.Builder propertyKeyBuilder = new PropertyKey.Builder(clusterId);
PropertyKey liveInstanceKey = propertyKeyBuilder.liveInstance(instanceName);
LiveInstance liveInstance = dataAccessor.getProperty(liveInstanceKey);
if (liveInstance != null) {
String sessionId = liveInstance.getSessionId();
PropertyKey currentStatesKey = propertyKeyBuilder.currentStates(instanceName, sessionId);
List<String> resourceNames = dataAccessor.getChildNames(currentStatesKey);
for (String resourceName : resourceNames) {
PropertyKey key = propertyKeyBuilder.currentState(instanceName, sessionId, resourceName);
CurrentState currentState = dataAccessor.getProperty(key);
if (currentState != null
&& currentState.getPartitionStateMap().containsValue(HelixDefinedState.ERROR.name())) {
return true;
}
}
}
_logger.warn(String.format("The instance %s is not active", instanceName));
return false;
}
/**
* Get the problematic partitions on the to-be-stop instance
* Requirement:
* If the instance gets stopped and the partitions on the instance are OFFLINE,
* the cluster still have enough "healthy" replicas on other sibling instances
*
* - sibling instances mean those who share the same partition (replicas) of the to-be-stop instance
*
* @param globalPartitionHealthStatus (instance => (partition name, health status))
* @param instanceToBeStop The instance to be stopped
* @param dataAccessor The data accessor
* @return A list of problematic partitions if the instance is stopped
*/
public static List<String> perPartitionHealthCheck(List<ExternalView> externalViews,
Map<String, Map<String, Boolean>> globalPartitionHealthStatus, String instanceToBeStop,
HelixDataAccessor dataAccessor) {
List<String> unhealthyPartitions = new ArrayList<>();
for (ExternalView externalView : externalViews) {
StateModelDefinition stateModelDefinition = dataAccessor
.getProperty(dataAccessor.keyBuilder().stateModelDef(externalView.getStateModelDefRef()));
for (String partition : externalView.getPartitionSet()) {
Map<String, String> stateMap = externalView.getStateMap(partition);
// Only check if instance holds top state
if (stateMap.containsKey(instanceToBeStop)
&& stateMap.get(instanceToBeStop).equals(stateModelDefinition.getTopState())) {
for (String siblingInstance : stateMap.keySet()) {
// Skip this self check
if (siblingInstance.equals(instanceToBeStop)) {
continue;
}
// We are checking sibling partition healthy status. So if partition health does not
// exist or it is not healthy. We should mark this partition is unhealthy.
if (!globalPartitionHealthStatus.containsKey(siblingInstance)
|| !globalPartitionHealthStatus.get(siblingInstance).containsKey(partition)
|| !globalPartitionHealthStatus.get(siblingInstance).get(partition)) {
unhealthyPartitions.add(partition);
break;
}
}
}
}
}
return unhealthyPartitions;
}
/**
* Check instance is already in the stable state. Here stable means all the ideal state mapping
* matches external view (view of current state).
* It requires PERSIST_INTERMEDIATE_ASSIGNMENT turned on!
* @param dataAccessor
* @param instanceName
* @return
*/
public static boolean isInstanceStable(HelixDataAccessor dataAccessor, String instanceName) {
PropertyKey.Builder keyBuilder = dataAccessor.keyBuilder();
ClusterConfig clusterConfig = dataAccessor.getProperty(keyBuilder.clusterConfig());
if (!clusterConfig.isPersistIntermediateAssignment()) {
throw new HelixException("isInstanceStable needs persist assignment on!");
}
List<String> idealStateNames = dataAccessor.getChildNames(keyBuilder.idealStates());
for (String idealStateName : idealStateNames) {
IdealState idealState = dataAccessor.getProperty(keyBuilder.idealStates(idealStateName));
if (idealState == null || !idealState.isEnabled() || !idealState.isValid()
|| TaskConstants.STATE_MODEL_NAME.equals(idealState.getStateModelDefRef())) {
continue;
}
ExternalView externalView = dataAccessor.getProperty(keyBuilder.externalView(idealStateName));
if (externalView == null) {
throw new HelixException(
String.format("Resource %s does not have external view!", idealStateName));
}
for (String partition : idealState.getPartitionSet()) {
Map<String, String> isPartitionMap = idealState.getInstanceStateMap(partition);
Map<String, String> evPartitionMap = externalView.getStateMap(partition);
if (isPartitionMap.containsKey(instanceName) && (!evPartitionMap.containsKey(instanceName)
|| !evPartitionMap.get(instanceName).equals(isPartitionMap.get(instanceName)))) {
// only checks the state from IS matches EV. Return false when
// 1. This partition not has current state on this instance
// 2. The state does not match the state on ideal state
return false;
}
}
}
return true;
}
/**
* Check if sibling nodes of the instance meet min active replicas constraint
* Two instances are sibling of each other if they host the same partition
* WARNING: The check uses ExternalView to reduce network traffic but suffer from accuracy
* due to external view propagation latency
*
* TODO: Use in memory cache and query instance's currentStates
*
* @param dataAccessor
* @param instanceName
* @return
*/
public static boolean siblingNodesActiveReplicaCheck(HelixDataAccessor dataAccessor, String instanceName) {
PropertyKey.Builder propertyKeyBuilder = dataAccessor.keyBuilder();
List<String> resources = dataAccessor.getChildNames(propertyKeyBuilder.idealStates());
for (String resourceName : resources) {
IdealState idealState = dataAccessor.getProperty(propertyKeyBuilder.idealStates(resourceName));
if (idealState == null || !idealState.isEnabled() || !idealState.isValid()
|| TaskConstants.STATE_MODEL_NAME.equals(idealState.getStateModelDefRef())) {
continue;
}
ExternalView externalView =
dataAccessor.getProperty(propertyKeyBuilder.externalView(resourceName));
if (externalView == null) {
throw new HelixException(
String.format("Resource %s does not have external view!", resourceName));
}
// Get the minActiveReplicas constraint for the resource
int minActiveReplicas = externalView.getMinActiveReplicas();
if (minActiveReplicas == -1) {
throw new HelixException(
"ExternalView " + resourceName + " is missing minActiveReplica field");
}
String stateModeDef = externalView.getStateModelDefRef();
StateModelDefinition stateModelDefinition =
dataAccessor.getProperty(propertyKeyBuilder.stateModelDef(stateModeDef));
Set<String> unhealthyStates = new HashSet<>(UNHEALTHY_STATES);
if (stateModelDefinition != null) {
unhealthyStates.add(stateModelDefinition.getInitialState());
}
for (String partition : externalView.getPartitionSet()) {
Map<String, String> stateByInstanceMap = externalView.getStateMap(partition);
// found the resource hosted on the instance
if (stateByInstanceMap.containsKey(instanceName)) {
int numHealthySiblings = 0;
for (Map.Entry<String, String> entry : stateByInstanceMap.entrySet()) {
if (!entry.getKey().equals(instanceName)
&& !unhealthyStates.contains(entry.getValue())) {
numHealthySiblings++;
}
}
if (numHealthySiblings < minActiveReplicas) {
_logger.info(
"Partition {} doesn't have enough active replicas in sibling nodes. NumHealthySiblings: {}, minActiveReplicas: {}",
partition, numHealthySiblings, minActiveReplicas);
return false;
}
}
}
}
return true;
}
}