blob: bafab56aced1d16f960fa7c8ec33f99bc96dbecc [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
package org.apache.hadoop.hdds.scm.container.states;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplicaNotFoundException;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Set;
import java.util.Collections;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.ConcurrentHashMap;
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes
* Container State Map acts like a unified map for various attributes that are
* used to select containers when we need to allocate blocks.
* <p>
* This class provides the ability to query 4 classes of attributes. They are:
* <p>
* 1. LifeCycleStates - The LifeCycle state of a container describes which
* state the container is in. For example, a container needs to be in the
* Open state for a client to be able to write to it.
* <p>
* 2. Owners - Each instance of a name service -- for example, the Namenode of
* HDFS, the Ozone Manager (OM) of Ozone, or a CBlockServer -- is an owner. It
* is possible to have many OMs for an Ozone cluster and only one SCM, but SCM
* keeps the data from each OM in a separate bucket, never mixing them. To
* write data, we often have to find all open containers for a specific owner.
* <p>
* 3. ReplicationType - The clients are allowed to specify what kind of
* replication pipeline they want to use. Each Container exists on top of a
* pipeline, so we need to get ReplicationType that is specified by the user.
* <p>
* 4. ReplicationFactor - The replication factor represents how many copies
* of data should be made. Right now we support 2 different factors: ONE
* replica and THREE replicas. The user can specify how many copies should be
* made for an Ozone key.
* <p>
* The most common access pattern of this class is to select a container based
* on all these parameters. For example, when allocating a block we will
* select a container that belongs to user1, with Ratis replication, which can
* make 3 copies of data. By default we look for open containers, and if we
* cannot find any we add new containers.
public class ContainerStateMap {
private static final Logger LOG =
private final static NavigableSet<ContainerID> EMPTY_SET =
Collections.unmodifiableNavigableSet(new TreeSet<>());
private final ContainerAttribute<LifeCycleState> lifeCycleStateMap;
private final ContainerAttribute<String> ownerMap;
private final ContainerAttribute<ReplicationFactor> factorMap;
private final ContainerAttribute<ReplicationType> typeMap;
private final Map<ContainerID, ContainerInfo> containerMap;
private final Map<ContainerID, Set<ContainerReplica>> replicaMap;
private final Map<ContainerQueryKey, NavigableSet<ContainerID>> resultCache;
// Container State Map lock should be held before calling into
// Update ContainerAttributes. The consistency of ContainerAttributes is
// protected by this lock.
private final ReadWriteLock lock;
/**
 * Creates an empty ContainerStateMap.
 */
public ContainerStateMap() {
  // Lock that protects the consistency of the attribute indexes below.
  this.lock = new ReentrantReadWriteLock();

  // Primary stores, keyed by container ID.
  this.containerMap = new ConcurrentHashMap<>();
  this.replicaMap = new ConcurrentHashMap<>();

  // Secondary indexes, one per queryable attribute.
  this.lifeCycleStateMap = new ContainerAttribute<>();
  this.ownerMap = new ContainerAttribute<>();
  this.factorMap = new ContainerAttribute<>();
  this.typeMap = new ContainerAttribute<>();

  // Cache of query results, keyed by the query that produced them.
  this.resultCache = new ConcurrentHashMap<>();
}
* Adds a ContainerInfo Entry in the ContainerStateMap.
* @param info - container info
* @throws SCMException - if a container with the same ID already exists.
public void addContainer(final ContainerInfo info)
throws SCMException {
Preconditions.checkNotNull(info, "Container Info cannot be null");
Preconditions.checkArgument(info.getReplicationFactor().getNumber() > 0,
"ExpectedReplicaCount should be greater than 0");
try {
final ContainerID id = info.containerID();
if (containerMap.putIfAbsent(id, info) != null) {
LOG.debug("Duplicate container ID detected. {}", id);
throw new
SCMException("Duplicate container ID detected.",
lifeCycleStateMap.insert(info.getState(), id);
ownerMap.insert(info.getOwner(), id);
factorMap.insert(info.getReplicationFactor(), id);
typeMap.insert(info.getReplicationType(), id);
replicaMap.put(id, ConcurrentHashMap.newKeySet());
// Flush the cache of this container type, will be added later when
// get container queries are executed.
LOG.trace("Created container with {} successfully.", id);
} finally {
* Removes a Container Entry from ContainerStateMap.
* @param containerID - ContainerID
* @throws ContainerNotFoundException - if the container cannot be found.
public void removeContainer(final ContainerID containerID)
throws ContainerNotFoundException {
Preconditions.checkNotNull(containerID, "ContainerID cannot be null");
try {
// Should we revert back to the original state if any of the below
// remove operation fails?
final ContainerInfo info = containerMap.remove(containerID);
lifeCycleStateMap.remove(info.getState(), containerID);
ownerMap.remove(info.getOwner(), containerID);
factorMap.remove(info.getReplicationFactor(), containerID);
typeMap.remove(info.getReplicationType(), containerID);
// Flush the cache of this container type.
LOG.trace("Removed container with {} successfully.", containerID);
} finally {
* Returns the latest state of Container from SCM's Container State Map.
* @param containerID - ContainerID
* @return container info, if found.
public ContainerInfo getContainerInfo(final ContainerID containerID)
throws ContainerNotFoundException {
try {
return containerMap.get(containerID);
} finally {
* Returns the latest set of replicas known for the given containerId.
* Throws a ContainerNotFoundException if no entry is found for the given
* containerId.
* @param containerID - ContainerID
* @return {@code Set<ContainerReplica>}
public Set<ContainerReplica> getContainerReplicas(
final ContainerID containerID) throws ContainerNotFoundException {
try {
return Collections
} finally {
* Adds given datanodes as nodes where replica for given containerId exist.
* Logs a debug entry if a datanode is already added as replica for given
* ContainerId.
* @param containerID
* @param replica
public void updateContainerReplica(final ContainerID containerID,
final ContainerReplica replica) throws ContainerNotFoundException {
try {
Set<ContainerReplica> replicas = replicaMap.get(containerID);
} finally {
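* Removes a container replica from the given container.
* @param containerID - ContainerID
* @param replica - replica to remove
* @throws ContainerNotFoundException - if the container cannot be found.
* @throws ContainerReplicaNotFoundException - if the replica is not
* registered for the container.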
public void removeContainerReplica(final ContainerID containerID,
final ContainerReplica replica)
throws ContainerNotFoundException, ContainerReplicaNotFoundException {
try {
if(!replicaMap.get(containerID).remove(replica)) {
throw new ContainerReplicaNotFoundException(
"Container #"
+ containerID.getId() + ", replica: " + replica);
} finally {
* Just update the container State.
* @param info ContainerInfo.
public void updateContainerInfo(final ContainerInfo info)
throws ContainerNotFoundException {
try {
final ContainerInfo currentInfo = containerMap.get(info.containerID());
flushCache(info, currentInfo);
containerMap.put(info.containerID(), info);
} finally {
* Update the State of a container.
* @param containerID - ContainerID
* @param currentState - CurrentState
* @param newState - NewState.
* @throws SCMException - in case of failure.
public void updateState(ContainerID containerID, LifeCycleState currentState,
LifeCycleState newState) throws SCMException, ContainerNotFoundException {
try {
final ContainerInfo currentInfo = containerMap.get(containerID);
try {
// We are updating two places before this update is done, these can
// fail independently, since the code needs to handle it.
// We update the attribute map, if that fails it will throw an
// exception, so no issues, if we are successful, we keep track of the
// fact that we have updated the lifecycle state in the map, and update
// the container state. If this second update fails, we will attempt to
// roll back the earlier change we did. If the rollback fails, we can
// be in an inconsistent state,
lifeCycleStateMap.update(currentState, newState, containerID);
if (LOG.isTraceEnabled()) {
LOG.trace("Updated the container {} to new state. Old = {}, new = " +
"{}", containerID, currentState, newState);
// Just flush both old and new data sets from the result cache.
} catch (SCMException ex) {
LOG.error("Unable to update the container state.", ex);
// we need to revert the change in this attribute since we are not
// able to update the hash table."Reverting the update to lifecycle state. Moving back to " +
"old state. Old = {}, Attempted state = {}", currentState,
// if this line throws, the state map can be in an inconsistent
// state, since we will have modified the attribute but the
// container state will not be in sync, as we were not able to put
// that into the hash table.
lifeCycleStateMap.update(newState, currentState, containerID);
throw new SCMException("Updating the container map failed.", ex,
} finally {
public Set<ContainerID> getAllContainerIDs() {
return Collections.unmodifiableSet(containerMap.keySet());
* Returns the IDs of the containers owned by a name service.
* @param ownerName - Name of the NameService.
* @return - NavigableSet of ContainerIDs.
NavigableSet<ContainerID> getContainerIDsByOwner(final String ownerName) {
try {
return ownerMap.getCollection(ownerName);
} finally {
* Returns Containers in the System by the Type.
* @param type - Replication type -- StandAlone, Ratis etc.
* @return NavigableSet
NavigableSet<ContainerID> getContainerIDsByType(final ReplicationType type) {
try {
return typeMap.getCollection(type);
} finally {
* Returns Containers by replication factor.
* @param factor - Replication Factor.
* @return NavigableSet.
NavigableSet<ContainerID> getContainerIDsByFactor(
final ReplicationFactor factor) {
try {
return factorMap.getCollection(factor);
} finally {
* Returns Containers by State.
* @param state - State - Open, Closed etc.
* @return NavigableSet of ContainerIDs in the given state.
public NavigableSet<ContainerID> getContainerIDsByState(
final LifeCycleState state) {
try {
return lifeCycleStateMap.getCollection(state);
} finally {
* Gets the containers that match the following filters.
* @param state - LifeCycleState
* @param owner - Owner
* @param factor - Replication Factor
* @param type - Replication Type
* @return NavigableSet of matching ContainerIDs; empty if no container
* satisfies the criteria.
public NavigableSet<ContainerID> getMatchingContainerIDs(
final LifeCycleState state, final String owner,
final ReplicationFactor factor, final ReplicationType type) {
Preconditions.checkNotNull(state, "State cannot be null");
Preconditions.checkNotNull(owner, "Owner cannot be null");
Preconditions.checkNotNull(factor, "Factor cannot be null");
Preconditions.checkNotNull(type, "Type cannot be null");
try {
final ContainerQueryKey queryKey =
new ContainerQueryKey(state, owner, factor, type);
return resultCache.get(queryKey);
// If we cannot meet any one condition we return EMPTY_SET immediately.
// Since when we intersect these sets, the result will be empty if any
// one is empty.
final NavigableSet<ContainerID> stateSet =
if (stateSet.size() == 0) {
return EMPTY_SET;
final NavigableSet<ContainerID> ownerSet =
if (ownerSet.size() == 0) {
return EMPTY_SET;
final NavigableSet<ContainerID> factorSet =
if (factorSet.size() == 0) {
return EMPTY_SET;
final NavigableSet<ContainerID> typeSet =
if (typeSet.size() == 0) {
return EMPTY_SET;
// if we add more constraints we will just add those sets here..
final NavigableSet<ContainerID>[] sets = sortBySize(stateSet,
ownerSet, factorSet, typeSet);
NavigableSet<ContainerID> currentSet = sets[0];
// We take the smallest set and intersect against the larger sets. This
// allows us to reduce the lookups to the least possible number.
for (int x = 1; x < sets.length; x++) {
currentSet = intersectSets(currentSet, sets[x]);
resultCache.put(queryKey, currentSet);
return currentSet;
} finally {
* Calculates the intersection between sets and returns a new set.
* @param smaller - First Set
* @param bigger - Second Set
* @return resultSet which is the intersection of these two sets.
private NavigableSet<ContainerID> intersectSets(
final NavigableSet<ContainerID> smaller,
final NavigableSet<ContainerID> bigger) {
Preconditions.checkState(smaller.size() <= bigger.size(),
"This function assumes the first set is lesser or equal to second " +
final NavigableSet<ContainerID> resultSet = new TreeSet<>();
for (ContainerID id : smaller) {
if (bigger.contains(id)) {
return resultSet;
* Sorts a list of Sets based on Size. This is useful when we are
* intersecting the sets.
* @param sets - varagrs of sets
* @return Returns a sorted array of sets based on the size of the set.
private NavigableSet<ContainerID>[] sortBySize(
final NavigableSet<ContainerID>... sets) {
for (int x = 0; x < sets.length - 1; x++) {
for (int y = 0; y < sets.length - x - 1; y++) {
if (sets[y].size() > sets[y + 1].size()) {
final NavigableSet temp = sets[y];
sets[y] = sets[y + 1];
sets[y + 1] = temp;
return sets;
private void flushCache(final ContainerInfo... containerInfos) {
for (ContainerInfo containerInfo : containerInfos) {
final ContainerQueryKey key = new ContainerQueryKey(
private void checkIfContainerExist(ContainerID containerID)
throws ContainerNotFoundException {
if (!containerMap.containsKey(containerID)) {
throw new ContainerNotFoundException("Container with id " +
containerID.getId() + " not found.");