# blob: 7f1d34efdeeba1b1a07a197e483d9e9868ebdbc0
#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from ozone.cluster import OzoneCluster
from ozone.exceptions import ContainerNotFoundError
logger = logging.getLogger(__name__)
def setup_function():
    """Bring up a fresh Ozone cluster before each test case runs."""
    global cluster
    cluster = OzoneCluster.create()
    cluster.start()
def teardown_function():
    """Shut down the cluster created by setup_function after each test."""
    cluster.stop()
def test_isolate_single_datanode():
    """
    In this test case we will create a network partition in such a way that
    one of the DN will not be able to communicate with other datanodes
    but it will be able to communicate with SCM.

    Once the network partition happens, SCM detects it and closes the pipeline,
    which in-turn closes the containers.

    The container on the first two DN will get CLOSED as they have quorum.
    The container replica on the third node will be QUASI_CLOSED as it is not
    able to connect with the other DNs and it doesn't have latest BCSID.

    Once we restore the network, the stale replica on the third DN will be
    deleted and a latest replica will be copied from any one of the other
    DNs.
    """
    om = cluster.om
    scm = cluster.scm
    dns = cluster.datanodes
    client = cluster.client
    oz_client = cluster.get_client()

    # Seed the cluster with some data before partitioning.
    oz_client.run_freon(1, 1, 1, 10240)

    # Partition the network: dns[2] keeps connectivity to OM/SCM/client
    # but cannot reach the other two datanodes.
    first_set = [om, scm, dns[0], dns[1], client]
    second_set = [om, scm, dns[2], client]
    logger.info("Partitioning the network")
    cluster.partition_network(first_set, second_set)
    oz_client.run_freon(1, 1, 1, 10240)

    containers = cluster.get_containers_on_datanode(dns[0])
    # The same set of containers should also be on datanode[2].
    for container in containers:
        assert container.is_on(dns[2])

    logger.info("Waiting for container to be CLOSED")
    for container in containers:
        container.wait_until_one_replica_is_closed()

    for container in containers:
        # Replicas on the two connected datanodes have quorum -> CLOSED.
        assert container.get_state(dns[0]) == 'CLOSED'
        assert container.get_state(dns[1]) == 'CLOSED'
        try:
            # Query the isolated replica's state once instead of twice
            # (the original short-circuit `or` re-queried the datanode).
            assert container.get_state(dns[2]) in ('CLOSING', 'QUASI_CLOSED')
        except ContainerNotFoundError:
            # The stale replica may already have been deleted by the
            # ReplicationManager; a missing replica is an acceptable
            # outcome here. (The original `assert True` was a no-op.)
            pass

    # Since the replica in datanode[2] doesn't have the latest BCSID,
    # ReplicationManager will delete it and copy a closed replica.
    # We will now restore the network and datanode[2] should get a
    # closed replica of the container.
    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the replica to be CLOSED")
    for container in containers:
        container.wait_until_replica_is_closed(dns[2])

    # After re-replication every replica must be CLOSED.
    for container in containers:
        assert container.get_state(dns[0]) == 'CLOSED'
        assert container.get_state(dns[1]) == 'CLOSED'
        assert container.get_state(dns[2]) == 'CLOSED'

    # The cluster must accept writes again after the network is healed.
    exit_code, output = oz_client.run_freon(1, 1, 1, 10240)
    assert exit_code == 0, "freon run failed with output=[%s]" % output
def test_datanode_isolation_all():
    """
    In this test case we will create a network partition in such a way that
    all DNs cannot communicate with each other.
    All DNs will be able to communicate with SCM.

    Once the network partition happens, SCM detects it and closes the pipeline,
    which in-turn tries to close the containers.
    At least one of the replica should be in closed state

    Once we restore the network, there will be three closed replicas.
    """
    om = cluster.om
    scm = cluster.scm
    dns = cluster.datanodes
    client = cluster.client
    oz_client = cluster.get_client()

    oz_client.run_freon(1, 1, 1, 10240)

    logger.info("Partitioning the network")
    # Each datanode ends up alone with OM, SCM and the client, so no
    # datanode can talk to any other datanode.
    partitions = [[om, scm, dn, client] for dn in (dns[0], dns[1], dns[2])]
    cluster.partition_network(*partitions)

    container = cluster.get_containers_on_datanode(dns[0]).pop()

    logger.info("Waiting for a replica to be CLOSED")
    container.wait_until_one_replica_is_closed()

    # At least one of the replica should be in closed state
    assert 'CLOSED' in container.get_datanode_states()

    logger.info("Restoring the network")
    cluster.restore_network()

    logger.info("Waiting for the container to be replicated")
    container.wait_until_all_replicas_are_closed()
    # After restoring the network all the replicas should be in CLOSED state
    assert all(s == 'CLOSED' for s in container.get_datanode_states())

    exit_code, output = oz_client.run_freon(1, 1, 1, 10240)
    assert exit_code == 0, "freon run failed with output=[%s]" % output