# blob: 9b131558dc24d28aceb89a915362d12ef7ed9d0c [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import marvin
from marvin.cloudstackTestCase import *
from marvin.cloudstackAPI import *
from marvin.lib.utils import *
from marvin.lib.base import *
from marvin.lib.common import *
from nose.plugins.attrib import attr
import random
from ipmisim.ipmisim import IpmiServerContext, IpmiServer, ThreadedIpmiServer
import random
import socket
import sys
import _thread
import time
class TestHAKVM(cloudstackTestCase):
    """Test cases for host HA using KVM host(s).

    The tests drive the host-HA and out-of-band management (OOBM) APIs
    against a single KVM host. Tests that need power actions start an
    ipmisim server in a background thread and point the host's OOBM
    configuration at it.
    """

    def setUp(self):
        """Collect environment details, reset HA state and create test resources."""
        self.testClient = super(TestHAKVM, self).getClsTestClient()
        self.apiclient = self.testClient.getApiClient()
        self.dbclient = self.testClient.getDbConnection()
        self.services = self.testClient.getParsedTestDataConfig()
        self.logger = logging.getLogger('TestHAKVM')
        # No ipmisim server runs until a test starts one; tearDown relies on
        # this attribute always being present.
        self.server = None

        # Get Zone specifics
        self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
        self.hypervisor = self.testClient.getHypervisorInfo()
        self.host = self.getHost()
        # Credentials come from the first host / management server entries of
        # the parsed Marvin configuration.
        self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
        self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
        self.cluster_id = self.host.clusterid

        # Cleanup any existing configs
        self.dbclient.execute("delete from ha_config where resource_type='Host'")

        self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)

        # Use a random port for ipmisim: bind to port 0 so the OS picks a free
        # one, remember it, and release the probe socket right away.
        with socket.socket() as probe:
            probe.bind(('', 0))
            self.serverPort = probe.getsockname()[1]

        # Set Cluster-level setting in order to run tests faster
        ha_settings = (
            ("kvm.ha.activity.check.failure.ratio", "0.6"),
            ("kvm.ha.activity.check.interval", "8"),
            ("kvm.ha.activity.check.max.attempts", "5"),
            ("kvm.ha.activity.check.timeout", "30"),
            ("kvm.ha.degraded.max.period", "30"),
            ("kvm.ha.fence.timeout", "30"),
            ("kvm.ha.health.check.timeout", "30"),
            ("kvm.ha.recover.failure.threshold", "2"),
            ("kvm.ha.recover.timeout", "30"),
            ("kvm.ha.recover.wait.period", "30"),
        )
        for name, value in ha_settings:
            self.updateConfiguration(name, value)

        self.service_offering = ServiceOffering.create(
            self.apiclient,
            self.services["service_offerings"]["hasmall"]
        )
        self.cleanup = [self.service_offering]

        self.template = get_test_template(
            self.apiclient,
            self.zone.id,
            self.hypervisor
        )

        self.configureAndDisableHostHa()

    def updateConfiguration(self, name, value):
        """Set configuration ``name`` to ``value`` scoped to the test host's cluster."""
        cmd = updateConfiguration.updateConfigurationCmd()
        cmd.name = name
        cmd.value = value
        cmd.clusterid = self.cluster_id
        self.apiclient.updateConfiguration(cmd)

    def getFakeMsId(self):
        """Return the random fake management-server id chosen in setUp."""
        return self.fakeMsId

    def getFakeMsRunId(self):
        """Return the fake management-server run id (fake id multiplied by 1000)."""
        return self.fakeMsId * 1000

    def tearDown(self):
        """Disable host HA, purge HA/OOBM rows, release resources and stop ipmisim.

        Raises:
            Exception: wrapping any error hit while cleaning the database or
                the created resources.
        """
        try:
            self.configureAndDisableHostHa()
            self.host = None
            fake_run_id = self.getFakeMsRunId()
            statements = (
                "delete from mshost_peer where peer_runid=%s" % fake_run_id,
                "delete from mshost where runid=%s" % fake_run_id,
                "delete from cluster_details where name='resourceHAEnabled'",
                "delete from data_center_details where name='resourceHAEnabled'",
                "delete from ha_config where resource_type='Host'",
                "delete from oobm where port=%d" % self.getIpmiServerPort(),
                "delete from cluster_details where name='outOfBandManagementEnabled'",
                "delete from data_center_details where name='outOfBandManagementEnabled'",
            )
            try:
                for statement in statements:
                    self.dbclient.execute(statement)
                cleanup_resources(self.apiclient, self.cleanup)
            except Exception as e:
                raise Exception("Warning: Exception during cleanup : %s" % e)
        finally:
            # Always release the ipmisim thread and socket, even when the
            # cleanup above fails, so they do not leak into later tests.
            self.stopIpmiServer()

    def getHostHaEnableCmd(self):
        """Build an enableHAForHost command targeting the test host."""
        cmd = enableHAForHost.enableHAForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getHost(self, hostId=None):
        """Look up a KVM routing host, store it on ``self.host`` and return it.

        Args:
            hostId: optional host id passed through to ``list_hosts`` as ``id``.

        The test is skipped when ``list_hosts`` returns nothing.
        """
        response = list_hosts(
            self.apiclient,
            type='Routing',
            hypervisor='kvm',
            id=hostId
        )
        # At least one KVM host is needed in order to configure host-ha
        if response:
            self.host = response[0]
            return self.host
        # skipTest raises by itself, so no explicit `raise` is needed.
        self.skipTest("Not enough KVM hosts found, skipping host-ha test")

    def getHostHaConfigCmd(self, provider='kvmhaprovider'):
        """Build a configureHAForHost command for the test host and ``provider``."""
        cmd = configureHAForHost.configureHAForHostCmd()
        cmd.provider = provider
        cmd.hostid = self.host.id
        return cmd

    def getHostHaDisableCmd(self):
        """Build a disableHAForHost command targeting the test host."""
        cmd = disableHAForHost.disableHAForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def configureAndEnableHostHa(self):
        """Configure the default HA provider on the test host, then enable HA."""
        self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
        response = self.apiclient.enableHAForHost(self.getHostHaEnableCmd())
        self.assertEqual(response.haenable, True)

    def configureAndDisableHostHa(self):
        """Configure the default HA provider on the test host, then disable HA."""
        self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
        cmd = self.getHostHaDisableCmd()
        response = self.apiclient.disableHAForHost(cmd)
        self.assertEqual(response.hostid, cmd.hostid)
        self.assertEqual(response.haenable, False)

    def enableHostHa(self, hostId):
        """Enable HA for ``hostId``, assert it took effect and return the response."""
        cmd = self.getHostHaEnableCmd()
        cmd.hostid = hostId
        response = self.apiclient.enableHAForHost(cmd)
        self.assertEqual(response.hostid, cmd.hostid)
        self.assertEqual(response.haenable, True)
        return response

    def _executeOnHost(self, command):
        """Run ``command`` on the test host over SSH and return the output."""
        ssh = SshClient(
            self.host.ipaddress,
            port=22,
            user=self.hostConfig["username"],
            passwd=self.hostConfig["password"]
        )
        return ssh.execute(command)

    def disableAgent(self):
        """Disable the cloudstack-agent service on the test host."""
        self._executeOnHost("systemctl disable cloudstack-agent || chkconfig cloudstack-agent off")

    def resetHost(self):
        """Reboot the test host."""
        self._executeOnHost("reboot")

    def enableAgent(self):
        """Enable the cloudstack-agent service on the test host."""
        self._executeOnHost("systemctl enable cloudstack-agent || chkconfig cloudstack-agent on")

    def waitUntilHostInState(self, state="Available", interval=3):
        """Poll the test host until its HA state equals ``state``; fail otherwise.

        Args:
            state: expected value of ``hostha.hastate``.
            interval: first argument handed to ``wait_until``; the second
                argument is fixed at 200.
        """
        def checkForState(expectedState):
            response = self.getHost(self.host.id)
            print("checkForState:: expected=%s, actual=%s" % (state, response.hostha))
            return response.hostha.hastate == expectedState, None

        res, _ = wait_until(interval, 200, checkForState, state)
        if not res:
            self.fail("Failed to see host ha state in :" + state)

    def _waitForStateOrRestartAgent(self, state):
        """Wait for HA ``state``; on any error restart the agent and re-raise.

        Raises:
            Exception: wrapping whatever ``waitUntilHostInState`` raised.
        """
        try:
            self.waitUntilHostInState(state)
        except Exception as e:
            # Bring the agent back before aborting so the host is not left
            # without a running agent.
            self.startAgent()
            raise Exception("Warning: Exception during test execution : %s" % e)

    def deployVM(self):
        """Deploy a VM on the test host and register it for cleanup.

        The test is skipped when the deployment fails.
        """
        try:
            vm = VirtualMachine.create(
                self.apiclient,
                services=self.services["virtual_machine"],
                serviceofferingid=self.service_offering.id,
                templateid=self.template.id,
                zoneid=self.zone.id,
                hostid=self.host.id,
                method="POST"
            )
            self.cleanup.append(vm)
        except Exception as e:
            # Keep the root cause in the log; the skip message stays unchanged.
            self.logger.debug("VM deployment failed: %s" % e)
            self.skipTest("Failed to deploy VM, skipping kvm host-ha test case")

    def skipIfMSIsUnsupported(self):
        """Skip the test unless the management server runs Ubuntu/Debian or a major version below 8."""
        os_details = SshClient(
            self.mgtSvrDetails["mgtSvrIp"],
            22,
            self.mgtSvrDetails["user"],
            self.mgtSvrDetails["passwd"]
        ).execute("/usr/share/cloudstack-common/scripts/vm/hypervisor/versions.sh | cut -d '=' -f2")
        # First output line is used as the OS name, second as its version.
        os_name = os_details[0].lower()
        if 'ubuntu' in os_name or 'debian' in os_name:
            return
        # RHEL < 8 works fine
        os_ver = os_details[1].split('.')[0]
        if float(os_ver) < 8:
            return
        self.skipTest("Skipping since RHEL8 / SUSE have known IPMI issues")

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_disable_oobm_ha_state_ineligible(self):
        """
            Tests that when HA is enabled for a host, if oobm is disabled HA State should turn into Ineligible
        """
        self.logger.debug("Starting test_disable_oobm_ha_state_ineligible")

        # Enable ha for host
        self.configureAndEnableHostHa()

        # Disable OOBM
        self.apiclient.configureOutOfBandManagement(self.getOobmConfigCmd())
        oobm_cmd = self.getOobmDisableCmd()
        response = self.apiclient.disableOutOfBandManagementForHost(oobm_cmd)
        self.assertEqual(response.hostid, oobm_cmd.hostid)
        self.assertEqual(response.enabled, False)

        response = self.getHost(hostId=self.host.id).outofbandmanagement
        self.assertEqual(response.powerstate, 'Disabled')

        # Verify HA State is Ineligible
        self.waitUntilHostInState("Ineligible")

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_configure_default_driver(self):
        """
            Tests host-ha configuration with valid data
        """
        self.logger.debug("Starting test_hostha_configure_default_driver")

        cmd = self.getHostHaConfigCmd()
        response = self.apiclient.configureHAForHost(cmd)
        self.assertEqual(response.hostid, cmd.hostid)
        self.assertEqual(response.haprovider, cmd.provider.lower())

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_enable_ha_when_host_disabled(self):
        """
            Tests Enable HA when host is disabled, should be Ineligible
        """
        self.logger.debug("Starting test_hostha_enable_ha_when_host_disabled")

        # Enable HA
        self.configureAndEnableHostHa()

        # Disable Host
        self.disableHost(self.host.id)

        # Check HA State
        try:
            self.waitUntilHostInState("Ineligible")
        except Exception as e:
            # Re-enable the host before reporting the failure.
            self.enableHost(self.host.id)
            self.fail(e)

        # Enable Host
        self.enableHost(self.host.id)

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_enable_ha_when_host_in_maintenance(self):
        """
            Tests Enable HA when host is in Maintenance mode, should be Ineligible
        """
        self.logger.debug("Starting test_hostha_enable_ha_when_host_in_maintenance")
        self.logger.debug("Pausing to wait for VMs to have finished starting")
        time.sleep(300)

        # Enable HA
        self.configureAndEnableHostHa()

        # Prepare for maintenance Host
        self.setHostToMaintanance(self.host.id)

        # Check HA State
        try:
            self.waitUntilHostInState("Ineligible")
        except Exception as e:
            # Take the host out of maintenance before reporting the failure.
            self.cancelMaintenance()
            self.fail(e)

        # Enable Host
        self.cancelMaintenance()

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_enable_ha_when_host_disconected(self):
        """
            Tests Enable HA when host is disconnected, should be Ineligible
        """
        self.logger.debug("Starting test_hostha_enable_ha_when_host_disconected")

        # Enable HA
        self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
        self.enableHostHa(self.host.id)

        # Make Host Disconnected
        self.killAgent()

        # Check HA State
        try:
            self.waitUntilHostInState("Ineligible")
        except Exception as e:
            # Restart the agent before reporting the failure.
            self.startAgent()
            self.fail(e)

        # Enable Host
        self.startAgent()

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_remove_ha_provider_not_possible(self):
        """
            Tests HA Provider should not be possible to be removed when HA is enabled
        """
        self.logger.debug("Starting test_remove_ha_provider_not_possible")

        # Enable HA
        self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
        self.enableHostHa(self.host.id)

        # Configuring an empty provider must be rejected by the API.
        try:
            self.apiclient.configureHAForHost(self.getHostHaConfigCmd(''))
        except Exception:
            pass
        else:
            self.fail("Expected an exception to be thrown, failing")

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_kvm_host_degraded(self):
        """
            Tests degraded HA state when agent is stopped/killed
        """
        self.skipIfMSIsUnsupported()
        self.configureAndStartIpmiServer()
        self.assertIssueCommandState('ON', 'On')
        self.configureAndEnableHostHa()

        self.deployVM()

        # Start with the available state
        self.waitUntilHostInState("Available")

        # SSH into the KVM Host and stop the agent service
        self.stopAgent()

        # Check if host would go into Suspect state
        self._waitForStateOrRestartAgent("Suspect")

        # Checks if the host would turn into Degraded
        self._waitForStateOrRestartAgent("Degraded")

        self.startAgent()
        self.waitUntilHostInState("Available")

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_kvm_host_recovering(self):
        """
            Tests recovery and fencing HA state transitions
        """
        self.skipIfMSIsUnsupported()
        self.configureAndStartIpmiServer()
        self.assertIssueCommandState('ON', 'On')
        self.configureAndEnableHostHa()

        self.deployVM()

        # Start with the available state
        self.waitUntilHostInState("Available")

        # Kill host by triggering a fault
        self.killAgent()
        self.disableAgent()
        self.resetHost()

        # Check if host would go into Suspect state
        self._waitForStateOrRestartAgent("Suspect")

        # Checks if the host would turn into Recovered
        self._waitForStateOrRestartAgent("Recovered")

        self.enableAgent()
        self.startAgent()
        self.waitUntilHostInState("Available")

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_hostha_kvm_host_fencing(self):
        """
            Tests fencing/fenced HA state when host crashes
        """
        self.logger.debug("Starting test_ha_kvm_host_fencing")

        self.skipIfMSIsUnsupported()
        self.configureAndStartIpmiServer()
        self.assertIssueCommandState('ON', 'On')
        self.configureAndEnableHostHa()

        self.deployVM()

        # Start with the available state
        self.waitUntilHostInState("Available")

        # Fail oobm commands
        cmd = self.getOobmConfigCmd()
        cmd.address = "1.1.1.1"
        self.apiclient.configureOutOfBandManagement(cmd)

        # Kill host by triggering a fault
        self.killAgent()
        self.disableAgent()
        self.resetHost()

        # Check if host would go into Suspect state
        self._waitForStateOrRestartAgent("Suspect")

        # Checks if the host would turn into Fencing
        self._waitForStateOrRestartAgent("Fencing")

        # Allow oobm commands to work now
        self.configureAndEnableOobm()

        # Checks if the host would turn into Fenced
        self._waitForStateOrRestartAgent("Fenced")

        self.enableAgent()
        self.startAgent()
        self.cancelMaintenance()
        self.waitUntilHostInState("Available")

    def configureAndStartIpmiServer(self, power_state=None):
        """
            Setup ipmisim and enable out-of-band management for host
        """
        self.configureAndEnableOobm()
        self.startIpmiServer()
        if power_state:
            bmc = IpmiServerContext().bmc
            bmc.powerstate = power_state

    def assertIssueCommandState(self, command, expected):
        """
            Asserts power action result for a given power command
        """
        if command != 'STATUS':
            self.issuePowerActionCmd(command)
        try:
            response = self.issuePowerActionCmd('STATUS')
            self.assertEqual(response.powerstate, expected)
        except Exception as e:
            # Deliberately best-effort: ipmisim errors must not abort the
            # test, but they are logged instead of vanishing silently.
            self.logger.debug("Ignoring error while checking power state: %s" % e)

    def configureAndEnableOobm(self):
        """Configure OOBM for the test host, enable it and assert it is enabled."""
        self.apiclient.configureOutOfBandManagement(self.getOobmConfigCmd())
        response = self.apiclient.enableOutOfBandManagementForHost(self.getOobmEnableCmd())
        self.assertEqual(response.enabled, True)

    def startIpmiServer(self):
        """Start an ipmisim server on the chosen port in a background thread."""
        def startIpmiServer(tname, server):
            self.debug("Starting ipmisim server")
            try:
                server.serve_forever()
            except Exception as e:
                # The serving thread must never propagate; just record why
                # it stopped.
                self.logger.debug("ipmisim server stopped with error: %s" % e)

        IpmiServerContext('reset')
        ThreadedIpmiServer.allow_reuse_address = False
        server = ThreadedIpmiServer(('0.0.0.0', self.getIpmiServerPort()), IpmiServer)
        _thread.start_new_thread(startIpmiServer, ("ipmi-server", server,))
        self.server = server

    def stopIpmiServer(self):
        """Shut down and close the ipmisim server if one was started."""
        server = getattr(self, 'server', None)
        if server is not None:
            server.shutdown()
            server.server_close()
            # Forget the server so a repeated call is a no-op.
            self.server = None

    def getOobmIssueActionCmd(self):
        """Build an OOBM power-action command for the test host (default action STATUS)."""
        cmd = issueOutOfBandManagementPowerAction.issueOutOfBandManagementPowerActionCmd()
        cmd.hostid = self.host.id
        cmd.action = 'STATUS'
        return cmd

    def issuePowerActionCmd(self, action, timeout=None):
        """Issue OOBM power ``action`` for the test host and return the response.

        Args:
            action: power action name, e.g. 'ON' or 'STATUS'.
            timeout: optional value set on the command when truthy.

        The test is skipped when the known ipmitool session-id error is hit;
        any other error is re-raised unchanged.
        """
        cmd = self.getOobmIssueActionCmd()
        cmd.action = action
        if timeout:
            cmd.timeout = timeout

        try:
            return self.apiclient.issueOutOfBandManagementPowerAction(cmd)
        except Exception as e:
            if "packet session id 0x0 does not match active session" in str(e):
                self.skipTest("Known ipmitool issue hit, skipping test")
            raise

    def getOobmEnableCmd(self):
        """Build an enableOutOfBandManagementForHost command for the test host."""
        cmd = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getOobmDisableCmd(self):
        """Build a disableOutOfBandManagementForHost command for the test host."""
        cmd = disableOutOfBandManagementForHost.disableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getIpmiServerPort(self):
        """Return the port picked in setUp for the ipmisim server."""
        return self.serverPort

    def getOobmConfigCmd(self):
        """Build an OOBM configuration command pointing the test host at ipmisim."""
        cmd = configureOutOfBandManagement.configureOutOfBandManagementCmd()
        cmd.driver = 'ipmitool'  # The default available driver
        cmd.address = self.getIpmiServerIp()
        cmd.port = self.getIpmiServerPort()
        cmd.username = 'admin'
        cmd.password = 'password'
        cmd.hostid = self.host.id
        return cmd

    def getIpmiServerIp(self):
        """Return the local IP address used to reach the management server.

        A UDP socket is connected to the management server address and its
        local endpoint is read back; the socket is closed before returning.
        """
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect((self.mgtSvrDetails["mgtSvrIp"], self.mgtSvrDetails["port"]))
            return s.getsockname()[0]

    def get_non_configured_ha_host(self):
        """Return the first routing host without an HA provider, or skip the test."""
        response = list_hosts(
            self.apiclient,
            type='Routing'
        )
        hosts = response or []
        # Inspect every host before giving up instead of skipping on the
        # first one that already has a provider.
        for host in hosts:
            if host.haprovider is None:
                return host
        if hosts:
            self.skipTest("There is no non configured hosts. Skipping test.")
        self.skipTest("No KVM hosts found, skipping host-ha test")

    def getHAState(self, id):
        """Return the first HA resource entry listed for host ``id``."""
        cmd = listHostHAResources.listHostHAResourcesCmd()
        cmd.hostid = id
        response = self.apiclient.listHostHAResources(cmd)
        return response[0]

    def startAgent(self):
        """Start the cloudstack-agent service on the test host."""
        self._executeOnHost("systemctl start cloudstack-agent || service cloudstack-agent start")

    def stopAgent(self):
        """Stop the cloudstack-agent service on the test host."""
        self._executeOnHost("systemctl stop cloudstack-agent || service cloudstack-agent stop")

    def killAgent(self):
        """Send SIGKILL to processes matching 'cloudstack-agent' on the test host."""
        self._executeOnHost("kill -9 $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")

    def disableHost(self, id):
        """Set host ``id`` allocation state to Disable and assert it is Disabled."""
        cmd = updateHost.updateHostCmd()
        cmd.id = id
        cmd.allocationstate = "Disable"
        response = self.apiclient.updateHost(cmd)
        self.assertEqual(response.resourcestate, "Disabled")

    def enableHost(self, id):
        """Set host ``id`` allocation state to Enable and assert it is Enabled."""
        cmd = updateHost.updateHostCmd()
        cmd.id = id
        cmd.allocationstate = "Enable"
        response = self.apiclient.updateHost(cmd)
        self.assertEqual(response.resourcestate, "Enabled")

    def setHostToMaintanance(self, id):
        """Request maintenance for host ``id`` and assert PrepareForMaintenance."""
        cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
        cmd.id = id
        response = self.apiclient.prepareHostForMaintenance(cmd)
        self.assertEqual(response.resourcestate, "PrepareForMaintenance")

    def cancelMaintenance(self):
        """Cancel maintenance on the test host and assert it is Enabled again."""
        cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
        cmd.id = self.host.id
        response = self.apiclient.cancelHostMaintenance(cmd)
        self.assertEqual(response.resourcestate, "Enabled")