// blob: fae1b31e6e8ac4527798593a8a08b7bd36e02bb4 [file] [log] [blame]
package org.apache.helix.integration.rebalancer;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.lang.management.ManagementFactory;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.management.MBeanServerConnection;
import javax.management.MalformedObjectNameException;
import javax.management.ObjectName;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.HelixAdmin;
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.PropertyKey;
import org.apache.helix.TestHelper;
import org.apache.helix.common.ZkTestBase;
import org.apache.helix.integration.manager.ClusterControllerManager;
import org.apache.helix.integration.manager.MockParticipantManager;
import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.manager.zk.ZKHelixDataAccessor;
import org.apache.helix.model.BuiltInStateModelDefinitions;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.MaintenanceSignal;
import org.apache.helix.monitoring.mbeans.MonitorDomainNames;
import org.apache.helix.tools.ClusterVerifiers.BestPossibleExternalViewVerifier;
import org.apache.helix.tools.ClusterVerifiers.ZkHelixClusterVerifier;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import static org.apache.helix.monitoring.mbeans.ClusterStatusMonitor.CLUSTER_DN_KEY;
import static org.apache.helix.util.StatusUpdateUtil.ErrorType.RebalanceResourceFailure;
public class TestClusterInMaintenanceModeWhenReachingOfflineInstancesLimit extends ZkTestBase {
private static final int NUM_NODE = 10;
private static final int START_PORT = 12918;
private static final int _PARTITIONS = 5;
private static final MBeanServerConnection _server = ManagementFactory.getPlatformMBeanServer();
private final String CLASS_NAME = getShortClassName();
private final String CLUSTER_NAME = CLUSTER_PREFIX + "_" + CLASS_NAME;
private ClusterControllerManager _controller;
private List<MockParticipantManager> _participants = new ArrayList<>();
private HelixDataAccessor _dataAccessor;
private int _maxOfflineInstancesAllowed = 4;
@BeforeClass
public void beforeClass() throws Exception {
System.out.println("START " + CLASS_NAME + " at " + new Date(System.currentTimeMillis()));
_gSetupTool.addCluster(CLUSTER_NAME, true);
for (int i = 0; i < NUM_NODE; i++) {
String instanceName = PARTICIPANT_PREFIX + "_" + (START_PORT + i);
_gSetupTool.addInstanceToCluster(CLUSTER_NAME, instanceName);
// start dummy participants
MockParticipantManager participant =
new MockParticipantManager(ZK_ADDR, CLUSTER_NAME, instanceName);
participant.syncStart();
_participants.add(participant);
}
// start controller
String controllerName = CONTROLLER_PREFIX + "_0";
_controller = new ClusterControllerManager(ZK_ADDR, CLUSTER_NAME, controllerName);
_controller.syncStart();
ZkHelixClusterVerifier clusterVerifier =
new BestPossibleExternalViewVerifier.Builder(CLUSTER_NAME).setZkClient(_gZkClient)
.setWaitTillVerify(TestHelper.DEFAULT_REBALANCE_PROCESSING_WAIT_TIME)
.build();
enablePersistBestPossibleAssignment(_gZkClient, CLUSTER_NAME, true);
_dataAccessor = new ZKHelixDataAccessor(CLUSTER_NAME, _baseAccessor);
ConfigAccessor configAccessor = new ConfigAccessor(_gZkClient);
ClusterConfig clusterConfig = configAccessor.getClusterConfig(CLUSTER_NAME);
clusterConfig.setMaxOfflineInstancesAllowed(_maxOfflineInstancesAllowed);
configAccessor.setClusterConfig(CLUSTER_NAME, clusterConfig);
for (int i = 0; i < 3; i++) {
String db = "Test-DB-" + i++;
createResourceWithDelayedRebalance(CLUSTER_NAME, db,
BuiltInStateModelDefinitions.MasterSlave.name(), _PARTITIONS, 3, 3, -1);
}
Assert.assertTrue(clusterVerifier.verifyByPolling());
}
@AfterMethod
public void afterMethod() {
cleanupRebalanceError();
}
@Test
public void testWithDisabledInstancesLimit() throws Exception {
MaintenanceSignal maintenanceSignal =
_dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
Assert.assertNull(maintenanceSignal);
checkForRebalanceError(false);
HelixAdmin admin = new ZKHelixAdmin(_gZkClient);
// disable instance
int i;
for (i = 2; i < 2 + _maxOfflineInstancesAllowed; i++) {
String instance = _participants.get(i).getInstanceName();
admin.enableInstance(CLUSTER_NAME, instance, false);
}
boolean result = TestHelper.verify(() -> {
MaintenanceSignal ms = _dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
return ms == null;
}, TestHelper.WAIT_DURATION);
Assert.assertTrue(result);
String instance = _participants.get(i).getInstanceName();
admin.enableInstance(CLUSTER_NAME, instance, false);
ZkHelixClusterVerifier clusterVerifier =
new BestPossibleExternalViewVerifier.Builder(CLUSTER_NAME).setZkClient(_gZkClient)
.setWaitTillVerify(TestHelper.DEFAULT_REBALANCE_PROCESSING_WAIT_TIME)
.build();
Assert.assertTrue(clusterVerifier.verifyByPolling());
result = TestHelper.verify(() -> {
MaintenanceSignal ms =_dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
return ms != null && ms.getReason() != null;
}, TestHelper.WAIT_DURATION);
Assert.assertTrue(result);
checkForRebalanceError(true);
for (i = 2; i < 2 + _maxOfflineInstancesAllowed + 1; i++) {
instance = _participants.get(i).getInstanceName();
admin.enableInstance(CLUSTER_NAME, instance, true);
}
admin.enableMaintenanceMode(CLUSTER_NAME, false);
Assert.assertTrue(clusterVerifier.verifyByPolling());
}
@Test(dependsOnMethods = "testWithDisabledInstancesLimit")
public void testWithOfflineInstancesLimit() throws Exception {
MaintenanceSignal maintenanceSignal =
_dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
Assert.assertNull(maintenanceSignal);
checkForRebalanceError(false);
int i;
for (i = 2; i < 2 + _maxOfflineInstancesAllowed; i++) {
_participants.get(i).syncStop();
}
boolean result = TestHelper.verify(() -> {
MaintenanceSignal ms = _dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
return ms == null;
}, TestHelper.WAIT_DURATION);
Assert.assertTrue(result);
_participants.get(i).syncStop();
result = TestHelper.verify(() -> {
MaintenanceSignal ms =_dataAccessor.getProperty(_dataAccessor.keyBuilder().maintenance());
return ms != null && ms.getReason() != null;
}, TestHelper.WAIT_DURATION);
Assert.assertTrue(result);
// Verify there is rebalance error logged
checkForRebalanceError(true);
}
@AfterClass
public void afterClass() throws Exception {
/*
* shutdown order: 1) disconnect the controller 2) disconnect participants
*/
_controller.syncStop();
for (MockParticipantManager participant : _participants) {
participant.syncStop();
}
deleteCluster(CLUSTER_NAME);
System.out.println("END " + CLASS_NAME + " at " + new Date(System.currentTimeMillis()));
}
private void checkForRebalanceError(final boolean expectError) throws Exception {
boolean result = TestHelper.verify(() -> {
/*
* TODO re-enable this check when we start recording rebalance error again
* ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(CLUSTER_NAME, _baseAccessor);
* PropertyKey errorNodeKey =
* accessor.keyBuilder().controllerTaskError(RebalanceResourceFailure.name());
* Assert.assertEquals(accessor.getProperty(errorNodeKey) != null, expectError);
*/
Long value =
(Long) _server.getAttribute(getClusterMbeanName(CLUSTER_NAME), "RebalanceFailureGauge");
return expectError == (value != null && value > 0);
}, TestHelper.WAIT_DURATION);
Assert.assertTrue(result);
}
private void cleanupRebalanceError() {
ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(CLUSTER_NAME, _baseAccessor);
PropertyKey errorNodeKey =
accessor.keyBuilder().controllerTaskError(RebalanceResourceFailure.name());
accessor.removeProperty(errorNodeKey);
}
private ObjectName getClusterMbeanName(String clusterName) throws MalformedObjectNameException {
String clusterBeanName = String.format("%s=%s", CLUSTER_DN_KEY, clusterName);
return new ObjectName(
String.format("%s:%s", MonitorDomainNames.ClusterStatus.name(), clusterBeanName));
}
}