blob: fded493bd86e8ca18fe8ccf0e46f8861f236cc1a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ratis;
import org.apache.log4j.Level;
import org.apache.ratis.conf.RaftProperties;
import org.apache.ratis.metrics.RatisMetricRegistry;
import org.apache.ratis.protocol.RaftPeerId;
import org.apache.ratis.server.RaftServer;
import org.apache.ratis.server.RaftServerConfigKeys;
import org.apache.ratis.server.impl.MiniRaftCluster;
import org.apache.ratis.server.impl.RaftServerTestUtil;
import org.apache.ratis.server.metrics.RaftServerMetricsImpl;
import org.apache.ratis.server.simulation.MiniRaftClusterWithSimulatedRpc;
import org.apache.ratis.proto.RaftProtos;
import org.apache.ratis.statemachine.SimpleStateMachine4Testing;
import org.apache.ratis.statemachine.StateMachine;
import org.apache.ratis.util.Log4jUtils;
import org.apache.ratis.util.TimeDuration;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import java.util.SortedMap;
import java.util.concurrent.TimeUnit;
import com.codahale.metrics.Gauge;
/**
* Test Raft Server Slowness detection and notification to Leader's statemachine.
*/
//TODO: fix StateMachine.notifySlowness(..); see RATIS-370
@Ignore
public class TestRaftServerSlownessDetection extends BaseTest {
static {
Log4jUtils.setLogLevel(RaftServer.Division.LOG, Level.DEBUG);
}
public static final int NUM_SERVERS = 3;
protected static final RaftProperties properties = new RaftProperties();
private final MiniRaftClusterWithSimulatedRpc cluster = MiniRaftClusterWithSimulatedRpc
.FACTORY.newCluster(NUM_SERVERS, getProperties());
public RaftProperties getProperties() {
RaftServerConfigKeys.Rpc
.setSlownessTimeout(properties, TimeDuration.valueOf(1, TimeUnit.SECONDS));
properties.setClass(MiniRaftCluster.STATEMACHINE_CLASS_KEY,
SimpleStateMachine4Testing.class, StateMachine.class);
return properties;
}
@Before
public void setup() throws IOException {
Assert.assertNull(cluster.getLeader());
cluster.start();
}
@After
public void tearDown() {
if (cluster != null) {
cluster.shutdown();
}
}
@Test
public void testSlownessDetection() throws Exception {
RaftServer.Division leaderServer = RaftTestUtil.waitForLeader(cluster);
long slownessTimeout = RaftServerConfigKeys.Rpc
.slownessTimeout(cluster.getProperties()).toIntExact(TimeUnit.MILLISECONDS);
RaftServer.Division failedFollower = cluster.getFollowers().get(0);
final RatisMetricRegistry ratisMetricRegistry
= ((RaftServerMetricsImpl)leaderServer.getRaftServerMetrics()).getRegistry();
SortedMap<String, Gauge> heartbeatElapsedTimeGauges =
ratisMetricRegistry.getGauges((s, metric) ->
s.contains("lastHeartbeatElapsedTime"));
String followerId = failedFollower.getId().toString();
Gauge metric = heartbeatElapsedTimeGauges.entrySet().parallelStream().filter(e -> e.getKey().contains(
followerId)).iterator().next().getValue();
long followerHeartBeatElapsedMetric = (long) metric.getValue();
// fail the node and wait for the callback to be triggered
cluster.killServer(failedFollower.getId());
Thread.sleep( slownessTimeout * 2);
long followerHeartBeatElapsedMetricNew = (long) metric.getValue();
Assert.assertTrue(followerHeartBeatElapsedMetricNew > followerHeartBeatElapsedMetric);
// Followers should not get any failed not notification
for (RaftServer.Division followerServer : cluster.getFollowers()) {
Assert.assertNull(SimpleStateMachine4Testing.get(followerServer).getSlownessInfo());
}
// the leader should get notification that the follower has failed now
RaftProtos.RoleInfoProto roleInfoProto =
SimpleStateMachine4Testing.get(cluster.getLeader()).getSlownessInfo();
Assert.assertNotNull(roleInfoProto);
List<RaftProtos.ServerRpcProto> followers =
roleInfoProto.getLeaderInfo().getFollowerInfoList();
//Assert that the node shutdown is lagging behind
for (RaftProtos.ServerRpcProto serverProto : followers) {
if (RaftPeerId.valueOf(serverProto.getId().getId()).equals(failedFollower.getId())) {
Assert.assertTrue(serverProto.getLastRpcElapsedTimeMs() > slownessTimeout);
}
}
}
}