Improving local HealthServiceIntegrationTest reliability
There appears to be a race condition with the simulacron library that causes
test failure if we don't wait long enough. This patch simply extends
the window that we can wait for a test node to come back online.
Patch by Jon Haddad; Reviewed by Dinesh Joshi for CASSANDRA-15615
diff --git a/CHANGES.txt b/CHANGES.txt
index dcba20e..d08bc9e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,6 @@
1.0.0
-----
+ * Improving local HealthServiceIntegrationTest reliability (CASSANDRA-15615)
* Read sidecar.yaml from sidecar.config System Property instead of classpath (CASSANDRA-15288)
* Add integration tests task (CASSANDRA-15031)
* Add support for SSL and bindable address (CASSANDRA-15030)
diff --git a/src/integration/java/org/apache/cassandra/sidecar/HealthServiceIntegrationTest.java b/src/integration/java/org/apache/cassandra/sidecar/HealthServiceIntegrationTest.java
index 567f67e..5ad6d28 100644
--- a/src/integration/java/org/apache/cassandra/sidecar/HealthServiceIntegrationTest.java
+++ b/src/integration/java/org/apache/cassandra/sidecar/HealthServiceIntegrationTest.java
@@ -49,10 +49,13 @@
import io.vertx.core.Vertx;
import io.vertx.core.http.HttpServer;
import io.vertx.core.http.HttpServerOptions;
+import io.vertx.core.logging.Logger;
+import io.vertx.core.logging.LoggerFactory;
import io.vertx.ext.web.Router;
import io.vertx.ext.web.client.WebClient;
import io.vertx.ext.web.codec.BodyCodec;
import io.vertx.junit5.VertxTestContext;
+
import org.apache.cassandra.sidecar.routes.HealthCheck;
import org.apache.cassandra.sidecar.routes.HealthService;
@@ -72,6 +75,9 @@
.build();
private static final HashedWheelTimer sharedHWT = new HashedWheelTimer(threadFactory);
private static final EventLoopGroup sharedEventLoopGroup = new NioEventLoopGroup(0, threadFactory);
+
+ private static final Logger logger = LoggerFactory.getLogger(HealthServiceIntegrationTest.class);
+
private static final NettyOptions shared = new NettyOptions()
{
public EventLoopGroup eventLoopGroup(ThreadFactory threadFactory)
@@ -125,6 +131,11 @@
sessions.clear();
}
+ /**
+ * This test has a race condition that can result in test failure. Be sure to wait long enough for the server
+ * to register as up.
+ * See CASSANDRA-15615
+ */
@DisplayName("100 node cluster stopping, then starting")
@Test
public void testDownHost() throws InterruptedException
@@ -143,15 +154,15 @@
Set<BoundNode> downNodes = new HashSet<>();
Map<BoundNode, HealthCheck> checks = new HashMap<>();
- // Create a HealthCheck per node
+ logger.info("Create a health check per node");
for (BoundNode node : bCluster.getNodes())
checks.put(node, healthCheckFor(node, shared));
- // verify all nodes marked as up
+ logger.info("verify all nodes marked as up");
for (BoundNode node : bCluster.getNodes())
assertTrue(checks.get(node).get());
- // shut down nodes one at a time, and verify we get correct response on all HealthChecks every iteration
+ logger.info("shut down nodes one at a time, and verify we get correct response on all HealthChecks");
for (int i = 0; downNodes.size() < nodeCount; i++)
{
for (BoundNode node : bCluster.getNodes())
@@ -160,23 +171,32 @@
downNodes.add(bCluster.node(i));
}
- // all hosts should be down
+ logger.info("all hosts should be down");
for (BoundNode node : bCluster.getNodes())
assertFalse(checks.get(node).get());
- for (int i = 0; downNodes.size() > 0; i++)
+ logger.info("Starting nodes back up");
+
+ int i;
+ for (i = 0; downNodes.size() > 0; i++)
{
bCluster.node(i).start();
downNodes.remove(bCluster.node(i));
}
+ logger.info("Nodes started back up: " + i);
- // verify all nodes marked as up
+ logger.info("verify all nodes marked as up");
+
long start = System.currentTimeMillis();
+
+ int checkNumber = 0;
for (BoundNode node : bCluster.getNodes())
{
- while ((System.currentTimeMillis() - start) < 10000 && !checks.get(node).get())
- Thread.sleep(100);
- assertTrue(checks.get(node).get());
+ while ((System.currentTimeMillis() - start) < 20000 && !checks.get(node).get())
+ Thread.sleep(250);
+ logger.info("Started node " + checkNumber);
+ assertTrue(checks.get(node).get(), "Failed on node " + checkNumber);
+ checkNumber++;
}
}
}