Catch exception inside ITRetryUtil to fix one of the causes for flaky integration tests (#11265)
* Do not stop retrying when an exception is encountered. Save and propagate the last exception if the retry count is exceeded.
* Add one more log message to help with debugging
* Limit the schema registry heap (-Xmx32m) to try to control OOMs
diff --git a/integration-tests/docker/docker-compose.base.yml b/integration-tests/docker/docker-compose.base.yml
index f8119de..9267b1a 100644
--- a/integration-tests/docker/docker-compose.base.yml
+++ b/integration-tests/docker/docker-compose.base.yml
@@ -398,4 +398,4 @@
SCHEMA_REGISTRY_AUTHENTICATION_METHOD: BASIC
SCHEMA_REGISTRY_AUTHENTICATION_REALM: druid
SCHEMA_REGISTRY_AUTHENTICATION_ROLES: users
- SCHEMA_REGISTRY_OPTS: -Djava.security.auth.login.config=/usr/lib/druid/conf/jaas_config.file
+ SCHEMA_REGISTRY_OPTS: -Djava.security.auth.login.config=/usr/lib/druid/conf/jaas_config.file -Xmx32m
diff --git a/integration-tests/src/main/java/org/apache/druid/testing/utils/ITRetryUtil.java b/integration-tests/src/main/java/org/apache/druid/testing/utils/ITRetryUtil.java
index 3ef2f71..e43c26d 100644
--- a/integration-tests/src/main/java/org/apache/druid/testing/utils/ITRetryUtil.java
+++ b/integration-tests/src/main/java/org/apache/druid/testing/utils/ITRetryUtil.java
@@ -52,23 +52,43 @@
String taskMessage
)
{
- try {
- int currentTry = 0;
- while (callable.call() != expectedValue) {
- if (currentTry > retryCount) {
- throw new ISE("Max number of retries[%d] exceeded for Task[%s]. Failing.", retryCount, taskMessage);
+ int currentTry = 0;
+ Exception lastException = null;
+
+ while (true) {
+ try {
+ LOG.info("Trying attempt[%d/%d]...", currentTry, retryCount);
+ if (currentTry > retryCount || callable.call() == expectedValue) {
+ break;
}
LOG.info(
- "Attempt[%d]: Task %s still not complete. Next retry in %d ms",
- currentTry, taskMessage, delayInMillis
+ "Attempt[%d/%d] did not pass: Task %s still not complete. Next retry in %d ms",
+ currentTry, retryCount, taskMessage, delayInMillis
);
Thread.sleep(delayInMillis);
-
currentTry++;
}
+ catch (Exception e) {
+ // just continue retrying if there is an exception (it may be transient!) but save the last:
+ lastException = e;
+ }
}
- catch (Exception e) {
- throw new RuntimeException(e);
+
+ if (currentTry > retryCount) {
+ if (lastException != null) {
+ throw new ISE(
+ "Max number of retries[%d] exceeded for Task[%s]. Failing.",
+ retryCount,
+ taskMessage,
+ lastException
+ );
+ } else {
+ throw new ISE(
+ "Max number of retries[%d] exceeded for Task[%s]. Failing.",
+ retryCount,
+ taskMessage
+ );
+ }
}
}