UIMA-6111 Don't treat ResourcesUnavailable as a service error
git-svn-id: https://svn.apache.org/repos/asf/uima/uima-ducc/trunk@1864725 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/resources/default.ducc.properties b/src/main/resources/default.ducc.properties
index 932d7a5..8857a40 100644
--- a/src/main/resources/default.ducc.properties
+++ b/src/main/resources/default.ducc.properties
@@ -681,10 +681,11 @@
# If a node becomes inoperative, the Resource Manager deallocates all processes on that
# node and attempts to reallocate them on other nodes. The node is marked offline and is
# unusable until its heartbeats start up again.
-# The default configuration declares the agent heartbeats to occur at 1 minute intervals.
-# Therefore heartbeats must be missed for five minutes before the Resource Manager takes
+# The default configuration declares the agent heartbeats to occur at 30-second intervals
+# (see ducc.agent.node.metrics.publish.rate).
+# Therefore heartbeats must be missed for 3 minutes (6 consecutive intervals) before the Resource Manager takes
# corrective action.
-ducc.rm.node.stability = 5
+ducc.rm.node.stability = 6
# Which policy to use when shrinking/evicting shares - alternatively, SHRINK_BY_MACHINE.
# The eviction.policy is a heuristic to choose which processes of a job to preempt because of
diff --git a/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java b/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
index 61e2b06..a1eee31 100644
--- a/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
+++ b/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
@@ -1497,7 +1497,12 @@
logger.info(methodName, id, "Instance", inst_id, "stopped by SM. Not restarting.");
} else {
// An instance stopped and we (SM) didn't ask it to - by definition this is failure no matter how it exits.
-
+
+ // If the RM purges the instance on a node failure, the service should not be penalized (UIMA-6111).
+ if (jct == JobCompletionType.ResourcesUnavailable) {
+ logger.info(methodName, id, "Instance purged by RM (node died?) - prior state[", old_state,
+ "] current state[", state, "] completion[", jct, "]");
+ } else {
switch ( old_state ) {
case WaitingForServices:
case WaitingForResources:
@@ -1515,6 +1520,7 @@
logger.info(methodName, id, "Instance stopped unexpectedly: prior state[", old_state, " completion[", jct, "]");
break;
}
+ }
if ( excessiveFailures() ) {
String disable_reason = null;