fix race in DSTTest.testCancel
seeing errors on jenkins server though it runs fine locally;
have patched what looks like one race, but not sure it explains the failure
at https://builds.apache.org/job/incubator-brooklyn-pull-requests/org.apache.brooklyn$brooklyn-core/2440/testReport/junit/org.apache.brooklyn.util.core.task/DynamicSequentialTaskTest/testCancelled/
where we are seeing an extra semaphore at the end of the test;
have added comments and logging in case it is seen again.
(locally all is well.)
diff --git a/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicExecutionManager.java b/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicExecutionManager.java
index 0aab7d5..c75ad0b 100644
--- a/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicExecutionManager.java
+++ b/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicExecutionManager.java
@@ -67,7 +67,6 @@
import com.google.common.base.CaseFormat;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
-import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ExecutionList;
@@ -565,7 +564,15 @@
int subtasksReallyCancelled=0;
if (task instanceof HasTaskChildren) {
- for (Task<?> child: ((HasTaskChildren)task).getChildren()) {
+ // cancel tasks in reverse order --
+ // it should be the case that if child1 is cancelled,
+ // a parentTask should NOT call a subsequent child2,
+ // but just in case, we cancel child2 first
+ // NB: DST and others may apply their own recursive cancel behaviour
+ MutableList<Task<?>> childrenReversed = MutableList.copyOf( ((HasTaskChildren)task).getChildren() );
+ Collections.reverse(childrenReversed);
+
+ for (Task<?> child: childrenReversed) {
if (log.isTraceEnabled()) {
log.trace("Cancelling "+child+" on recursive cancellation of "+task);
}
diff --git a/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicTask.java b/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicTask.java
index 7c29bba..efd3001 100644
--- a/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicTask.java
+++ b/brooklyn-server/core/src/main/java/org/apache/brooklyn/util/core/task/BasicTask.java
@@ -298,7 +298,7 @@
public synchronized boolean cancel(TaskCancellationMode mode) {
if (isDone()) return false;
if (log.isTraceEnabled()) {
- log.trace("BT cancelling "+this+" mode "+mode);
+ log.trace("BT cancelling "+this+" mode "+mode+", from thread "+Thread.currentThread());
}
cancelled = true;
doCancel(mode);
diff --git a/brooklyn-server/core/src/test/java/org/apache/brooklyn/util/core/task/DynamicSequentialTaskTest.java b/brooklyn-server/core/src/test/java/org/apache/brooklyn/util/core/task/DynamicSequentialTaskTest.java
index 364870a..763c067 100644
--- a/brooklyn-server/core/src/test/java/org/apache/brooklyn/util/core/task/DynamicSequentialTaskTest.java
+++ b/brooklyn-server/core/src/test/java/org/apache/brooklyn/util/core/task/DynamicSequentialTaskTest.java
@@ -139,6 +139,7 @@
Thread.sleep(duration.toMillisecondsRoundingUp());
}
} catch (InterruptedException e) {
+ log.info("releasing semaphore on interruption after saying "+message);
cancellations.release();
throw Exceptions.propagate(e);
}
@@ -159,7 +160,8 @@
}
public Task<String> sayTask(String message, Duration duration, String message2) {
- return Tasks.<String>builder().displayName("say:"+message).body(sayCallable(message, duration, message2)).build();
+ return Tasks.<String>builder().displayName("say:"+message+(duration!=null ? ":wait("+duration+")" : "")+(message2!=null ? ":"+message2 : ""))
+ .body(sayCallable(message, duration, message2)).build();
}
public <T> Task<T> submitting(final Task<T> task) {
@@ -193,33 +195,52 @@
sayTask("2a", Duration.THIRTY_SECONDS, "2b"),
sayTask("3"));
ec.submit(t);
-
+
+ // wait for 2 to start, saying "2a", and the first interruptible block is when it waits for its 30s
waitForMessages(Predicates.compose(MathPredicates.greaterThanOrEqual(2), CollectionFunctionals.sizeFunction()), TIMEOUT);
Assert.assertEquals(messages, Arrays.asList("1", "2a"));
- Time.sleep(Duration.millis(50));
+
+ // now cancel, and make sure we get the right behaviour
t.cancel(true);
Assert.assertTrue(t.isDone());
- // 2 should get cancelled, and invoke the cancellation semaphore
+ // 2 should get cancelled, and invoke the cancellation semaphore, but not say 2b
// 3 should get cancelled and not run at all
- Assert.assertEquals(messages, Arrays.asList("1", "2a"));
- // Need to ensure that 2 has been started; race where we might cancel it before its run method
- // is even begun. Hence doing "2a; pause; 2b" where nothing is interruptable before pause.
+ // cancel(..) currently cancels everything in the tree in the calling thread
+ // so we could even assert task3.isCancelled() now
+ // but not sure we will guarantee that for subtasks, so weaker assertion
+ // that it is eventually cancelled, and that it for sure never starts
+
+ // message list is still 1, 2a
+ Assert.assertEquals(messages, Arrays.asList("1", "2a"));
+ // And 2 when cancelled should release the semaphore
+ log.info("testCancelled waiting on semaphore; permits left is "+cancellations.availablePermits());
Assert.assertTrue(cancellations.tryAcquire(10, TimeUnit.SECONDS));
+ log.info("testCancelled acquired semaphore; permits left is "+cancellations.availablePermits());
Iterator<Task<?>> ci = ((HasTaskChildren)t).getChildren().iterator();
+ // 1 completed fine
Assert.assertEquals(ci.next().get(), "1");
+ // 2 is cancelled -- cancelled flag should always be set *before* the interrupt sent
+ // (and that released the semaphore above, even if thread is not completed, so this should be set)
Task<?> task2 = ci.next();
Assert.assertTrue(task2.isBegun());
Assert.assertTrue(task2.isDone());
Assert.assertTrue(task2.isCancelled());
-
+
Task<?> task3 = ci.next();
+ // 3 is being cancelled in the thread which cancelled 2, and should either be
+ // *before* 2 was cancelled or *not run* because the parent was cancelled
+ // so we shouldn't need to wait ... but if we did:
+// Asserts.eventually(Suppliers.ofInstance(task3), TaskPredicates.isDone());
+ Assert.assertTrue(task3.isDone());
+ Assert.assertTrue(task3.isCancelled());
Assert.assertFalse(task3.isBegun());
- Assert.assertTrue(task2.isDone());
- Assert.assertTrue(task2.isCancelled());
-
- // but we do _not_ get a mutex from task3 as it does not run (is not interrupted)
+ // messages unchanged
+ Assert.assertEquals(messages, Arrays.asList("1", "2a"));
+ // no further mutexes should be available (ie 3 should not run)
+ // TODO for some reason this was observed to fail on the jenkins box (2016-01)
+ // but i can't see why; have added logging in case it happens again
Assert.assertEquals(cancellations.availablePermits(), 0);
}