[FLINK-11850][zk] Tolerate concurrent child deletions when deleting owned zNode
When calling ZooKeeperHaServices#closeAndCleanupAllData, a child of the zNode owned by the
ZooKeeperHaServices can be deleted concurrently (e.g. because a LeaderElectionService has
been shut down). In order to tolerate such concurrent deletions, we now retry the delete
operation whenever it fails with a KeeperException.NoNodeException.
This closes #7929.
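The retry pattern applied in the diff below can be sketched in isolation as follows. This is a minimal, self-contained simulation: `ConcurrentDeleteException` and `FlakyStore` are hypothetical stand-ins for `KeeperException.NoNodeException` and the Curator client, whose recursive delete can fail when a child zNode disappears mid-delete (CURATOR-430).

```java
// Sketch of the retry-until-success pattern from the patch, with hypothetical
// stand-ins (ConcurrentDeleteException, FlakyStore) for the real ZooKeeper types.
public class RetryDeleteSketch {

    /** Stand-in for KeeperException.NoNodeException. */
    static class ConcurrentDeleteException extends Exception {}

    /** Simulates a store whose recursive delete fails the first N attempts. */
    static class FlakyStore {
        private int failuresLeft;

        FlakyStore(int failures) {
            this.failuresLeft = failures;
        }

        void deleteRecursively() throws ConcurrentDeleteException {
            if (failuresLeft-- > 0) {
                // a child was deleted concurrently, as in CURATOR-430
                throw new ConcurrentDeleteException();
            }
        }
    }

    /** Mirrors the while-loop in the patch: retry until the delete succeeds. */
    static int deleteWithRetry(FlakyStore store) {
        int attempts = 0;
        boolean deleted = false;
        while (!deleted) {
            attempts++;
            try {
                store.deleteRecursively();
                deleted = true;
            } catch (ConcurrentDeleteException ignored) {
                // concurrent delete operation; try again
            }
        }
        return attempts;
    }

    public static void main(String[] args) {
        // fails twice, succeeds on the third attempt
        System.out.println(deleteWithRetry(new FlakyStore(2)));
    }
}
```

Note that this loop only terminates because a concurrently deleted child can fail any single delete attempt but cannot fail every attempt forever; eventually the subtree is gone and the delete succeeds.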
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/highavailability/zookeeper/ZooKeeperHaServices.java b/flink-runtime/src/main/java/org/apache/flink/runtime/highavailability/zookeeper/ZooKeeperHaServices.java
index 1b2ff44..2596981 100644
--- a/flink-runtime/src/main/java/org/apache/flink/runtime/highavailability/zookeeper/ZooKeeperHaServices.java
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/highavailability/zookeeper/ZooKeeperHaServices.java
@@ -251,7 +251,21 @@
private void deleteOwnedZNode() throws Exception {
// delete the HA_CLUSTER_ID znode which is owned by this cluster
- client.delete().deletingChildrenIfNeeded().forPath("/");
+
+ // Curator 2.12, which we currently use, has a bug when deleting children while
+ // a concurrent delete operation is in progress. Therefore we need this retry
+ // logic. See https://issues.apache.org/jira/browse/CURATOR-430 for details.
+ // The retry logic can be removed once we upgrade to Curator version >= 4.0.1.
+ boolean zNodeDeleted = false;
+ while (!zNodeDeleted) {
+ try {
+ client.delete().deletingChildrenIfNeeded().forPath("/");
+ zNodeDeleted = true;
+ } catch (KeeperException.NoNodeException ignored) {
+ // concurrent delete operation. Try again.
+ LOG.debug("Retrying to delete the owned znode because of a concurrent delete operation.");
+ }
+ }
}
/**