ZOOKEEPER-1636: cleanup completion list of a failed multi request
This is a backport of https://github.com/apache/zookeeper/pull/717 to `branch-3.4`
For some reason it was NOT merged to `branch-3.4`, but only to `branch-3.5` and `master`:
- 4bd32bef2b75238e67b13bdfc7e6896d64c32434
- b1fd480b2c8e0cc1429345ee04510d3849001c5c
Here, it's almost a cherry-pick of the original patch, but slightly adopted to specific of `branch-3.4`.
Namely, `cleanup_failed_multi()` and `deserialize_multi()` do not have the first parameter `zhandle_t *zh`.
----
I wish to backport this also to use it as the base for my https://github.com/apache/zookeeper/pull/999
Author: Alexander A. Strelets <streletsaa@gmail.com>
Reviewers: Andor Molnár <andor@apache.org>, Michael Han <hanm@apache.org>
Closes #1030 from xoiss/ZOOKEEPER-1636
diff --git a/zookeeper-client/zookeeper-client-c/src/zookeeper.c b/zookeeper-client/zookeeper-client-c/src/zookeeper.c
index 1e780ff..9479719 100644
--- a/zookeeper-client/zookeeper-client-c/src/zookeeper.c
+++ b/zookeeper-client/zookeeper-client-c/src/zookeeper.c
@@ -2021,6 +2021,17 @@
}
}
+// cleanup completion list of a failed multi request
+static void cleanup_failed_multi(int xid, int rc, completion_list_t *cptr) {
+ completion_list_t *entry;
+ completion_head_t *clist = &cptr->c.clist;
+ while ((entry = dequeue_completion(clist)) != NULL) {
+ // Fake failed response for all sub-requests
+ deserialize_response(entry->c.type, xid, 1, rc, entry, NULL);
+ destroy_completion_entry(entry);
+ }
+}
+
static int deserialize_multi(int xid, completion_list_t *cptr, struct iarchive *ia)
{
int rc = 0;
@@ -2138,8 +2149,12 @@
case COMPLETION_MULTI:
LOG_DEBUG(("Calling COMPLETION_MULTI for xid=%#x failed=%d rc=%d",
cptr->xid, failed, rc));
- rc = deserialize_multi(xid, cptr, ia);
assert(cptr->c.void_result);
+ if (failed) {
+ cleanup_failed_multi(xid, rc, cptr);
+ } else {
+ rc = deserialize_multi(xid, cptr, ia);
+ }
cptr->c.void_result(rc, cptr->data);
break;
default:
diff --git a/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc b/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
index a54e047..e9172a3 100644
--- a/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
+++ b/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
@@ -178,6 +178,7 @@
CPPUNIT_TEST(testCheck);
CPPUNIT_TEST(testWatch);
CPPUNIT_TEST(testSequentialNodeCreateInAsyncMulti);
+ CPPUNIT_TEST(testBigAsyncMulti);
#endif
CPPUNIT_TEST_SUITE_END();
@@ -249,6 +250,16 @@
count++;
}
+ static void multi_completion_fn_rc(int rc, const void *data) {
+ count++;
+ *((int*) data) = rc;
+ }
+
+ static void create_completion_fn_rc(int rc, const char* value, const void *data) {
+ count++;
+ *((int*) data) = rc;
+ }
+
static void waitForMultiCompletion(int seconds) {
time_t expires = time(0) + seconds;
while(count == 0 && time(0) < expires) {
@@ -656,6 +667,63 @@
waitForMultiCompletion(5);
}
+ /**
+ * ZOOKEEPER-1636: If request is too large, the server will cut the
+ * connection without sending response packet. The client will try to
+ * process completion on multi request and eventually cause SIGSEGV
+ */
+ void testBigAsyncMulti() {
+ int rc;
+ int callback_rc = (int) ZOK;
+ watchctx_t ctx;
+ zhandle_t *zk = createClient(&ctx);
+
+ // The request should be more than 1MB which exceeds the default
+ // jute.maxbuffer and causes the server to drop client connection
+ const int iteration = 500;
+ const int type_count = 3;
+ const int nops = iteration * type_count;
+ char buff[1024];
+
+ zoo_op_result_t results[nops];
+ zoo_op_t ops[nops];
+ struct Stat* s[nops];
+ int index = 0;
+
+ // Test that we deliver error to 3 types of sub-request
+ for (int i = 0; i < iteration; ++i) {
+ zoo_set_op_init(&ops[index++], "/x", buff, sizeof(buff), -1, s[i]);
+ zoo_create_op_init(&ops[index++], "/x", buff, sizeof(buff),
+ &ZOO_OPEN_ACL_UNSAFE, ZOO_SEQUENCE, NULL, 0);
+ zoo_delete_op_init(&ops[index++], "/x", -1);
+ }
+
+ rc = zoo_amulti(zk, nops, ops, results, multi_completion_fn_rc,
+ &callback_rc);
+ CPPUNIT_ASSERT_EQUAL((int) ZOK, rc);
+
+ waitForMultiCompletion(10);
+ // With the bug, we will get SIGSEGV before reaching this point
+ CPPUNIT_ASSERT_EQUAL((int) ZCONNECTIONLOSS, callback_rc);
+
+ // Make sure that all sub-request completions get processed
+ for (int i = 0; i < nops; ++i) {
+ CPPUNIT_ASSERT_EQUAL((int) ZCONNECTIONLOSS, results[i].err);
+ }
+
+ // The handle should be able to recover itself.
+ ctx.waitForConnected(zk);
+
+ // Try to submit another async request to see if it get processed
+ // correctly
+ rc = zoo_acreate(zk, "/target", "", 0, &ZOO_OPEN_ACL_UNSAFE, 0,
+ create_completion_fn_rc, &callback_rc);
+ CPPUNIT_ASSERT_EQUAL((int) ZOK, rc);
+
+ waitForMultiCompletion(10);
+ CPPUNIT_ASSERT_EQUAL((int) ZOK, callback_rc);
+ }
+
/**
* ZOOKEEPER-1624: PendingChanges of create sequential node request didn't
* get rollbacked correctly when multi-op failed. This caused