ZOOKEEPER-1636: cleanup completion list of a failed multi request

This is a backport of https://github.com/apache/zookeeper/pull/717 to `branch-3.4`

For some reason it was NOT merged to `branch-3.4`, but only to `branch-3.5` and `master`:
- 4bd32bef2b75238e67b13bdfc7e6896d64c32434
- b1fd480b2c8e0cc1429345ee04510d3849001c5c

Here, it's almost a cherry-pick of the original patch, but slightly adopted to specific of `branch-3.4`.
Namely, `cleanup_failed_multi()` and `deserialize_multi()` do not have the first parameter `zhandle_t *zh`.

----

I wish to backport this also to use it as the base for my https://github.com/apache/zookeeper/pull/999

Author: Alexander A. Strelets <streletsaa@gmail.com>

Reviewers: Andor Molnár <andor@apache.org>, Michael Han <hanm@apache.org>

Closes #1030 from xoiss/ZOOKEEPER-1636
diff --git a/zookeeper-client/zookeeper-client-c/src/zookeeper.c b/zookeeper-client/zookeeper-client-c/src/zookeeper.c
index 1e780ff..9479719 100644
--- a/zookeeper-client/zookeeper-client-c/src/zookeeper.c
+++ b/zookeeper-client/zookeeper-client-c/src/zookeeper.c
@@ -2021,6 +2021,17 @@
     }
 }
 
+// cleanup completion list of a failed multi request
+static void cleanup_failed_multi(int xid, int rc, completion_list_t *cptr) {
+    completion_list_t *entry;
+    completion_head_t *clist = &cptr->c.clist;
+    while ((entry = dequeue_completion(clist)) != NULL) {
+        // Fake failed response for all sub-requests
+        deserialize_response(entry->c.type, xid, 1, rc, entry, NULL);
+        destroy_completion_entry(entry);
+    }
+}
+
 static int deserialize_multi(int xid, completion_list_t *cptr, struct iarchive *ia)
 {
     int rc = 0;
@@ -2138,8 +2149,12 @@
     case COMPLETION_MULTI:
         LOG_DEBUG(("Calling COMPLETION_MULTI for xid=%#x failed=%d rc=%d",
                     cptr->xid, failed, rc));
-        rc = deserialize_multi(xid, cptr, ia);
         assert(cptr->c.void_result);
+        if (failed) {
+            cleanup_failed_multi(xid, rc, cptr);
+        } else {
+            rc = deserialize_multi(xid, cptr, ia);
+        }
         cptr->c.void_result(rc, cptr->data);
         break;
     default:
diff --git a/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc b/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
index a54e047..e9172a3 100644
--- a/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
+++ b/zookeeper-client/zookeeper-client-c/tests/TestMulti.cc
@@ -178,6 +178,7 @@
     CPPUNIT_TEST(testCheck);
     CPPUNIT_TEST(testWatch);
     CPPUNIT_TEST(testSequentialNodeCreateInAsyncMulti);
+    CPPUNIT_TEST(testBigAsyncMulti);
 #endif
     CPPUNIT_TEST_SUITE_END();
 
@@ -249,6 +250,16 @@
         count++;
     }
 
+    static void multi_completion_fn_rc(int rc, const void *data) {
+        count++;
+        *((int*) data) = rc;
+    }
+
+     static void create_completion_fn_rc(int rc, const char* value, const void *data) {
+        count++;
+        *((int*) data) = rc;
+    }
+
     static void waitForMultiCompletion(int seconds) {
         time_t expires = time(0) + seconds;
         while(count == 0 && time(0) < expires) {
@@ -656,6 +667,63 @@
         waitForMultiCompletion(5);
      }
 
+    /**
+     * ZOOKEEPER-1636: If request is too large, the server will cut the
+     * connection without sending response packet. The client will try to
+     * process completion on multi request and eventually cause SIGSEGV
+     */
+    void testBigAsyncMulti() {
+        int rc;
+        int callback_rc = (int) ZOK;
+        watchctx_t ctx;
+        zhandle_t *zk = createClient(&ctx);
+
+        // The request should be more than 1MB which exceeds the default
+        // jute.maxbuffer and causes the server to drop client connection
+        const int iteration = 500;
+        const int type_count = 3;
+        const int nops = iteration * type_count;
+        char buff[1024];
+
+        zoo_op_result_t results[nops];
+        zoo_op_t ops[nops];
+        struct Stat* s[nops];
+        int index = 0;
+
+        // Test that we deliver error to 3 types of sub-request
+        for (int i = 0; i < iteration; ++i) {
+            zoo_set_op_init(&ops[index++], "/x", buff, sizeof(buff), -1, s[i]);
+            zoo_create_op_init(&ops[index++], "/x", buff, sizeof(buff),
+                               &ZOO_OPEN_ACL_UNSAFE, ZOO_SEQUENCE, NULL, 0);
+            zoo_delete_op_init(&ops[index++], "/x", -1);
+        }
+
+        rc = zoo_amulti(zk, nops, ops, results, multi_completion_fn_rc,
+                        &callback_rc);
+        CPPUNIT_ASSERT_EQUAL((int) ZOK, rc);
+
+        waitForMultiCompletion(10);
+        // With the bug, we will get SIGSEGV before reaching this point
+        CPPUNIT_ASSERT_EQUAL((int) ZCONNECTIONLOSS, callback_rc);
+
+        // Make sure that all sub-request completions get processed
+        for (int i = 0; i < nops; ++i) {
+            CPPUNIT_ASSERT_EQUAL((int) ZCONNECTIONLOSS, results[i].err);
+        }
+
+        // The handle should be able to recover itself.
+        ctx.waitForConnected(zk);
+
+        // Try to submit another async request to see if it get processed
+        // correctly
+        rc = zoo_acreate(zk, "/target", "", 0, &ZOO_OPEN_ACL_UNSAFE, 0,
+                         create_completion_fn_rc, &callback_rc);
+        CPPUNIT_ASSERT_EQUAL((int) ZOK, rc);
+
+        waitForMultiCompletion(10);
+        CPPUNIT_ASSERT_EQUAL((int) ZOK, callback_rc);
+    }
+
      /**
       * ZOOKEEPER-1624: PendingChanges of create sequential node request didn't
       * get rollbacked correctly when multi-op failed. This caused