[client-test] fix TestFailedDnsResolution

I noticed that ClientTest.TestFailedDnsResolution fails unexpectedly
with the following error when running on macOS:

  src/kudu/client/client-test.cc:3205: Failure
  Value of: s.IsIOError()
    Actual: false
  Expected: true
  unexpected status: OK

It turned out that the scenario didn't expect that
  (a) the results of DNS resolution are cached
  (b) a tablet server's address can be the same as master's

(a) turned true with changelist 48467ccf4, and (b) is true in case of
running test mini-cluster with other than UNIQUE_LOOPBACK bind mode:
e.g., on macOS it's run in LOOPBACK mode.

I updated the scenario to use a non-caching DNS resolver.  I also
increased the timeout for write operations because the scenario
was failing from time to time in case TSAN builds.  In addition, since
timeouts in GetTableLocations RPC are reported two-fold due to the
client's metacache activity, the test was failing rarely due to
receiving other non-expected error message.  I updated the list of
expected error messages to add the missing case.

With this patch, the scenario succeeds on macOS and runs more stable
for Linux TSAN builds.

Change-Id: I0493d992c43adb14ef02efae0a15dddc53301d7d
Reviewed-on: http://gerrit.cloudera.org:8080/17142
Tested-by: Kudu Jenkins
Reviewed-by: Bankim Bhavsar <bankim@cloudera.com>
Reviewed-by: Andrew Wong <awong@cloudera.com>
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index b844ec9..b9cde36 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -174,6 +174,7 @@
 DECLARE_string(location_mapping_cmd);
 DECLARE_string(superuser_acl);
 DECLARE_string(user_acl);
+DECLARE_uint32(dns_resolver_cache_capacity_mb);
 DECLARE_uint32(txn_keepalive_interval_ms);
 DECLARE_uint32(txn_staleness_tracker_interval_ms);
 DECLARE_uint32(txn_manager_status_table_num_replicas);
@@ -3188,42 +3189,56 @@
 }
 
 TEST_F(ClientTest, TestFailedDnsResolution) {
-  shared_ptr<KuduSession> session = client_->NewSession();
+  // Create a dedicated instance of a client which doesn't cache DNS resolution
+  // results. This is to avoid using the DNS resolver cache if the hostname/IP
+  // address of tablet server is the same as master's address. The latter is
+  // the case when using other than UNIQUE_LOOPBACK binding mode for the
+  // server components of the test mini-cluster.
+  FLAGS_dns_resolver_cache_capacity_mb = 0;
+  shared_ptr<KuduClient> c;
+  ASSERT_OK(KuduClientBuilder()
+      .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr().ToString())
+      .Build(&c));
+  shared_ptr<KuduTable> t;
+  ASSERT_OK(c->OpenTable(kTableName, &t));
+
+  shared_ptr<KuduSession> session = c->NewSession();
   ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH));
-  const string kMasterError = "timed out after deadline expired: GetTableLocations RPC";
 
-  // First time disable dns resolution.
-  // Set the timeout to be short since we know it can't succeed, but not to the point where we
-  // can timeout before getting the dns error.
-  {
-    for (int i = 0;;i++) {
-      google::FlagSaver saver;
-      FLAGS_fail_dns_resolution = true;
-      session->SetTimeoutMillis(500);
-      ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row"));
-      Status s = session->Flush();
-      ASSERT_TRUE(s.IsIOError()) << "unexpected status: " << s.ToString();
-      unique_ptr<KuduError> error = GetSingleErrorFromSession(session.get());
-      ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString();
+  // First, make DNS resolution time out.
+  // Set the timeout to be short since we know it can't succeed, but not to the
+  // point where we can timeout before getting the DNS error.
+  FLAGS_fail_dns_resolution = true;
+  session->SetTimeoutMillis(1000);
 
-      // Due to KUDU-1466 there is a narrow window in which the error reported might be that the
-      // GetTableLocations RPC to the master timed out instead of the expected dns resolution error.
-      // In that case just loop again.
-
-      if (error->status().ToString().find(kMasterError) != std::string::npos) {
-        ASSERT_LE(i, 10) << "Didn't get a dns resolution error after 10 tries.";
-        continue;
-      }
-
-      ASSERT_STR_CONTAINS(error->status().ToString(),
-                          "Network error: Failed to resolve address for TS");
-      break;
+  // Due to KUDU-1466, there is a narrow window in which the error reported
+  // might be that the GetTableLocations RPC to the master timed out instead of
+  // the expected DNS resolution error while trying to send Write RPC to
+  // tablet server.
+  for (auto i = 0;; ++i) {
+    constexpr const char* const kMasterErrors[] = {
+      "timed out after deadline expired: GetTableLocations RPC",
+      "LookupRpc timed out after deadline expired",
+    };
+    ASSERT_OK(ApplyInsertToSession(session.get(), t, 1, 1, "row"));
+    auto s = session->Flush();
+    ASSERT_TRUE(s.IsIOError()) << s.ToString();
+    unique_ptr<KuduError> error = GetSingleErrorFromSession(session.get());
+    ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString();
+    const auto row_status_str = error->status().ToString();
+    if (row_status_str.find(kMasterErrors[0]) != std::string::npos ||
+        row_status_str.find(kMasterErrors[1]) != std::string::npos) {
+      ASSERT_LE(i, 10) << "could not get DNS resolution error after 10 tries";
+      continue;
     }
+    ASSERT_STR_CONTAINS(row_status_str,
+                        "Network error: Failed to resolve address for TS");
+    break;
   }
 
-  // Now re-enable dns resolution, the write should succeed.
+  // Now, re-enable the DNS resolution: the write should succeed.
   FLAGS_fail_dns_resolution = false;
-  ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row"));
+  ASSERT_OK(ApplyInsertToSession(session.get(), t, 1, 1, "row"));
   ASSERT_OK(session->Flush());
 }