[client-test] fix TestFailedDnsResolution
I noticed that ClientTest.TestFailedDnsResolution fails unexpectedly
with the following error when running on macOS:
src/kudu/client/client-test.cc:3205: Failure
Value of: s.IsIOError()
Actual: false
Expected: true
unexpected status: OK
It turned out that the scenario didn't account for the following:
(a) the results of DNS resolution are cached, and
(b) a tablet server's address can be the same as the master's.
(a) became true with changelist 48467ccf4, and (b) is true when
running the test mini-cluster in a bind mode other than UNIQUE_LOOPBACK:
e.g., on macOS it runs in LOOPBACK mode.
I updated the scenario to use a non-caching DNS resolver. I also
increased the timeout for write operations because the scenario
was failing from time to time in TSAN builds. In addition, since
a timeout of the GetTableLocations RPC can be reported with either of
two different error messages due to the client's metacache activity,
the test was occasionally failing upon receiving an error message it
did not expect. I updated the list of expected error messages to add
the missing case.
With this patch, the scenario succeeds on macOS and runs more stably
in Linux TSAN builds.
Change-Id: I0493d992c43adb14ef02efae0a15dddc53301d7d
Reviewed-on: http://gerrit.cloudera.org:8080/17142
Tested-by: Kudu Jenkins
Reviewed-by: Bankim Bhavsar <bankim@cloudera.com>
Reviewed-by: Andrew Wong <awong@cloudera.com>
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index b844ec9..b9cde36 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -174,6 +174,7 @@
DECLARE_string(location_mapping_cmd);
DECLARE_string(superuser_acl);
DECLARE_string(user_acl);
+DECLARE_uint32(dns_resolver_cache_capacity_mb);
DECLARE_uint32(txn_keepalive_interval_ms);
DECLARE_uint32(txn_staleness_tracker_interval_ms);
DECLARE_uint32(txn_manager_status_table_num_replicas);
@@ -3188,42 +3189,56 @@
}
TEST_F(ClientTest, TestFailedDnsResolution) {
- shared_ptr<KuduSession> session = client_->NewSession();
+ // Create a dedicated instance of a client which doesn't cache DNS resolution
+ // results. This is to avoid using the DNS resolver cache if the hostname/IP
+ // address of tablet server is the same as master's address. The latter is
+ // the case when using other than UNIQUE_LOOPBACK binding mode for the
+ // server components of the test mini-cluster.
+ FLAGS_dns_resolver_cache_capacity_mb = 0;
+ shared_ptr<KuduClient> c;
+ ASSERT_OK(KuduClientBuilder()
+ .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr().ToString())
+ .Build(&c));
+ shared_ptr<KuduTable> t;
+ ASSERT_OK(c->OpenTable(kTableName, &t));
+
+ shared_ptr<KuduSession> session = c->NewSession();
ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH));
- const string kMasterError = "timed out after deadline expired: GetTableLocations RPC";
- // First time disable dns resolution.
- // Set the timeout to be short since we know it can't succeed, but not to the point where we
- // can timeout before getting the dns error.
- {
- for (int i = 0;;i++) {
- google::FlagSaver saver;
- FLAGS_fail_dns_resolution = true;
- session->SetTimeoutMillis(500);
- ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row"));
- Status s = session->Flush();
- ASSERT_TRUE(s.IsIOError()) << "unexpected status: " << s.ToString();
- unique_ptr<KuduError> error = GetSingleErrorFromSession(session.get());
- ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString();
+ // First, make DNS resolution time out.
+ // Set the timeout to be short since we know it can't succeed, but not to the
+ // point where we can timeout before getting the DNS error.
+ FLAGS_fail_dns_resolution = true;
+ session->SetTimeoutMillis(1000);
- // Due to KUDU-1466 there is a narrow window in which the error reported might be that the
- // GetTableLocations RPC to the master timed out instead of the expected dns resolution error.
- // In that case just loop again.
-
- if (error->status().ToString().find(kMasterError) != std::string::npos) {
- ASSERT_LE(i, 10) << "Didn't get a dns resolution error after 10 tries.";
- continue;
- }
-
- ASSERT_STR_CONTAINS(error->status().ToString(),
- "Network error: Failed to resolve address for TS");
- break;
+ // Due to KUDU-1466, there is a narrow window in which the error reported
+ // might be that the GetTableLocations RPC to the master timed out instead of
+ // the expected DNS resolution error while trying to send Write RPC to
+ // tablet server.
+ for (auto i = 0;; ++i) {
+ constexpr const char* const kMasterErrors[] = {
+ "timed out after deadline expired: GetTableLocations RPC",
+ "LookupRpc timed out after deadline expired",
+ };
+ ASSERT_OK(ApplyInsertToSession(session.get(), t, 1, 1, "row"));
+ auto s = session->Flush();
+ ASSERT_TRUE(s.IsIOError()) << s.ToString();
+ unique_ptr<KuduError> error = GetSingleErrorFromSession(session.get());
+ ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString();
+ const auto row_status_str = error->status().ToString();
+ if (row_status_str.find(kMasterErrors[0]) != std::string::npos ||
+ row_status_str.find(kMasterErrors[1]) != std::string::npos) {
+ ASSERT_LE(i, 10) << "could not get DNS resolution error after 10 tries";
+ continue;
}
+ ASSERT_STR_CONTAINS(row_status_str,
+ "Network error: Failed to resolve address for TS");
+ break;
}
- // Now re-enable dns resolution, the write should succeed.
+ // Now, re-enable the DNS resolution: the write should succeed.
FLAGS_fail_dns_resolution = false;
- ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row"));
+ ASSERT_OK(ApplyInsertToSession(session.get(), t, 1, 1, "row"));
ASSERT_OK(session->Flush());
}