// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <dlfcn.h>
#ifdef __linux__
#include <link.h>
#endif

#include "kudu/util/debug-util.h"

#include <unistd.h>

#include <algorithm>
#include <csignal>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <memory>
#include <ostream>
#include <string>
#include <vector>

#include <gflags/gflags_declare.h>
#include <glog/logging.h>
#include <glog/stl_logging.h>
#include <gtest/gtest.h>

#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/walltime.h"
#include "kudu/util/array_view.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/kernel_stack_watchdog.h"
#include "kudu/util/monotime.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
#include "kudu/util/thread.h"

using std::string;
using std::vector;
using strings::Substitute;

DECLARE_int32(test_timeout_after);
DECLARE_int32(stress_cpu_threads);

namespace kudu {

class DebugUtilTest : public KuduTest {
};

TEST_F(DebugUtilTest, TestStackTrace) {
  StackTrace t;
  t.Collect(0);
  string trace = t.Symbolize();
  ASSERT_STR_CONTAINS(trace, "kudu::DebugUtilTest_TestStackTrace_Test::TestBody");
}

// DumpThreadStack is only supported on Linux, since the implementation relies
// on the tgkill syscall which is not portable.
#if defined(__linux__)

namespace {
void SleeperThread(CountDownLatch* l) {
  // We use an infinite loop around WaitFor() instead of a normal Wait()
  // so that this test passes in TSAN. Without this, we run into this TSAN
  // bug which prevents the sleeping thread from handling signals:
  // https://code.google.com/p/thread-sanitizer/issues/detail?id=91
  while (!l->WaitFor(MonoDelta::FromMilliseconds(10))) {
  }
}

void fake_signal_handler(int signum) {}

bool IsSignalHandlerRegistered(int signum) {
  struct sigaction cur_action;
  CHECK_EQ(0, sigaction(signum, nullptr, &cur_action));
  return cur_action.sa_handler != SIG_DFL;
}
} // anonymous namespace
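
// For illustration only: the tgkill-based signal delivery mentioned above
// boils down to something like the following sketch. The helper name is
// hypothetical and this is not the actual implementation (that lives in
// debug-util.cc); it is included here purely as documentation.
//
//   #include <sys/syscall.h>  // for SYS_tgkill
//
//   // Deliver 'signum' to one specific thread of this process. Unlike
//   // kill(2), which addresses a whole process, tgkill(2) addresses a single
//   // thread; the thread-group (pid) argument guards against tid reuse.
//   int DeliverStackTraceSignal(pid_t tid, int signum) {
//     return syscall(SYS_tgkill, getpid(), tid, signum);
//   }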

TEST_F(DebugUtilTest, TestStackTraceInvalidTid) {
  string s = DumpThreadStack(1);
  ASSERT_STR_CONTAINS(s, "unable to deliver signal");
}

TEST_F(DebugUtilTest, TestStackTraceSelf) {
  string s = DumpThreadStack(Thread::CurrentThreadId());
  ASSERT_STR_CONTAINS(s, "kudu::DebugUtilTest_TestStackTraceSelf_Test::TestBody()");
}

TEST_F(DebugUtilTest, TestStackTraceMainThread) {
  string s = DumpThreadStack(getpid());
  ASSERT_STR_CONTAINS(s, "kudu::DebugUtilTest_TestStackTraceMainThread_Test::TestBody()");
}

TEST_F(DebugUtilTest, TestSignalStackTrace) {
  CountDownLatch l(1);
  scoped_refptr<Thread> t;
  ASSERT_OK(Thread::Create("test", "test thread", [&l]() { SleeperThread(&l); }, &t));
  SCOPED_CLEANUP({
    // Allow the thread to finish.
    l.CountDown();
    t->Join();
  });

  // We have to loop a little bit because it takes a little while for the thread
  // to start up and actually call our function.
  //
  // Note: due to RELEASE build inlining, we need to make sure to pick a stack
  // frame that isn't optimized away. Different compilers behave differently, so
  // we'll use a regular expression to get maximal coverage.
  static constexpr const char* kTestThreadOneStackFrame = "SleeperThread";
  static constexpr const char* kTestThreadAnotherStackFrame =
      "kudu::ConditionVariable::WaitUntil()";
  static const string kStackFrameRegExp = Substitute(
      "$0|$1", kTestThreadOneStackFrame, kTestThreadAnotherStackFrame);
  ASSERT_EVENTUALLY([&]() {
    ASSERT_STR_MATCHES(DumpThreadStack(t->tid()), kStackFrameRegExp);
  });

  // Test that we can change the signal and that stack traces still work
  // with the new signal.
  ASSERT_FALSE(IsSignalHandlerRegistered(SIGHUP));
  ASSERT_OK(SetStackTraceSignal(SIGHUP));
  // Should now be registered.
  ASSERT_TRUE(IsSignalHandlerRegistered(SIGHUP));
  // SIGUSR2 should be relinquished.
  ASSERT_FALSE(IsSignalHandlerRegistered(SIGUSR2));
  // Stack traces should work using the new handler.
  ASSERT_STR_MATCHES(DumpThreadStack(t->tid()), kStackFrameRegExp);

  // Switch back to SIGUSR2 and ensure it changes back.
  ASSERT_OK(SetStackTraceSignal(SIGUSR2));
  ASSERT_TRUE(IsSignalHandlerRegistered(SIGUSR2));
  ASSERT_FALSE(IsSignalHandlerRegistered(SIGHUP));
  // Stack traces should work using the restored handler.
  ASSERT_STR_MATCHES(DumpThreadStack(t->tid()), kStackFrameRegExp);

  // Register our own signal handler on SIGHUP, and ensure that
  // we get a bad Status if we try to use it.
  signal(SIGHUP, &fake_signal_handler);
  ASSERT_STR_CONTAINS(SetStackTraceSignal(SIGHUP).ToString(),
                      "unable to install signal handler");
  signal(SIGHUP, SIG_DFL);

  // Stack traces should now be disabled.
  ASSERT_STR_CONTAINS(DumpThreadStack(t->tid()), "unable to take thread stack");
  // Re-enable so that other tests pass.
  ASSERT_OK(SetStackTraceSignal(SIGUSR2));
}
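
// For illustration only: the register/relinquish behavior verified above can
// be pictured as a sigaction()-based swap, along the lines of this
// hypothetical sketch. None of these names exist in debug-util; the real
// SetStackTraceSignal implementation lives in debug-util.cc.
//
//   Status SetStackTraceSignalSketch(int new_signum, int* cur_signum) {
//     // Relinquish the currently-used signal first. If installation below
//     // fails, stack traces stay disabled until a later call succeeds
//     // (the test above relies on this).
//     signal(*cur_signum, SIG_DFL);
//     *cur_signum = -1;
//     // Refuse to take over a signal that some other code has claimed.
//     struct sigaction cur;
//     CHECK_EQ(0, sigaction(new_signum, nullptr, &cur));
//     if (cur.sa_handler != SIG_DFL) {
//       return Status::InvalidArgument("unable to install signal handler");
//     }
//     struct sigaction act = {};
//     act.sa_handler = &HandleStackTraceSignalSketch;  // hypothetical handler
//     CHECK_EQ(0, sigaction(new_signum, &act, nullptr));
//     *cur_signum = new_signum;
//     return Status::OK();
//   }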

// Test which dumps all known threads within this process.
// We don't validate the results in any way -- but this verifies that we can
// dump library threads such as the libc timer_thread and properly time out.
TEST_F(DebugUtilTest, TestSnapshot) {
  // HACK: prior tests in this suite start threads. Even though they Join on the
  // threads before the test case finishes, there is actually a very short
  // period of time after Join() returns but before the actual thread has exited
  // and removed itself from /proc/self/task/. That means that 'ListThreads' below
  // can sometimes show these threads from prior test cases, and then the assertions
  // in this test case would fail.
  //
  // So, we have to wait here for the number of running threads to level off to the
  // expected value.

  // Ensure the Kernel Stack Watchdog is running.
  KernelStackWatchdog::GetInstance();
  int initial_thread_count =
      1                                        // main thread
      + 1                                      // KernelStackWatchdog
      + (FLAGS_test_timeout_after > 0 ? 1 : 0) // test timeout thread, if running
      + FLAGS_stress_cpu_threads;
#ifdef THREAD_SANITIZER
  initial_thread_count++; // TSAN signal thread
#endif

  // The test and runtime environment run various utility threads (for example,
  // the kernel stack watchdog, the TSAN runtime thread, and the test timeout
  // thread). Wait for the thread count to settle at the expected value before
  // starting any additional threads for this test.
  ASSERT_EVENTUALLY([&]{
    vector<pid_t> threads;
    ASSERT_OK(ListThreads(&threads));
    ASSERT_EQ(initial_thread_count, threads.size()) << threads;
  });

  // Start a bunch of sleeping threads.
  const int kNumThreads = 30;
  CountDownLatch l(1);
  vector<scoped_refptr<Thread>> threads(kNumThreads);
  for (int i = 0; i < kNumThreads; i++) {
    ASSERT_OK(Thread::Create("test", "test thread",
                             [&l]() { SleeperThread(&l); }, &threads[i]));
  }
  SCOPED_CLEANUP({
    // Allow the threads to finish.
    l.CountDown();
    for (auto& t : threads) {
      t->Join();
    }
  });

  StackTraceSnapshot snap;
  ASSERT_OK(snap.SnapshotAllStacks());
  int count = 0;
  int groups = 0;
  snap.VisitGroups([&](ArrayView<StackTraceSnapshot::ThreadInfo> group) {
    groups++;
    for (const auto& info : group) {
      count++;
      LOG(INFO) << info.tid << " " << info.thread_name
                << " (" << info.status.ToString() << ")";
    }
    LOG(INFO) << group[0].stack.ToHexString();
  });

  int tsan_threads = 0;
#ifdef THREAD_SANITIZER
  // TSAN starts an extra thread of its own.
  tsan_threads++;
#endif
  ASSERT_EQ(kNumThreads + initial_thread_count, count);
  // The threads might not have exactly identical stacks, but
  // we should have far fewer groups than the total number
  // of threads.
  ASSERT_LE(groups, kNumThreads / 2);
  ASSERT_EQ(tsan_threads, snap.num_failed());
}
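
// For illustration only: the ListThreads() call used above can be pictured as
// an enumeration of the /proc/self/task directory (which the HACK comment in
// the test refers to), roughly like this hypothetical sketch. The real
// implementation differs in details and error handling.
//
//   #include <dirent.h>
//   #include <cstdlib>
//
//   Status ListThreadsSketch(std::vector<pid_t>* tids) {
//     DIR* d = opendir("/proc/self/task");  // one entry per live thread
//     if (d == nullptr) {
//       return Status::IOError("could not open /proc/self/task");
//     }
//     while (struct dirent* ent = readdir(d)) {
//       if (ent->d_name[0] != '.') {        // skip "." and ".."
//         tids->push_back(atoi(ent->d_name));
//       }
//     }
//     closedir(d);
//     return Status::OK();
//   }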

TEST_F(DebugUtilTest, Benchmark) {
  CountDownLatch l(1);
  scoped_refptr<Thread> t;
  ASSERT_OK(Thread::Create("test", "test thread", [&l]() { SleeperThread(&l); }, &t));
  SCOPED_CLEANUP({
    // Allow the thread to finish.
    l.CountDown();
    t->Join();
  });

  for (bool symbolize : {false, true}) {
    MonoTime end_time = MonoTime::Now() + MonoDelta::FromSeconds(1);
    int count = 0;
    volatile int prevent_optimize = 0;
    while (MonoTime::Now() < end_time) {
      StackTrace trace;
      GetThreadStack(t->tid(), &trace);
      if (symbolize) {
        prevent_optimize += trace.Symbolize().size();
      }
      count++;
    }
    LOG(INFO) << "Throughput: " << count << " dumps/second (symbolize=" << symbolize << ")";
  }
}

int TakeStackTrace(struct dl_phdr_info* /*info*/, size_t /*size*/, void* data) {
  StackTrace* s = reinterpret_cast<StackTrace*>(data);
  s->Collect(0);
  return 0;
}

// Test that, if we try to collect a stack trace while inside a libdl function
// call, we properly return the bogus stack indicating the issue.
//
// This doesn't work in ThreadSanitizer, since we don't intercept dl_iterate_phdr
// in those builds (see the note in unwind_safeness.cc).
#ifndef THREAD_SANITIZER
TEST_F(DebugUtilTest, TestUnwindWhileUnsafe) {
  StackTrace s;
  dl_iterate_phdr(&TakeStackTrace, &s);
  ASSERT_STR_CONTAINS(s.Symbolize(), "CouldNotCollectStackTraceBecauseInsideLibDl");
}
#endif
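
// For illustration only: the interception that makes the test above possible
// can be pictured as a thread-local "inside libdl" depth counter maintained by
// interposed wrappers, roughly like this hypothetical sketch. The actual
// mechanism is in unwind_safeness.cc; the names below are invented.
//
//   static __thread int g_libdl_depth = 0;
//
//   // Interposed in front of the real dl_iterate_phdr. While the real call
//   // is in progress, libdl's internal lock may be held, so unwinding from a
//   // signal handler could self-deadlock (see the example stack below).
//   extern "C" int dl_iterate_phdr(
//       int (*cb)(struct dl_phdr_info*, size_t, void*), void* data) {
//     g_libdl_depth++;
//     int ret = CallRealDlIteratePhdr(cb, data);  // hypothetical trampoline
//     g_libdl_depth--;
//     return ret;
//   }
//
//   // StackTrace::Collect() can then check the counter and record a sentinel
//   // frame (symbolized as CouldNotCollectStackTraceBecauseInsideLibDl)
//   // instead of unwinding when the counter is non-zero.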

int DoNothingDlCallback(struct dl_phdr_info* /*info*/, size_t /*size*/, void* /*data*/) {
  return 0;
}

// Parameterized test which performs various operations during which it might
// be dangerous to collect a stack trace, while the main thread repeatedly
// takes stack traces of the worker thread. These operations may all be
// executed on normal application threads, so we need to ensure that, if we
// happen to gather the stack from a thread in the middle of such a function,
// we don't crash or deadlock.
//
// Example self-deadlock if we didn't have the appropriate workarounds in place:
// #0 __lll_lock_wait ()
// #1 0x00007ffff6f16e42 in __GI___pthread_mutex_lock
// #2 0x00007ffff6c8601f in __GI___dl_iterate_phdr
// #3 0x0000000000695b02 in dl_iterate_phdr
// #4 0x000000000056d013 in _ULx86_64_dwarf_find_proc_info
// #5 0x000000000056d1d5 in fetch_proc_info (c=c@ent
// #6 0x000000000056e2e7 in _ULx86_64_dwarf_find_save_
// #7 0x000000000056c1b9 in _ULx86_64_dwarf_step (c=c@
// #8 0x000000000056be21 in _ULx86_64_step
// #9 0x0000000000566b1d in google::GetStackTrace
// #10 0x00000000004dc4d1 in kudu::StackTrace::Collect
// #11 kudu::(anonymous namespace)::HandleStackTraceSignal
// #12 <signal handler called>
// #13 0x00007ffff6f16e31 in __GI___pthread_mutex_lock
// #14 0x00007ffff6c8601f in __GI___dl_iterate_phdr
// #15 0x0000000000695b02 in dl_iterate_phdr
enum DangerousOp {
  DLOPEN_AND_CLOSE,
  DL_ITERATE_PHDR,
  GET_STACK_TRACE,
  MALLOC_AND_FREE
};

class RaceTest : public DebugUtilTest, public ::testing::WithParamInterface<DangerousOp> {};
INSTANTIATE_TEST_CASE_P(DifferentRaces, RaceTest,
                        ::testing::Values(DLOPEN_AND_CLOSE,
                                          DL_ITERATE_PHDR,
                                          GET_STACK_TRACE,
                                          MALLOC_AND_FREE));

void DangerousOperationThread(DangerousOp op, CountDownLatch* l) {
  while (l->count()) {
    switch (op) {
      case DLOPEN_AND_CLOSE: {
        // Check for races against dlopen/dlclose.
        void* v = dlopen("libc.so.6", RTLD_LAZY);
        CHECK(v);
        dlclose(v);
        break;
      }
      case DL_ITERATE_PHDR: {
        // Check for races against dl_iterate_phdr.
        dl_iterate_phdr(&DoNothingDlCallback, nullptr);
        break;
      }
      case GET_STACK_TRACE: {
        // Check for reentrancy issues.
        GetStackTrace();
        break;
      }
      case MALLOC_AND_FREE: {
        // Check large allocations in tcmalloc.
        volatile char* x = new char[1024 * 1024 * 2];
        delete[] x;
        break;
      }
      default:
        LOG(FATAL) << "unknown op";
    }
  }
}

// Starts a thread performing dangerous operations and then gathers
// its stack trace in a loop trying to trigger races.
TEST_P(RaceTest, TestStackTraceRaces) {
  DangerousOp op = GetParam();
  CountDownLatch l(1);
  scoped_refptr<Thread> t;
  ASSERT_OK(Thread::Create("test", "test thread",
                           [op, &l]() { DangerousOperationThread(op, &l); }, &t));
  SCOPED_CLEANUP({
    // Allow the thread to finish.
    l.CountDown();
    // Crash if we can't join the thread after a reasonable amount of time.
    // That probably indicates a deadlock.
    CHECK_OK(ThreadJoiner(t.get()).give_up_after_ms(10000).Join());
  });
  MonoTime end_time = MonoTime::Now() + MonoDelta::FromSeconds(1);
  while (MonoTime::Now() < end_time) {
    StackTrace trace;
    GetThreadStack(t->tid(), &trace);
  }
}

// Alternately blocks and unblocks SIGUSR2 (the default stack trace signal),
// so that a concurrent DumpThreadStack() sometimes cannot deliver its signal.
void BlockSignalsThread() {
  sigset_t set;
  sigemptyset(&set);
  sigaddset(&set, SIGUSR2);
  for (int i = 0; i < 3; i++) {
    CHECK_ERR(pthread_sigmask((i % 2) ? SIG_UNBLOCK : SIG_BLOCK, &set, nullptr));
    SleepFor(MonoDelta::FromSeconds(1));
  }
}

TEST_F(DebugUtilTest, TestThreadBlockingSignals) {
  scoped_refptr<Thread> t;
  ASSERT_OK(Thread::Create("test", "test thread", &BlockSignalsThread, &t));
  SCOPED_CLEANUP({ t->Join(); });
  string ret;
  while (ret.find("unable to deliver signal") == string::npos) {
    ret = DumpThreadStack(t->tid());
    LOG(INFO) << ret;
  }
}

// Test stack traces which time out despite the destination thread not blocking
// signals.
TEST_F(DebugUtilTest, TestTimeouts) {
  const int kRunTimeSecs = AllowSlowTests() ? 5 : 1;

  CountDownLatch l(1);
  scoped_refptr<Thread> t;
  ASSERT_OK(Thread::Create("test", "test thread", [&l]() { SleeperThread(&l); }, &t));
  SCOPED_CLEANUP({
    // Allow the thread to finish.
    l.CountDown();
    t->Join();
  });

  // First, time a few stack traces to determine how long a non-timed-out stack
  // trace takes.
  vector<MicrosecondsInt64> durations;
  for (int i = 0; i < 20; i++) {
    StackTrace stack;
    auto st = GetMonoTimeMicros();
    ASSERT_OK(GetThreadStack(t->tid(), &stack));
    auto dur = GetMonoTimeMicros() - st;
    durations.push_back(dur);
  }
  // Use the median to throw out outliers.
  std::sort(durations.begin(), durations.end());
  auto median_duration = durations[durations.size() / 2];
  LOG(INFO) << "Median duration: " << median_duration << "us";

  // Now take a bunch of stack traces with timeouts clustered around
  // the expected time. When we time out, we adjust the timeout to be
  // higher, so the next attempt is less likely to time out. Conversely,
  // when we succeed, we adjust the timeout to be shorter, so the next
  // attempt is more likely to time out. This has the effect of triggering
  // all the interesting cases: (a) success, (b) timeout, (c) timeout
  // exactly as the signal handler finishes.
  int num_timeouts = 0;
  int num_successes = 0;
  auto end_time = MonoTime::Now() + MonoDelta::FromSeconds(kRunTimeSecs);
  int64_t timeout_us = median_duration;
  while (MonoTime::Now() < end_time) {
    StackTraceCollector stc;
    // Allocate the StackTrace on the heap, so that any use-after-free
    // is caught more easily by ASAN.
    std::unique_ptr<StackTrace> stack(new StackTrace());
    ASSERT_OK(stc.TriggerAsync(t->tid(), stack.get()));
    Status s = stc.AwaitCollection(MonoTime::Now() + MonoDelta::FromMicroseconds(timeout_us));
    if (s.ok()) {
      num_successes++;
      timeout_us--;
    } else if (s.IsTimedOut()) {
      num_timeouts++;
      timeout_us++;
    } else {
      FAIL() << "Unexpected status: " << s.ToString();
    }
  }
  LOG(INFO) << "Timed out " << num_timeouts << " times";
  LOG(INFO) << "Succeeded " << num_successes << " times";
}

#endif

} // namespace kudu