blob: f8067bdf43f1d97bfb978ba53623e41ab9b83bc3 [file] [log] [blame]
// Copyright 2014 Cloudera, Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
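// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.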
// See the License for the specific language governing permissions and
// limitations under the License.
// Systemtap script for tracing the filesystem-related system calls issued
// by a particular process, optionally scoped to within a particular path
// subtree (i.e. a path and all paths below it).
// Effectively like 'strace -fT', except that fds are automatically resolved
// to file names.
// Invoke as follows:
// - stap trace_io.stp -x <pid> [path]
// To start tracing an existing process.
// - stap trace_io.stp -c <command> [path]
// To run a command and trace it.
// See README.systemtap for prerequisite information.
// Maps (pid, fd) ==> filename.
global fds
// Maps (pid, filename) ==> reference_count.
global references
// Path to filter all filenames against.
// Set from the first script argument in the begin probe (empty string when
// no argument is given).
// Sadly, SystemTap doesn't provide a way to test if a string starts with a
// prefix that isn't a string literal. Regex matching requires that the
// regex be a string literal (not a variable), and the "string library" only
// provides a substring matching method. We could write our own in embedded
// C, but that means the script must be run in "unsafe" mode. Given those
// options we use substring matching, opening us up to false positives.
global filter_path
// Set to 1 by the begin probe once a target has been confirmed, and reset to
// 0 by the end probe after the closing message has been printed.
global running
// Add a reference to [fd, filename] for the current process.
//
// Records the (pid, fd) ==> filename mapping in 'fds' and bumps the
// (pid, filename) count in 'references', creating the entry with a count
// of 1 when it does not exist yet.
function start_tracking (fd, filename) {
  fds[pid(), fd] = filename
  if ([pid(), filename] in references) {
    references[pid(), filename]++
  } else {
    references[pid(), filename] = 1
  }
}
// Remove a reference from [fd, filename] for the current process. Returns
// 1 if the pair has no more references, 0 otherwise.
//
// The (pid, fd) entry in 'fds' is always removed; the (pid, filename) entry
// in 'references' is only removed once its count drops to zero.
function stop_tracking (fd, filename) {
  delete fds[pid(), fd]
  references[pid(), filename]--
  if (references[pid(), filename] == 0) {
    // Last reference dropped: remove the entry entirely.
    delete references[pid(), filename]
    return 1
  }
  return 0
}
// Script start-up: capture the optional path filter and verify that a target
// process was supplied via -x or -c.
probe begin {
  // argv[1] is the optional [path] argument; with no argument every file
  // name is eligible.
  filter_path = argc ? argv[1] : ""
  running = 0
  if (target()) {
    printf("[%d] %s: beginning tracing of PID %d with path \"%s\"\n",
           pid(), ctime(gettimeofday_s()), target(), filter_path)
    running = 1
  } else {
    printf("Need to supply a target on the stap command line\n")
    printf("Pass either -x <pid> (to trace an existing process)")
    printf(" or -c <command> (to trace a new one)\n")
    // Without a target no probe below can ever match, so stop here instead
    // of leaving an idle session running.
    exit()
  }
}
// Script shutdown: print the closing message, but only if tracing actually
// started (i.e. the begin probe found a target).
probe end {
  if (running) {
    printf("[%d] %s: ending tracing of PID %d\n",
           pid(), ctime(gettimeofday_s()), target())
    running = 0
  }
}
// Trace open(2) calls made by the target process on paths matching
// filter_path, and start tracking the returned fd on success.
probe syscall.open.return {
  // The target() corresponds to the pid provided to stap(1) via -x, or the
  // pid of the command run via -c.
  //
  // See 'man stap' for more details.
  filename = user_string_quoted($filename)
  if (pid() == target() && isinstr(filename, filter_path)) {
    // 64 is O_CREAT (0100 octal) on x86 and most Linux architectures; the
    // mode argument is only meaningful when the file may be created.
    if ($flags & 64) {
      argstr = sprintf("%s, %s, %#o", filename, _sys_open_flag_str($flags),
                       $mode)
    } else {
      argstr = sprintf("%s, %s", filename, _sys_open_flag_str($flags))
    }

    // Only successful opens are reported and tracked.
    if ($return >= 0) {
      time = gettimeofday_us() - @entry(gettimeofday_us())
      printf("[%d] %s: open %s (%d us)\n",
             tid(), ctime(gettimeofday_s()), argstr, time)
      start_tracking($return, filename)
    }
  }
}
// Trace close(2) on tracked fds. The fd is always untracked, but a line is
// only printed when the last reference to the file name goes away.
probe syscall.close.return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    if (stop_tracking($fd, filename)) {
      time = gettimeofday_us() - @entry(gettimeofday_us())
      printf("[%d] %s: close %s (%d us)\n",
             tid(), ctime(gettimeofday_s()), filename, time)
    }
  }
}
// Follow dup(2): when a tracked fd is duplicated successfully, the new fd is
// tracked under the same file name.
probe syscall.dup.return {
  oldfd = $fildes
  newfd = $return
  if (newfd >= 0) {
    if ([pid(), oldfd] in fds) {
      // oldfd is duplicated to newfd, update the tracking.
      start_tracking(newfd, fds[pid(), oldfd])
    }
  }
}
// Follow dup2(2): untrack whatever newfd referred to before, then track it
// under oldfd's file name.
//
// For some reason probes on syscall.dup2 never trigger, but probes on the
// raw kernel function do.
probe kernel.function("sys_dup2").return {
  oldfd = $oldfd
  newfd = $return
  // Nothing changes on failure or when both fds are the same.
  if (newfd >= 0 && newfd != oldfd) {
    // If newfd is already open, dup2() closes it.
    if ([pid(), newfd] in fds) {
      stop_tracking(newfd, fds[pid(), newfd])
    }

    // oldfd is duplicated to newfd.
    if ([pid(), oldfd] in fds) {
      start_tracking(newfd, fds[pid(), oldfd])
    }
  }
}
// Render the sync_file_range(2) flags bitmask 'f' as a '|'-separated list of
// flag names, or "0" when none of the known bits is set.
//
// The constants here can be found in linux/fs.h, but are hard-coded to
// avoid usage of embedded C in systemtap.
//
// See `man 2 sync_file_range` for more information.
function _sys_sync_file_range_flag_str:string (f:long) {
  retval = ""
  // Each flag is tested independently; every match appends "|NAME".
  if (f & 1) {
    retval = retval . "|WAIT_BEFORE"
  }
  if (f & 2) {
    retval = retval . "|WRITE"
  }
  if (f & 4) {
    retval = retval . "|WAIT_AFTER"
  }
  if (retval == "") {
    return "0"
  } else {
    // Trim the extra pipe at the beginning.
    return substr(retval, 1, strlen(retval) - 1)
  }
}
// Trace sync_file_range(2) on tracked fds, printing offset, byte count,
// decoded flags, return value and elapsed time.
//
// As of Ubuntu 14.04 we have to refer to sync_file_range by kernel function
// name because there's no syscall alias defined.
probe kernel.function("sys_sync_file_range").return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: sync_file_range %s %d %d %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, $offset, $nbytes,
           _sys_sync_file_range_flag_str($flags), $return, time)
  }
}
// Trace read(2) on tracked fds, printing the buffer address, requested byte
// count, return value and elapsed time.
probe syscall.read.return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    argstr = sprintf("%p, %d", $buf, $count)
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: read %s %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, argstr, $return, time)
  }
}
// Trace write(2) on tracked fds, printing the buffer address, requested byte
// count, return value and elapsed time.
probe syscall.write.return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    argstr = sprintf("%p, %d", $buf, $count)
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: write %s %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, argstr, $return, time)
  }
}
// Trace fsync(2) on tracked fds, printing the return value and elapsed time.
probe syscall.fsync.return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: fsync %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, $return, time)
  }
}
// Trace fdatasync(2) on tracked fds, printing the return value and elapsed
// time.
probe syscall.fdatasync.return {
  if ([pid(), $fd] in fds) {
    filename = fds[pid(), $fd]
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: fdatasync %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, $return, time)
  }
}
// Trace unlink(2) calls made by the target process on paths matching
// filter_path. Unlike the fd-based probes this matches on the path argument
// directly, so the file does not need to be open or tracked.
probe syscall.unlink.return {
  filename = user_string_quoted($pathname)
  if (pid() == target() && isinstr(filename, filter_path)) {
    time = gettimeofday_us() - @entry(gettimeofday_us())
    printf("[%d] %s: unlink %s -> %d (%d us)\n",
           tid(), ctime(gettimeofday_s()), filename, $return, time)
  }
}