<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
    <meta name="description" content="A new open source Apache Hadoop ecosystem project, Apache Kudu completes Hadoop's storage layer to enable fast analytics on fast data" />
    <meta name="author" content="Cloudera" />
    <title>Apache Kudu - Apache Kudu Troubleshooting</title>
    <!-- Bootstrap core CSS -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css"
          integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7"
          crossorigin="anonymous">

    <!-- Custom styles for this template -->
    <link href="/css/kudu.css" rel="stylesheet"/>
    <link href="/css/asciidoc.css" rel="stylesheet"/>
    <link rel="shortcut icon" href="/img/logo-favicon.ico" />
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.1/css/font-awesome.min.css" />

    

    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
    <!--[if lt IE 9]>
        <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
        <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
        <![endif]-->
  </head>
  <body>
    <div class="kudu-site container-fluid">
      <!-- Static navbar -->
        <nav class="navbar navbar-default">
          <div class="container-fluid">
            <div class="navbar-header">
              <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
                <span class="sr-only">Toggle navigation</span>
                <span class="icon-bar"></span>
                <span class="icon-bar"></span>
                <span class="icon-bar"></span>
              </button>
              
              <a class="logo" href="/"><img
                src="//d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_80px.png"
                srcset="//d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_80px.png 1x, //d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_160px.png 2x"
                alt="Apache Kudu"/></a>
              
            </div>
            <div id="navbar" class="collapse navbar-collapse">
              <ul class="nav navbar-nav navbar-right">
                <li >
                  <a href="/">Home</a>
                </li>
                <li >
                  <a href="/overview.html">Overview</a>
                </li>
                <li class="active">
                  <a href="/docs/">Documentation</a>
                </li>
                <li >
                  <a href="/releases/">Releases</a>
                </li>
                <li >
                  <a href="/blog/">Blog</a>
                </li>
                <!-- NOTE: this dropdown menu does not appear on Mobile, so don't add anything here
                     that doesn't also appear elsewhere on the site. -->
                <li class="dropdown">
                  <a href="/community.html" role="button" aria-haspopup="true" aria-expanded="false">Community <span class="caret"></span></a>
                  <ul class="dropdown-menu">
                    <li class="dropdown-header">GET IN TOUCH</li>
                    <li><a class="icon email" href="/community.html">Mailing Lists</a></li>
                    <li><a class="icon slack" href="https://getkudu-slack.herokuapp.com/">Slack Channel</a></li>
                    <li role="separator" class="divider"></li>
                    <li><a href="/community.html#meetups-user-groups-and-conference-presentations">Events and Meetups</a></li>
                    <li><a href="/committers.html">Project Committers</a></li>
                    <li><a href="/ecosystem.html">Ecosystem</a></li>
                    <!--<li><a href="/roadmap.html">Roadmap</a></li>-->
                    <li><a href="/community.html#contributions">How to Contribute</a></li>
                    <li role="separator" class="divider"></li>
                    <li class="dropdown-header">DEVELOPER RESOURCES</li>
                    <li><a class="icon github" href="https://github.com/apache/incubator-kudu">GitHub</a></li>
                    <li><a class="icon gerrit" href="http://gerrit.cloudera.org:8080/#/q/status:open+project:kudu">Gerrit Code Review</a></li>
                    <li><a class="icon jira" href="https://issues.apache.org/jira/browse/KUDU">JIRA Issue Tracker</a></li>
                    <li role="separator" class="divider"></li>
                    <li class="dropdown-header">SOCIAL MEDIA</li>
                    <li><a class="icon twitter" href="https://twitter.com/ApacheKudu">Twitter</a></li>
                    <li><a href="https://www.reddit.com/r/kudu/">Reddit</a></li>
                    <li role="separator" class="divider"></li>
                    <li class="dropdown-header">APACHE SOFTWARE FOUNDATION</li>
                    <li><a href="https://www.apache.org/security/" target="_blank">Security</a></li>
                    <li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Sponsorship</a></li>
                    <li><a href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a></li>
                    <li><a href="https://www.apache.org/licenses/" target="_blank">License</a></li>
                  </ul>
                </li>
                <li >
                  <a href="/faq.html">FAQ</a>
                </li>
              </ul><!-- /.nav -->
            </div><!-- /#navbar -->
          </div><!-- /.container-fluid -->
        </nav>

<!--

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->


<div class="container">
  <div class="row">
    <div class="col-md-9">

<h1>Apache Kudu Troubleshooting</h1>
      <div id="preamble">
<div class="sectionbody">
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<i class="fa icon-note" title="Note"></i>
</td>
<td class="content">
This document applies to Apache Kudu version 1.16.0. Please consult the
<a href="http://kudu.apache.org/releases/">documentation of the appropriate release</a> that&#8217;s applicable
to the version of the Kudu cluster.
</td>
</tr>
</table>
</div>
</div>
</div>
<div class="sect1">
<h2 id="_startup_errors"><a class="link" href="#_startup_errors">Startup Errors</a></h2>
<div class="sectionbody">
<div class="sect2">
<h3 id="req_hole_punching"><a class="link" href="#req_hole_punching">Errors During Hole Punching Test</a></h3>
<div class="paragraph">
<p>Kudu requires hole punching capabilities in order to be efficient. Hole punching support
depends upon your operating system kernel version and local filesystem implementation.</p>
</div>
<div class="ulist">
<ul>
<li>
<p>RHEL or CentOS 6.4 or later, patched to kernel version of 2.6.32-358 or later.
Unpatched RHEL or CentOS 6.4 does not include a kernel with support for hole punching.</p>
</li>
<li>
<p>Ubuntu 14.04 includes version 3.13 of the Linux kernel, which supports hole punching.</p>
</li>
<li>
<p>Newer versions of the ext4 and xfs filesystems support hole punching. Older versions
that do not support hole punching will cause Kudu to emit an error message such as the
following and to fail to start:</p>
<div class="listingblock">
<div class="content">
<pre>Error during hole punch test. The log block manager requires a
filesystem with hole punching support such as ext4 or xfs. On el6,
kernel version 2.6.32-358 or newer is required. To run without hole
punching (at the cost of some efficiency and scalability), reconfigure
Kudu to use the file block manager. Refer to the Kudu documentation for
more details. WARNING: the file block manager is not suitable for
production use and should be used only for small-scale evaluation and
development on systems where hole-punching is not available. It's
impossible to switch between block managers after data is written to the
server. Raw error message follows</pre>
</div>
</div>
</li>
</ul>
</div>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<i class="fa icon-note" title="Note"></i>
</td>
<td class="content">
ext4 mountpoints may actually be backed by ext2 or ext3 formatted devices, which do not
support hole punching. The hole punching test will fail when run on such filesystems. There
are several different ways to determine whether an ext4 mountpoint is backed by an ext2,
ext3, or ext4 formatted device; see <a href="https://unix.stackexchange.com/q/60723">this Stack
Exchange post</a> for details.
</td>
</tr>
</table>
</div>
<div class="paragraph">
<p>Without hole punching support, the log block manager is unsafe to use. It won&#8217;t
ever delete blocks, and will consume ever more space on disk.</p>
</div>
<div class="paragraph">
<p>You can run the following sequence of commands on a mounted file system to
check whether it supports hole punching (the originally allocated 10MiB
turns into 5MiB after punching a 5MiB hole in the file at 1MiB offset):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ dd if=/dev/zero of=hole_punch bs=1M count=10 2&gt;/dev/null
$ du -h hole_punch
10M     hole_punch
$ fallocate -p -o 1M -l 5M hole_punch
$ du -h hole_punch
5.0M    hole_punch</pre>
</div>
</div>
<div class="paragraph">
<p>If you can&#8217;t use hole punching in your environment, you can still
try Kudu. Enable the file block manager instead of the log block manager by
adding the <code>--block_manager=file</code> flag to the commands you use to start the master
and tablet servers. The file block manager does not scale as well as the log block
manager.</p>
</div>
<div class="admonitionblock warning">
<table>
<tr>
<td class="icon">
<i class="fa icon-warning" title="Warning"></i>
</td>
<td class="content">
<div class="paragraph">
<p>The file block manager is known to scale and perform poorly, and should
only be used for small-scale evaluation and development, and only on systems
where hole punching is unavailable.</p>
</div>
<div class="paragraph">
<p>The file block manager uses one file per block. As multiple blocks are written
for each rowset, the number of blocks can be very high, especially for actively
written tablets. This can cause performance issues compared to the log block
manager even with a small amount of data and it&#8217;s <strong>impossible to switch between
block managers</strong> without wiping and reinitializing the tablet servers.</p>
</div>
</td>
</tr>
</table>
</div>
</div>
<div class="sect2">
<h3 id="disk_issues"><a class="link" href="#disk_issues">Already present: FS layout already exists</a></h3>
<div class="paragraph">
<p>When Kudu starts, it checks each configured data directory, expecting either for all to be
initialized or for all to be empty. If a server fails to start with a log message like</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Check failed: _s.ok() Bad status: Already present: FS layout already exists; not overwriting existing layout: FSManager roots already exist: /data0/kudu/data</pre>
</div>
</div>
<div class="paragraph">
<p>then this precondition has failed. This could be because Kudu was configured with non-empty data
directories on first startup, or because a previously-running, healthy Kudu process was restarted
and at least one data directory was deleted or is somehow corrupted, perhaps because of a disk
error. If in the latter situation, consult the
<a href="administration.html#change_dir_config">Changing Directory Configurations</a> documentation.</p>
</div>
</div>
<div class="sect2">
<h3 id="ntp"><a class="link" href="#ntp">NTP Clock Synchronization</a></h3>
<div class="paragraph">
<p>The local clock of the machine where Kudu master or tablet server is running
must be synchronized using the Network Time Protocol (NTP) if using the <code>system</code>
time source. The time source is controlled by the <code>--time_source</code> flag and
by default is set to <code>system</code>.</p>
</div>
<div class="paragraph">
<p>Kudu requires the <strong>maximum clock error</strong> (not to be confused with the estimated
error) of the NTP-synchronized clock be below a configurable threshold.
The default threshold value is 10 seconds and it can be customized using the
<code>--max_clock_sync_error_usec</code> flag.</p>
</div>
<div class="paragraph">
<p>When running with the <code>system</code> time source, Kudu will not start and will emit
a message such as below if the local clock is reported unsynchronized:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>F0924 20:24:36.336809 14550 hybrid_clock.cc:191 Couldn't get the current time: Clock unsynchronized. Status: Service unavailable: Error reading clock. Clock considered unsynchronized.</pre>
</div>
</div>
<div class="paragraph">
<p>If the machine&#8217;s clock is synchronized, but the maximum clock error is too high,
the user will see a message such as:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Sep 17, 8:13:09.873 PM FATAL hybrid_clock.cc:196 Couldn't get the current time: Clock synchronized, but error: 11130000, is past the maximum allowable error: 10000000</pre>
</div>
</div>
<div class="paragraph">
<p>or</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Sep 17, 8:32:31.135 PM FATAL tablet_server_main.cc:38 Check failed: _s.ok() Bad status: Service unavailable: Cannot initialize clock: Cannot initialize HybridClock. Clock synchronized but error was too high (11711000 us).</pre>
</div>
</div>
<div class="paragraph">
<p>In this and following NTP-related paragraphs, when talking about the
'synchronization' with true time using NTP, we are referring to a couple of
things:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>the synchronization status of the NTP server which drives the local clock
of the machine</p>
</li>
<li>
<p>the synchronization status of the local machine&#8217;s clock itself as reported
by the kernel&#8217;s NTP discipline</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>The former can be retrieved using the <code>ntpstat</code>, <code>ntpq</code>, and <code>ntpdc</code> utilities
if using <code>ntpd</code> (they are included in the <code>ntp</code> package) or the <code>chronyc</code>
utility if using <code>chronyd</code> (that&#8217;s a part of the <code>chrony</code> package). The latter
can be retrieved using either the <code>ntptime</code> utility (the <code>ntptime</code> utility is
also a part of the <code>ntp</code> package) or the <code>chronyc</code> utility if using <code>chronyd</code>.
For more information, see the manual pages of the mentioned utilities and the
paragraphs below.</p>
</div>
<div class="sect3">
<h4 id="_ntp_related_packages"><a class="link" href="#_ntp_related_packages">NTP-related Packages</a></h4>
<div class="paragraph">
<p>For a long time, <code>ntpd</code> has been the recommended NTP server to use on Kudu
nodes to synchronize local machines' clocks. Newer releases of Linux OS offer
<code>chronyd</code> as an alternative to <code>ntpd</code> for network time synchronization. Both
have been tested and proven to provide necessary functionality for clock
synchronization in a Kudu cluster.</p>
</div>
<div class="sect4">
<h5 id="_installing_and_running_ntpd"><a class="link" href="#_installing_and_running_ntpd">Installing And Running <code>ntpd</code></a></h5>
<div class="paragraph">
<p><code>ntpd</code> is the NTP server from the ubiquitous <code>ntp</code> suite.</p>
</div>
<div class="paragraph">
<p>To install <code>ntpd</code> and other NTP-related utilities, use the appropriate command
for your operating system:</p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 50%;">
<col style="width: 50%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">OS</th>
<th class="tableblock halign-left valign-top">Command</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Debian/Ubuntu</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo apt-get install ntp</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">RHEL/CentOS</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo yum install ntp</code></p></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p>If <code>ntpd</code> is installed but not running, start it using one of these commands
(don&#8217;t forget to run <code>ntpdate</code> first):</p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 50%;">
<col style="width: 50%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">OS</th>
<th class="tableblock halign-left valign-top">Command</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Debian/Ubuntu</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo service ntp restart</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">RHEL/CentOS</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo service ntpd restart</code></p></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p>Make sure <code>ntpdate</code> is in the list of services running when the machine starts:
<code>ntpdate</code> should be run prior to starting <code>ntpd</code> to avoid a long synchronization
delay of the machine&#8217;s local clock with the true time. The smaller the offset
between local machine&#8217;s clock and the true time, the faster the NTP server can
synchronize the clock.</p>
</div>
<div class="paragraph">
<p>When running <code>ntpdate</code>, make sure the tool reports success: check its exit
status and output. In case of issues connecting to the NTP servers, make sure
NTP traffic is not being blocked by a firewall (NTP generates UDP traffic on
port 123 by default) or other network connectivity issue.</p>
</div>
<div class="paragraph">
<p>Below are a few examples of configuration files for <code>ntpd</code>. By default, <code>ntpd</code>
uses the <code>/etc/ntp.conf</code> configuration file.</p>
</div>
<div class="listingblock">
<div class="content">
<pre># Use my organization's internal NTP server (server in a local network).
server ntp1.myorg.internal iburst maxpoll 7
# Add servers from the NTP public pool for redundancy and robustness.
server 0.pool.ntp.org iburst maxpoll 8
server 1.pool.ntp.org iburst maxpoll 8
server 2.pool.ntp.org iburst maxpoll 8
server 3.pool.ntp.org iburst maxpoll 8</pre>
</div>
</div>
<div class="listingblock">
<div class="content">
<pre># AWS case: use dedicated NTP server available via link-local IP address.
server 169.254.169.123 iburst</pre>
</div>
</div>
<div class="listingblock">
<div class="content">
<pre># GCE case: use dedicated NTP server available from within cloud instance.
server metadata.google.internal iburst</pre>
</div>
</div>
<div class="paragraph">
<p>Sometimes it takes too long to synchronize the machine&#8217;s local clock with the
true time even if the <code>ntpstat</code> utility reports that the NTP daemon is
synchronized with one of the reference NTP servers. This manifests as the
following: the utilities which report on the synchronization status of the NTP
daemon claim that all is well, but <code>ntptime</code> claims that the status of the
local clock is unsynchronized and Kudu tablet servers and masters refuse to
start, outputting an error like the one mentioned above. This situation often
happens if <code>ntpd</code> is run with the <code>-x</code> option. According to the manual
page of <code>ntpd</code>, the <code>-x</code> flag configures the NTP server to only slew the clock.
Without <code>-x</code>, the NTP server would do a step adjustment instead:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>  -x     Normally, the time is slewed if the offset is less than the
         step threshold, which is 128 ms by default, and stepped if
         above the threshold. This option sets the threshold to 600 s,
         which is well within the accuracy window to set the clock manually.
         Note: Since the slew rate of typical Unix kernels is limited to
         0.5 ms/s, each second of adjustment requires an amortization
         interval of 2000 s. Thus, an adjustment as much as 600 s
         will take almost 14 days to complete.</pre>
</div>
</div>
<div class="paragraph">
<p>In such cases, removing the <code>-x</code> option will help synchronize the local clock
faster.</p>
</div>
<div class="paragraph">
<p>More information on best practices and examples of practical resolution of
various NTP synchronization issues can be found in
<a href="https://www.redhat.com/en/blog/avoiding-clock-drift-vms">this article on avoiding clock drift on VMs</a>.</p>
</div>
</div>
<div class="sect4">
<h5 id="_monitoring_clock_synchronization_status_with_the_ntp_suite"><a class="link" href="#_monitoring_clock_synchronization_status_with_the_ntp_suite">Monitoring Clock Synchronization Status With The <code>ntp</code> Suite</a></h5>
<div class="paragraph">
<p>When the <code>ntp</code> package is installed, you can monitor the synchronization status
of the machine&#8217;s clock by running <code>ntptime</code>. For example, a system
with a local clock that is synchronized may report:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>ntp_gettime() returns code 0 (OK)
  time de24c0cf.8d5da274  Tue, Feb  6 2018 16:03:27.552, (.552210980),
  maximum error 224455 us, estimated error 383 us, TAI offset 0
ntp_adjtime() returns code 0 (OK)
  modes 0x0 (),
  offset 1279.543 us, frequency 2.500 ppm, interval 1 s,
  maximum error 224455 us, estimated error 383 us,
  status 0x2001 (PLL,NANO),
  time constant 10, precision 0.001 us, tolerance 500 ppm,</pre>
</div>
</div>
<div class="paragraph">
<p>Note the following most important pieces of output:</p>
</div>
<div class="ulist">
<ul>
<li>
<p><code>maximum error 224455 us</code>: this value is well under the 10-second maximum
error required by Kudu.</p>
</li>
<li>
<p><code>status 0x2001 (PLL,NANO)</code>: this indicates the local clock is synchronized
with the true time up to the maximum error above.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>In contrast, a system with an unsynchronized local clock would report something
like the following:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>ntp_gettime() returns code 5 (ERROR)
  time de24c240.0c006000  Tue, Feb  6 2018 16:09:36.046, (.046881),
  maximum error 16000000 us, estimated error 16000000 us, TAI offset 0
ntp_adjtime() returns code 5 (ERROR)
  modes 0x0 (),
  offset 0.000 us, frequency 2.500 ppm, interval 1 s,
  maximum error 16000000 us, estimated error 16000000 us,
  status 0x40 (UNSYNC),
  time constant 10, precision 1.000 us, tolerance 500 ppm,</pre>
</div>
</div>
<div class="paragraph">
<p>The <code>UNSYNC</code> status means the local clock is not synchronized with the
true time. Because of that, the maximum reported error doesn&#8217;t convey any
meaningful estimation of the actual error.</p>
</div>
<div class="paragraph">
<p>The <code>ntpstat</code> utility reports a summary on the synchronization status of
the NTP daemon itself. For example, a system which has <code>ntpd</code> running and
synchronized with one of its reference servers may report:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ ntpstat
synchronised to NTP server (172.18.7.3) at stratum 4
   time correct to within 160 ms
   polling server every 1024 s</pre>
</div>
</div>
<div class="paragraph">
<p>Keep in mind that the synchronization status of the NTP daemon itself doesn&#8217;t
reflect the synchronization status of the local clock. The way the NTP daemon
drives the local clock is subject to many constraints, and it may take the NTP
daemon some time to synchronize the local clock after it itself has latched
to one of the reference servers.</p>
</div>
<div class="paragraph">
<p>If more detailed information is needed on the synchronization status of the
NTP server (but not the synchronization status of the local clock), the <code>ntpq</code>
or <code>ntpdc</code> tools can be used to get detailed information about what NTP server
is currently acting as the source of the true time and which are considered
as candidates (either viable or not):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ ntpq -nc lpeers
     remote           refid      st t when poll reach   delay   offset  jitter
==============================================================================
-108.59.2.24     130.133.1.10     2 u   13   64    1   71.743    0.373   0.016
+192.96.202.120  129.6.15.28      2 u   12   64    1   72.583   -0.426   0.028
-69.10.161.7     204.26.59.157    3 u   11   64    1   15.741    2.641   0.021
-173.255.206.154 45.56.123.24     3 u   10   64    1   43.502    0.199   0.029
-69.195.159.158  128.138.140.44   2 u    9   64    1   53.885   -0.016   0.013
*216.218.254.202 .CDMA.           1 u    6   64    1    1.475   -0.400   0.012
+129.250.35.250  249.224.99.213   2 u    7   64    1    1.342   -0.640   0.018

$ ntpq -nc opeers
     remote           local      st t when poll reach   delay   offset    disp
==============================================================================
-108.59.2.24     10.17.100.238    2 u   17   64    1   71.743    0.373 187.573
+192.96.202.120  10.17.100.238    2 u   16   64    1   72.583   -0.426 187.594
-69.10.161.7     10.17.100.238    3 u   15   64    1   15.741    2.641 187.569
-173.255.206.154 10.17.100.238    3 u   14   64    1   43.502    0.199 187.580
-69.195.159.158  10.17.100.238    2 u   13   64    1   53.885   -0.016 187.561
*216.218.254.202 10.17.100.238    1 u   10   64    1    1.475   -0.400 187.543
+129.250.35.250  10.17.100.238    2 u   11   64    1    1.342   -0.640 187.588</pre>
</div>
</div>
<div class="admonitionblock tip">
<table>
<tr>
<td class="icon">
<i class="fa icon-tip" title="Tip"></i>
</td>
<td class="content">
Both <code>lpeers</code> and <code>opeers</code> may be helpful as <code>lpeers</code> lists refid and
jitter, while <code>opeers</code> lists clock dispersion.
</td>
</tr>
</table>
</div>
</div>
<div class="sect4">
<h5 id="chronyd"><a class="link" href="#chronyd">Installing And Running <code>chronyd</code></a></h5>
<div class="paragraph">
<p>Kudu has been tested and is supported on machines whose local clock is
synchronized with NTP using <code>chronyd</code> version 3.2 and newer.</p>
</div>
<div class="paragraph">
<p>The OS package is called <code>chrony</code> and contains both the NTP server <code>chronyd</code>
and the <code>chronyc</code> command line utility. To install the <code>chronyd</code> NTP server
and other utilities, use the appropriate command for your operating system:</p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 50%;">
<col style="width: 50%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">OS</th>
<th class="tableblock halign-left valign-top">Command</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Debian/Ubuntu</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo apt-get install chrony</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">RHEL/CentOS</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo yum install chrony</code></p></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p>If <code>chronyd</code> is installed but not yet running, start it using one of these
commands (don&#8217;t forget to run <code>chronyd -q</code> first):</p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 50%;">
<col style="width: 50%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">OS</th>
<th class="tableblock halign-left valign-top">Command</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Debian/Ubuntu</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo service chrony restart</code></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">RHEL/CentOS</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><code>sudo service chronyd restart</code></p></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p>By default, <code>chronyd</code> uses the <code>/etc/chrony.conf</code> configuration file. The <code>rtcsync</code>
option must be enabled in <code>chrony.conf</code>. Without <code>rtcsync</code>, the local machine&#8217;s
clock will always be reported as unsynchronized and Kudu masters and tablet
servers will not be able to start. The following
<a href="https://github.com/mlichvar/chrony/blob/994409a03697b8df68115342dc8d1e7ceeeb40bd/sys_timex.c#L162-L166">code</a>
explains the observed behavior of <code>chronyd</code> when setting the synchronization
status of the local clock on Linux.</p>
</div>
<div class="paragraph">
<p>As verified on RHEL7.5/CentOS7.5 with <code>chronyd</code> 3.2 and newer, the default
configuration file is good enough to satisfy Kudu requirements for the system
clock if running on a machine that has Internet access.</p>
</div>
<div class="paragraph">
<p>An <a href="https://chrony.tuxfamily.org/faq.html#_what_is_the_minimum_recommended_configuration_for_an_ntp_client">example of a minimum viable configuration</a> for <code>chronyd</code> is:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>pool pool.ntp.org iburst
driftfile /var/lib/chrony/drift
makestep 1 3
rtcsync</pre>
</div>
</div>
</div>
<div class="sect4">
<h5 id="_monitoring_clock_synchronization_status_with_the_chrony_suite"><a class="link" href="#_monitoring_clock_synchronization_status_with_the_chrony_suite">Monitoring Clock Synchronization Status With The <code>chrony</code> Suite</a></h5>
<div class="paragraph">
<p>When the <code>chrony</code> package is installed, you can monitor the synchronization
status of the machine&#8217;s clock by running <code>chronyc tracking</code> (add <code>-n</code> option
if no resolution of IP addresses back to FQDNs is desired:
<code>chronyc -n tracking</code>).</p>
</div>
<div class="paragraph">
<p>For example, a system where <code>chronyd</code> hasn&#8217;t synchronized the local clock yet
may report something like the following:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Reference ID    : 00000000 ()
Stratum         : 0
Ref time (UTC)  : Thu Jan 01 00:00:00 1970
System time     : 0.000000000 seconds fast of NTP time
Last offset     : +0.000000000 seconds
RMS offset      : 0.000000000 seconds
Frequency       : 69.422 ppm slow
Residual freq   : +0.000 ppm
Skew            : 0.000 ppm
Root delay      : 1.000000000 seconds
Root dispersion : 1.000000000 seconds
Update interval : 0.0 seconds
Leap status     : Not synchronised</pre>
</div>
</div>
<div class="paragraph">
<p>A system with its local clock already synchronized may report:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Reference ID    : A9FEA9FE (169.254.169.254)
Stratum         : 3
Ref time (UTC)  : Tue Mar 03 06:33:23 2020
System time     : 0.000011798 seconds fast of NTP time
Last offset     : +0.000014285 seconds
RMS offset      : 0.001493311 seconds
Frequency       : 69.417 ppm slow
Residual freq   : +0.000 ppm
Skew            : 0.006 ppm
Root delay      : 0.000786347 seconds
Root dispersion : 0.000138749 seconds
Update interval : 1036.7 seconds
Leap status     : Normal</pre>
</div>
</div>
<div class="paragraph">
<p>Note the following important pieces of output:</p>
</div>
<div class="ulist">
<ul>
<li>
<p><code>Root delay</code>: the total of the network path delays (round trips)
to the Stratum 1 server with which this <code>chronyd</code> instance is synchronized.</p>
</li>
<li>
<p><code>Root dispersion</code>: the total dispersion accumulated through all the paths up
to the Stratum 1 server with which this <code>chronyd</code> instance is synchronized.</p>
</li>
<li>
<p><code>Leap status</code>: whether the local clock is synchronized with the true time
up to the maximum error (see below). The <code>Normal</code> status means the clock is
synchronized, and <code>Not synchronised</code> naturally means otherwise.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>An absolute bound on the error of the clock maintained internally by <code>chronyd</code>
at the time of the last NTP update can be expressed as:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>clock_error &lt;= abs(last_offset) + (root_delay / 2) + root_dispersion</pre>
</div>
</div>
<div class="paragraph">
<p><code>chronyc sources</code> reports on the list of reference NTP servers:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>210 Number of sources = 4
MS Name/IP address         Stratum Poll Reach LastRx Last sample
===============================================================================
^* 169.254.169.254               2  10   377   371   +240us[ +254us] +/-  501us
^- 64.62.190.177                 3  11   377   102  +1033us[+1033us] +/-   81ms
^- 64.246.132.14                 1  11   377   129   +323us[ +323us] +/-   16ms
^- 184.105.182.16                2  10   377   130  -4719us[-4719us] +/-   55ms</pre>
</div>
</div>
<div class="paragraph">
<p>To get more details on the measurement stats for reference NTP servers use
<code>chronyc sourcestats</code>:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>210 Number of sources = 4
Name/IP Address            NP  NR  Span  Frequency  Freq Skew  Offset  Std Dev
==============================================================================
169.254.169.254            46  27  323m     +0.000      0.006    +72ns    68us
64.62.190.177              12  10  224m     +0.071      0.050  +1240us   154us
64.246.132.14              21  13  326m     +0.012      0.030   +434us   230us
184.105.182.16              6   3   86m     +0.252      0.559  -5097us   306us</pre>
</div>
</div>
<div class="paragraph">
<p>Use <code>chronyc ntpdata [server]</code> to get information on a particular reference
server (or all servers if the <code>server</code> parameter is omitted):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Remote address  : 169.254.169.254 (A9FEA9FE)
Remote port     : 123
Local address   : 172.31.113.1 (AC1F7101)
Leap status     : Normal
Version         : 4
Mode            : Server
Stratum         : 2
Poll interval   : 10 (1024 seconds)
Precision       : -20 (0.000000954 seconds)
Root delay      : 0.000229 seconds
Root dispersion : 0.000107 seconds
Reference ID    : 474F4F47 ()
Reference time  : Tue Mar 03 06:33:24 2020
Offset          : -0.000253832 seconds
Peer delay      : 0.000557465 seconds
Peer dispersion : 0.000000987 seconds
Response time   : 0.000000001 seconds
Jitter asymmetry: +0.50
NTP tests       : 111 111 1111
Interleaved     : No
Authenticated   : No
TX timestamping : Daemon
RX timestamping : Kernel
Total TX        : 50
Total RX        : 50
Total valid RX  : 50</pre>
</div>
</div>
<div class="paragraph">
<p>For troubleshooting tips on clock synchronization with <code>chronyd</code>, see
<a href="https://chrony.tuxfamily.org/faq.html#_computer_is_not_synchronising">this
useful guide</a>.</p>
</div>
</div>
</div>
<div class="sect3">
<h4 id="_ntp_configuration_best_practices"><a class="link" href="#_ntp_configuration_best_practices">NTP Configuration Best Practices</a></h4>
<div class="paragraph">
<p>In order to provide stable time synchronization with low maximum error, follow
these NTP configuration best practices.</p>
</div>
<div class="paragraph">
<p><strong>Run <code>ntpdate</code> (or its alternatives <code>ntpd -q</code> or <code>chronyd -q</code> in case of chrony)
prior to running the NTP server.</strong> If the offset of the local clock is too far
from the true time, it can take a long time before the NTP server synchronizes
the local clock, even if it&#8217;s allowed to perform step adjustments. So, after
configuring <code>ntpd</code> or <code>chronyd</code>, first run the <code>ntpdate</code> tool with the same set
of NTP servers or run <code>ntpd -q/chronyd -q</code>. It&#8217;s assumed that the NTP server
is not running when <code>ntpdate/ntpd -q/chronyd -q</code> is run. On RHEL/CentOS, if
using the <code>ntp</code> suite, enable the <code>ntpdate</code> service; if using the <code>chrony</code>
suite, enable the <code>chrony-wait</code> service.</p>
</div>
<div class="paragraph">
<p><strong>In certain public cloud environments, use the highly-available NTP server
accessible via link-local IP address or other dedicated NTP server provided
as a service.</strong> If your cluster is running in a public cloud environment,
consult the cloud provider&#8217;s documentation for the recommended NTP setup.
Both AWS and GCE clouds offer dedicated highly available NTP servers accessible
from within a cloud instance via link-local IP address.</p>
</div>
<div class="paragraph">
<p><strong>Unless using a highly-available NTP reference server accessible via a link-local
address, always configure at least four time sources for the NTP server at the
local machine.</strong> In addition to providing redundancy in case one of the time sources
becomes unavailable, this might make the configuration more robust since
NTP is designed to increase its accuracy with a diversity of sources in networks
with higher round-trip times and jitter.</p>
</div>
<div class="paragraph">
<p><strong>Use the <code>iburst</code> option for faster synchronization at startup</strong>. The <code>iburst</code>
option instructs the NTP server (both <code>ntpd</code> and <code>chronyd</code>) to send an initial
"burst" of time queries at startup.  This results in a faster synchronization
of the <code>ntpd/chronyd</code> with their reference servers upon startup.</p>
</div>
<div class="paragraph">
<p><strong>If the maximum clock error goes beyond the default threshold set by Kudu
(10 seconds), consider setting a lower value for the <code>maxpoll</code> option for every
NTP server in <code>ntp.conf/chrony.conf</code></strong>. For example, consider setting
<code>maxpoll</code> to 7, which will cause the NTP daemon to make requests to the
corresponding NTP server at least every 128 seconds. The default maximum poll
interval is 10 (1024 seconds) for both <code>ntpd</code> and <code>chronyd</code>.</p>
</div>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<i class="fa icon-note" title="Note"></i>
</td>
<td class="content">
<div class="paragraph">
<p>If using a custom <code>maxpoll</code> interval, don&#8217;t set <code>maxpoll</code> too low (e.g., lower
than 6) to avoid flooding NTP servers, especially the public ones. Otherwise
they may blacklist the client (i.e. the NTP daemon on your machine) and cease
providing NTP service at all. If in doubt, consult the <code>ntp.conf</code> or
<code>chrony.conf</code> manual page, respectively.</p>
</div>
</td>
</tr>
</table>
</div>
</div>
<div class="sect3">
<h4 id="_troubleshooting_ntp_stability_problems"><a class="link" href="#_troubleshooting_ntp_stability_problems">Troubleshooting NTP Stability Problems</a></h4>
<div class="paragraph">
<p>As of Kudu 1.6.0, both <code>kudu-master</code> and <code>kudu-tserver</code> are able to continue to
operate during a brief loss of clock synchronization. If clock synchronization
is lost for several hours, they may crash. If the <code>kudu-master</code> or <code>kudu-tserver</code>
process crashes due to clock synchronization issues, consult the <code>ERROR</code> log
for a dump of related information which may help to diagnose the issue.</p>
</div>
<div class="admonitionblock tip">
<table>
<tr>
<td class="icon">
<i class="fa icon-tip" title="Tip"></i>
</td>
<td class="content">
Kudu 1.5.0 and earlier versions were less resilient to brief NTP outages. In
addition, they contained a <a href="https://issues.apache.org/jira/browse/KUDU-2209">bug</a>
which could cause Kudu to incorrectly measure the maximum error, resulting in
crashes. If you experience crashes related to clock synchronization on these
earlier versions of Kudu and it appears that the system&#8217;s NTP configuration
is correct, consider upgrading to Kudu 1.6.0 or later.
</td>
</tr>
</table>
</div>
<div class="admonitionblock tip">
<table>
<tr>
<td class="icon">
<i class="fa icon-tip" title="Tip"></i>
</td>
<td class="content">
If using NTP servers other than link-local ones, it may take some time for the
NTP server running on a local machine to synchronize with one of its reference
servers in case of network connectivity issues. In case of a spotty network
between the machine and the reference NTP servers, <code>ntpd/chronyd</code> may become
unsynchronized with its reference NTP servers. If that happens, consider finding
another set of reference NTP servers: the best bet is to use NTP servers in the
local network or *.pool.ntp.org servers.
</td>
</tr>
</table>
</div>
</div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="disk_space_usage"><a class="link" href="#disk_space_usage">Disk Space Usage</a></h2>
<div class="sectionbody">
<div class="paragraph">
<p>When using the log block manager (the default on Linux), Kudu uses
<a href="https://en.wikipedia.org/wiki/Sparse_file">sparse files</a> to store data. A
sparse file has a different apparent size than the actual amount of disk space
it uses. This means that some tools may inaccurately report the disk space
used by Kudu. For example, the size listed by <code>ls -l</code> does not accurately
reflect the disk space used by Kudu data files:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ ls -lh /data/kudu/tserver/data
total 117M
-rw------- 1 kudu kudu 160M Mar 26 19:37 0b9807b8b17d48a6a7d5b16bf4ac4e6d.data
-rw------- 1 kudu kudu 4.4K Mar 26 19:37 0b9807b8b17d48a6a7d5b16bf4ac4e6d.metadata
-rw------- 1 kudu kudu  32M Mar 26 19:37 2f26eeacc7e04b65a009e2c9a2a8bd20.data
-rw------- 1 kudu kudu 4.3K Mar 26 19:37 2f26eeacc7e04b65a009e2c9a2a8bd20.metadata
-rw------- 1 kudu kudu 672M Mar 26 19:37 30a2dd2cd3554d8a9613f588a8d136ff.data
-rw------- 1 kudu kudu 4.4K Mar 26 19:37 30a2dd2cd3554d8a9613f588a8d136ff.metadata
-rw------- 1 kudu kudu  32M Mar 26 19:37 7434c83c5ec74ae6af5974e4909cbf82.data
-rw------- 1 kudu kudu 4.3K Mar 26 19:37 7434c83c5ec74ae6af5974e4909cbf82.metadata
-rw------- 1 kudu kudu 672M Mar 26 19:37 772d070347a04f9f8ad2ad3241440090.data
-rw------- 1 kudu kudu 4.4K Mar 26 19:37 772d070347a04f9f8ad2ad3241440090.metadata
-rw------- 1 kudu kudu 160M Mar 26 19:37 86e50a95531f46b6a79e671e6f5f4151.data
-rw------- 1 kudu kudu 4.4K Mar 26 19:37 86e50a95531f46b6a79e671e6f5f4151.metadata
-rw------- 1 kudu kudu  687 Mar 26 19:26 block_manager_instance</pre>
</div>
</div>
<div class="paragraph">
<p>Notice that the total size reported is 117MiB, while the first file&#8217;s size is
listed as 160MiB. Adding the <code>-s</code> option to <code>ls</code> will cause <code>ls</code> to output the
file&#8217;s disk space usage.</p>
</div>
<div class="paragraph">
<p>The <code>du</code> and <code>df</code> utilities report the actual disk space usage by default.</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ du -h /data/kudu/tserver/data
118M   /data/kudu/tserver/data</pre>
</div>
</div>
<div class="paragraph">
<p>The apparent size can be shown with the <code>--apparent-size</code> flag to <code>du</code>.</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ du -h --apparent-size /data/kudu/tserver/data
1.7G  /data/kudu/tserver/data</pre>
</div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="crash_reporting"><a class="link" href="#crash_reporting">Reporting Kudu Crashes</a></h2>
<div class="sectionbody">
<div class="paragraph">
<p>Kudu uses the
<a href="https://chromium.googlesource.com/breakpad/breakpad/">Google Breakpad</a>
library to generate a minidump whenever Kudu experiences a crash. These
minidumps are typically only a few MB in size and are generated even if core
dump generation is disabled. At this time, generating minidumps is only
possible in Kudu on Linux builds.</p>
</div>
<div class="paragraph">
<p>A minidump file contains important debugging information about the process that
crashed, including shared libraries loaded and their versions, a list of
threads running at the time of the crash, the state of the processor registers
and a copy of the stack memory for each thread, and CPU and operating system
version information.</p>
</div>
<div class="paragraph">
<p>It is also possible to force Kudu to create a minidump without killing the
process by sending a <code>USR1</code> signal to the <code>kudu-tserver</code> or <code>kudu-master</code>
process. For example:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>sudo pkill -USR1 kudu-tserver</pre>
</div>
</div>
<div class="paragraph">
<p>By default, Kudu stores its minidumps in a subdirectory of its configured glog
directory called <code>minidumps</code>. This location can be customized by setting the
<code>--minidump_path</code> flag. Kudu will retain only a certain number of minidumps
before deleting the oldest ones, in an effort to avoid filling up the disk with
minidump files. The maximum number of minidumps that will be retained can be
controlled by setting the <code>--max_minidumps</code> gflag.</p>
</div>
<div class="paragraph">
<p>Minidumps contain information specific to the binary that created them and so
are not usable without access to the exact binary that crashed, or a very
similar binary. For more information on processing and using minidump files,
see <code>scripts/dump_breakpad_symbols.py</code> in the Kudu source tree.</p>
</div>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<i class="fa icon-note" title="Note"></i>
</td>
<td class="content">
A minidump can be emailed to a Kudu developer or attached to a JIRA in
order to help a Kudu developer debug a crash. In order for it to be useful, the
developer will need to know the exact version of Kudu and the operating system
where the crash was observed. Note that while a minidump does not contain a
heap memory dump, it does contain stack memory and therefore it is possible for
application data to appear in a minidump. If confidential or personal
information is stored on the cluster, do not share minidump files.
</td>
</tr>
</table>
</div>
</div>
</div>
<div class="sect1">
<h2 id="_performance_troubleshooting"><a class="link" href="#_performance_troubleshooting">Performance Troubleshooting</a></h2>
<div class="sectionbody">
<div class="sect2">
<h3 id="kudu_tracing"><a class="link" href="#kudu_tracing">Kudu Tracing</a></h3>
<div class="paragraph">
<p>The <code>kudu-master</code> and <code>kudu-tserver</code> daemons include built-in tracing support
based on the open source
<a href="https://www.chromium.org/developers/how-tos/trace-event-profiling-tool">Chromium Tracing</a>
framework. You can use tracing to help diagnose latency issues or other problems
on Kudu servers.</p>
</div>
<div class="sect3">
<h4 id="_accessing_the_tracing_interface"><a class="link" href="#_accessing_the_tracing_interface">Accessing the tracing interface</a></h4>
<div class="paragraph">
<p>The tracing interface is accessed via a web browser as part of the
embedded web server in each of the Kudu daemons.</p>
</div>
<table class="tableblock frame-all grid-all stretch">
<caption class="title">Table 1. Tracing Interface URLs</caption>
<colgroup>
<col style="width: 50%;">
<col style="width: 50%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">Daemon</th>
<th class="tableblock halign-left valign-top">URL</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Tablet Server</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a href="http://tablet-server-1.example.com:8050/tracing.html" class="bare">http://tablet-server-1.example.com:8050/tracing.html</a></p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">Master</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><a href="http://master-1.example.com:8051/tracing.html" class="bare">http://master-1.example.com:8051/tracing.html</a></p></td>
</tr>
</tbody>
</table>
<div class="admonitionblock warning">
<table>
<tr>
<td class="icon">
<i class="fa icon-warning" title="Warning"></i>
</td>
<td class="content">
The tracing interface is known to work in recent versions of Google Chrome.
Other browsers may not work as expected.
</td>
</tr>
</table>
</div>
</div>
<div class="sect3">
<h4 id="_collecting_a_trace"><a class="link" href="#_collecting_a_trace">Collecting a trace</a></h4>
<div class="paragraph">
<p>After navigating to the tracing interface, click the <strong>Record</strong> button in the top left corner
of the screen. When beginning to diagnose a problem, start by selecting all categories.
Click <strong>Record</strong> to begin recording a trace.</p>
</div>
<div class="paragraph">
<p>During the trace collection, events are collected into an in-memory ring buffer.
This ring buffer is fixed in size, so it will eventually fill up to 100%. However, new events
are still being collected while older events are being removed. While recording the trace,
trigger the behavior or workload you are interested in exploring.</p>
</div>
<div class="paragraph">
<p>After collecting for several seconds, click <strong>Stop</strong>. The collected trace will be
downloaded and displayed. Use the <strong>?</strong> key to display help text about using the tracing
interface to explore the trace.</p>
</div>
</div>
<div class="sect3">
<h4 id="_saving_a_trace"><a class="link" href="#_saving_a_trace">Saving a trace</a></h4>
<div class="paragraph">
<p>You can save collected traces as JSON files for later analysis by clicking <strong>Save</strong>
after collecting the trace. To load and analyze a saved JSON file, click <strong>Load</strong>
and choose the file.</p>
</div>
</div>
</div>
<div class="sect2">
<h3 id="_rpc_timeout_traces"><a class="link" href="#_rpc_timeout_traces">RPC Timeout Traces</a></h3>
<div class="paragraph">
<p>If client applications are experiencing RPC timeouts, the Kudu tablet server
<code>WARNING</code> level logs should contain a log entry which includes an RPC-level trace. For example:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>W0922 00:56:52.313848 10858 inbound_call.cc:193] Call kudu.consensus.ConsensusService.UpdateConsensus
from 192.168.1.102:43499 (request call id 3555909) took 1464ms (client timeout 1000).
W0922 00:56:52.314888 10858 inbound_call.cc:197] Trace:
0922 00:56:50.849505 (+     0us) service_pool.cc:97] Inserting onto call queue
0922 00:56:50.849527 (+    22us) service_pool.cc:158] Handling call
0922 00:56:50.849574 (+    47us) raft_consensus.cc:1008] Updating replica for 2 ops
0922 00:56:50.849628 (+    54us) raft_consensus.cc:1050] Early marking committed up to term: 8 index: 880241
0922 00:56:50.849968 (+   340us) raft_consensus.cc:1056] Triggering prepare for 2 ops
0922 00:56:50.850119 (+   151us) log.cc:420] Serialized 1555 byte log entry
0922 00:56:50.850213 (+    94us) raft_consensus.cc:1131] Marking committed up to term: 8 index: 880241
0922 00:56:50.850218 (+     5us) raft_consensus.cc:1148] Updating last received op as term: 8 index: 880243
0922 00:56:50.850219 (+     1us) raft_consensus.cc:1195] Filling consensus response to leader.
0922 00:56:50.850221 (+     2us) raft_consensus.cc:1169] Waiting on the replicates to finish logging
0922 00:56:52.313763 (+1463542us) raft_consensus.cc:1182] finished
0922 00:56:52.313764 (+     1us) raft_consensus.cc:1190] UpdateReplicas() finished
0922 00:56:52.313788 (+    24us) inbound_call.cc:114] Queueing success response</pre>
</div>
</div>
<div class="paragraph">
<p>These traces can give an indication of which part of the request was slow. Please
include them in bug reports related to RPC latency outliers.</p>
</div>
</div>
<div class="sect2">
<h3 id="_kernel_stack_watchdog_traces"><a class="link" href="#_kernel_stack_watchdog_traces">Kernel Stack Watchdog Traces</a></h3>
<div class="paragraph">
<p>Each Kudu server process has a background thread called the Stack Watchdog, which
monitors the other threads in the server in case they have blocked for
longer-than-expected periods of time. These traces can indicate operating system issues
or bottlenecked storage.</p>
</div>
<div class="paragraph">
<p>When the watchdog thread identifies a case of thread blockage, it logs an entry
in the <code>WARNING</code> log like the following:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>W0921 23:51:54.306350 10912 kernel_stack_watchdog.cc:111] Thread 10937 stuck at /data/kudu/consensus/log.cc:505 for 537ms:
Kernel stack:
[&lt;ffffffffa00b209d&gt;] do_get_write_access+0x29d/0x520 [jbd2]
[&lt;ffffffffa00b2471&gt;] jbd2_journal_get_write_access+0x31/0x50 [jbd2]
[&lt;ffffffffa00fe6d8&gt;] __ext4_journal_get_write_access+0x38/0x80 [ext4]
[&lt;ffffffffa00d9b23&gt;] ext4_reserve_inode_write+0x73/0xa0 [ext4]
[&lt;ffffffffa00d9b9c&gt;] ext4_mark_inode_dirty+0x4c/0x1d0 [ext4]
[&lt;ffffffffa00d9e90&gt;] ext4_dirty_inode+0x40/0x60 [ext4]
[&lt;ffffffff811ac48b&gt;] __mark_inode_dirty+0x3b/0x160
[&lt;ffffffff8119c742&gt;] file_update_time+0xf2/0x170
[&lt;ffffffff8111c1e0&gt;] __generic_file_aio_write+0x230/0x490
[&lt;ffffffff8111c4c8&gt;] generic_file_aio_write+0x88/0x100
[&lt;ffffffffa00d3fb1&gt;] ext4_file_write+0x61/0x1e0 [ext4]
[&lt;ffffffff81180f5b&gt;] do_sync_readv_writev+0xfb/0x140
[&lt;ffffffff81181ee6&gt;] do_readv_writev+0xd6/0x1f0
[&lt;ffffffff81182046&gt;] vfs_writev+0x46/0x60
[&lt;ffffffff81182102&gt;] sys_pwritev+0xa2/0xc0
[&lt;ffffffff8100b072&gt;] system_call_fastpath+0x16/0x1b
[&lt;ffffffffffffffff&gt;] 0xffffffffffffffff

User stack:
    @       0x3a1ace10c4  (unknown)
    @          0x1262103  (unknown)
    @          0x12622d4  (unknown)
    @          0x12603df  (unknown)
    @           0x8e7bfb  (unknown)
    @           0x8f478b  (unknown)
    @           0x8f55db  (unknown)
    @          0x12a7b6f  (unknown)
    @       0x3a1b007851  (unknown)
    @       0x3a1ace894d  (unknown)
    @              (nil)  (unknown)</pre>
</div>
</div>
<div class="paragraph">
<p>These traces can be useful for diagnosing root-cause latency issues when they are caused by systems
below Kudu, such as disk controllers or filesystems.</p>
</div>
</div>
<div class="sect2">
<h3 id="memory_limits"><a class="link" href="#memory_limits">Memory Limits</a></h3>
<div class="paragraph">
<p>Kudu has a hard and a soft memory limit. The hard memory limit is the maximum amount a Kudu process
is allowed to use, and is controlled by the <code>--memory_limit_hard_bytes</code> flag. The soft memory limit
is a percentage of the hard memory limit, controlled by the flag <code>--memory_limit_soft_percentage</code> and
with a default value of 80%, that determines the amount of memory a process may use before it will
start rejecting some write operations.</p>
</div>
<div class="paragraph">
<p>If the logs or RPC traces contain messages like</p>
</div>
<div class="listingblock">
<div class="content">
<pre>Service unavailable: Soft memory limit exceeded (at 96.35% of capacity)</pre>
</div>
</div>
<div class="paragraph">
<p>then Kudu is rejecting writes due to memory backpressure. This may result in write timeouts. There
are several ways to relieve the memory pressure on Kudu:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>If the host has more memory available for Kudu, increase <code>--memory_limit_hard_bytes</code>.</p>
</li>
<li>
<p>Increase the rate at which Kudu can flush writes from memory to disk by increasing the number of
disks or increasing the number of maintenance manager threads <code>--maintenance_manager_num_threads</code>.
Generally, the recommended ratio of maintenance manager threads to data directories is 1:3.</p>
</li>
<li>
<p>Reduce the volume of writes flowing to Kudu on the application side.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>Finally, on versions of Kudu prior to 1.8, check the value of
<code>--block_cache_capacity_mb</code>. This setting determines the maximum size of Kudu&#8217;s
block cache. While a higher value can help with read and write performance,
do not raise <code>--block_cache_capacity_mb</code> above the memory pressure threshold,
which is <code>--memory_pressure_percentage</code> (default 60%) of
<code>--memory_limit_hard_bytes</code>, as this will cause Kudu to flush aggressively even
if write throughput is low. Keeping <code>--block_cache_capacity_mb</code> below 50% of the
memory pressure threshold is recommended. With the defaults, this means
<code>--block_cache_capacity_mb</code> should not exceed 30% of
<code>--memory_limit_hard_bytes</code>. On Kudu 1.8 and higher, servers will refuse to
start if the block cache capacity exceeds the memory pressure threshold.</p>
</div>
</div>
<div class="sect2">
<h3 id="block_cache_size"><a class="link" href="#block_cache_size">Block Cache Size</a></h3>
<div class="paragraph">
<p>Kudu uses an LRU cache for recently read data. On workloads that scan a subset
of the data repeatedly, raising the size of this cache can offer significant
performance benefits. To increase the amount of memory dedicated to the block
cache, increase the value of the flag <code>--block_cache_capacity_mb</code>. The default
is 512MiB.</p>
</div>
<div class="paragraph">
<p>Kudu provides a set of useful metrics for evaluating the performance of the
block cache, which can be found on the <code>/metrics</code> endpoint of the web UI. An
example set:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>{
  "name": "block_cache_inserts",
  "value": 64
},
{
  "name": "block_cache_lookups",
  "value": 512
},
{
  "name": "block_cache_evictions",
  "value": 0
},
{
  "name": "block_cache_misses",
  "value": 96
},
{
  "name": "block_cache_misses_caching",
  "value": 64
},
{
  "name": "block_cache_hits",
  "value": 0
},
{
  "name": "block_cache_hits_caching",
  "value": 352
},
{
  "name": "block_cache_usage",
  "value": 6976
}</pre>
</div>
</div>
<div class="paragraph">
<p>To judge the efficiency of the block cache on a tablet server, first wait until
the server has been running and serving normal requests for some time, so the
cache is not cold. Unless the server stores very little data or is idle,
<code>block_cache_usage</code> should be equal or nearly equal to <code>block_cache_capacity_mb</code>.
Once the cache has reached steady state, compare <code>block_cache_lookups</code> to
<code>block_cache_misses_caching</code>. The latter metric counts the number of blocks that
Kudu expected to read from cache but which weren&#8217;t found in the cache. If a
significant number of lookups result in misses on expected cache hits, and the
<code>block_cache_evictions</code> metric is significant compared to <code>block_cache_inserts</code>,
then raising the size of the block cache may provide a performance boost.
However, the utility of the block cache is highly dependent on workload, so it&#8217;s
necessary to test the benefits of a larger block cache.</p>
</div>
<div class="admonitionblock warning">
<table>
<tr>
<td class="icon">
<i class="fa icon-warning" title="Warning"></i>
</td>
<td class="content">
Do not raise the block cache size <code>--block_cache_capacity_mb</code> higher
than the memory pressure threshold (defaults to 60% of <code>--memory_limit_hard_bytes</code>).
As this would cause poor flushing behavior, Kudu servers version 1.8 and higher
will refuse to start when misconfigured in this way.
</td>
</tr>
</table>
</div>
</div>
<div class="sect2">
<h3 id="heap_sampling"><a class="link" href="#heap_sampling">Heap Sampling</a></h3>
<div class="paragraph">
<p>For advanced debugging of memory usage, released builds of Kudu enable Heap Sampling by default.
This allows Kudu developers to associate memory usage with the specific lines of code and data
structures responsible. When reporting a bug related to memory usage or an apparent memory leak,
heap profiling can give quantitative data to pinpoint the issue.</p>
</div>
<div class="paragraph">
<p>If heap sampling is enabled, the current sampled heap occupancy can be retrieved over HTTP
by visiting <code><a href="http://tablet-server.example.com:8050/pprof/heap" class="bare">http://tablet-server.example.com:8050/pprof/heap</a></code> or
<code><a href="http://master.example.com:8051/pprof/heap" class="bare">http://master.example.com:8051/pprof/heap</a></code>. The output is a machine-readable dump of the
stack traces with their associated heap usage.</p>
</div>
<div class="paragraph">
<p>Rather than visiting the heap profile page directly in a web browser, it is typically
more useful to use the <code>pprof</code> tool that is distributed as part of the <code>gperftools</code>
open source project. For example, a developer with a local build tree can use the
following command to collect the sampled heap usage and output an SVG diagram:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>thirdparty/installed/uninstrumented/bin/pprof -svg  'http://localhost:8051/pprof/heap' &gt; /tmp/heap.svg</pre>
</div>
</div>
<div class="paragraph">
<p>The resulting SVG may be visualized in a web browser or sent to the Kudu community to help
troubleshoot memory occupancy issues.</p>
</div>
<div class="admonitionblock tip">
<table>
<tr>
<td class="icon">
<i class="fa icon-tip" title="Tip"></i>
</td>
<td class="content">
Heap samples contain only summary information about allocations and do not contain any
<em>data</em> from the heap. It is safe to share heap samples in public without fear of exposing
confidential or sensitive data.
</td>
</tr>
</table>
</div>
</div>
<div class="sect2">
<h3 id="slow_dns_nscd"><a class="link" href="#slow_dns_nscd">Slow DNS Lookups and <code>nscd</code></a></h3>
<div class="paragraph">
<p>For better scalability on nodes hosting many replicas, we recommend that you use
<code>nscd</code> (name service cache daemon) to cache both DNS name resolution and static name resolution (via <code>/etc/hosts</code>).</p>
</div>
<div class="paragraph">
<p>When DNS lookups are slow, you will see a log message similar to the following:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>W0926 11:19:01.339553 27231 net_util.cc:193] Time spent resolve address for kudu-tserver.example.com: real 4.647s    user 0.000s     sys 0.000s</pre>
</div>
</div>
<div class="paragraph">
<p><code>nscd</code> (name service cache daemon) can alleviate slow name resolution by providing
a cache for the most common name service requests, such as for passwords, groups,
and hosts.</p>
</div>
<div class="paragraph">
<p>Refer to your operating system documentation for how to install and enable <code>nscd</code>.</p>
</div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="_issues_using_kudu"><a class="link" href="#_issues_using_kudu">Issues using Kudu</a></h2>
<div class="sectionbody">
<div class="sect2">
<h3 id="hive_handler"><a class="link" href="#hive_handler">ClassNotFoundException: com.cloudera.kudu.hive.KuduStorageHandler</a></h3>
<div class="paragraph">
<p>Users will encounter this exception when trying to use a Kudu table via Hive. This
is not a case of a missing jar, but simply that Impala stores Kudu metadata in
Hive in a format that&#8217;s unreadable to other tools, including Hive itself and Spark.
There is no workaround for Hive users. Spark users need to create temporary tables.</p>
</div>
</div>
<div class="sect2">
<h3 id="too_many_threads"><a class="link" href="#too_many_threads">Runtime error: Could not create thread: Resource temporarily unavailable (error 11)</a></h3>
<div class="paragraph">
<p>Users will encounter this error when Kudu is unable to create more threads,
usually on versions of Kudu older than 1.7. It happens on tablet servers, and
is a sign that the tablet server hosts too many tablet replicas. To fix the
issue, users can raise the <code>nproc</code> ulimit as detailed in the documentation for
their operating system or distribution. However, the better solution is to
reduce the number of replicas on the tablet server. This may involve rethinking
the table&#8217;s partitioning schema. For the recommended limits on number of
replicas per tablet server, see the known issues and scaling limitations
documentation for the appropriate Kudu release. The
<a href="http://kudu.apache.org/releases/">releases page</a> has links to documentation
for previous versions of Kudu; for the latest release, see the
<a href="known_issues.html">known issues page</a>.</p>
</div>
</div>
<div class="sect2">
<h3 id="tombstoned_or_stopped_tablets"><a class="link" href="#tombstoned_or_stopped_tablets">Tombstoned or STOPPED tablet replicas</a></h3>
<div class="paragraph">
<p>Users may notice some replicas on a tablet server are in a STOPPED state, and
remain on the server indefinitely. These replicas are tombstones. A tombstone
indicates that the tablet server once held a bona fide replica of its tablet.
For example, if a tablet server goes down and its replicas are re-replicated
elsewhere, then when the tablet server rejoins the cluster its replicas will become
tombstones. A tombstone will remain until the table it belongs to is deleted, or
a new replica of the same tablet is placed on the tablet server. A count of
tombstoned replicas and details of each one are available on the /tablets page
of the tablet server web UI.</p>
</div>
<div class="paragraph">
<p>The Raft consensus algorithm that Kudu uses for replication requires tombstones
for correctness in certain rare situations. They consume minimal resources and
hold no data. They must not be deleted.</p>
</div>
</div>
<div class="sect2">
<h3 id="cfile_corruption"><a class="link" href="#cfile_corruption">Corruption: checksum error on CFile block</a></h3>
<div class="paragraph">
<p>In versions prior to Kudu 1.8.0, if the data on disk becomes corrupt, users
will encounter warnings containing "Corruption: checksum error on CFile block"
in the tablet server logs and client side errors when trying to scan tablets
with corrupt CFile blocks. Fixing this corruption is a manual process.</p>
</div>
<div class="paragraph">
<p>To fix the issue, users can first identify all the affected tablets by
running a checksum scan on the affected tables or tablets using the
<code><a href="command_line_tools_reference.html#cluster-ksck">ksck</a></code> tool.</p>
</div>
<div class="listingblock">
<div class="content">
<pre>sudo -u kudu kudu cluster ksck &lt;master_addresses&gt; -checksum_scan -tables=&lt;tables&gt;
sudo -u kudu kudu cluster ksck &lt;master_addresses&gt; -checksum_scan -tablets=&lt;tablets&gt;</pre>
</div>
</div>
<div class="paragraph">
<p>If there is at least one replica for each tablet that does not return a corruption
error, you can repair the bad copies by deleting them and forcing them to be
re-replicated from the leader using the
<code><a href="command_line_tools_reference.html#remote_replica-delete">remote_replica delete</a></code> tool.</p>
</div>
<div class="listingblock">
<div class="content">
<pre>sudo -u kudu kudu remote_replica delete &lt;tserver_address&gt; &lt;tablet_id&gt; "Cfile Corruption"</pre>
</div>
</div>
<div class="paragraph">
<p>If all of the replicas are corrupt, then some data loss has occurred.
Until <a href="https://issues.apache.org/jira/browse/KUDU-2526">KUDU-2526</a> is
completed, this can happen if the corrupt replica becomes the leader and the
existing follower replicas are replaced.</p>
</div>
<div class="paragraph">
<p>If data has been lost, you can repair the table by replacing the corrupt tablet
with an empty one using the
<code><a href="command_line_tools_reference.html#tablet-unsafe_replace_tablet">unsafe_replace_tablet</a></code> tool.</p>
</div>
<div class="listingblock">
<div class="content">
<pre>sudo -u kudu kudu tablet unsafe_replace_tablet &lt;master_addresses&gt; &lt;tablet_id&gt;</pre>
</div>
</div>
<div class="paragraph">
<p>From version 1.8.0 onwards, Kudu will mark the affected replicas as failed,
leading to their automatic re-replication elsewhere.</p>
</div>
</div>
<div class="sect2">
<h3 id="symbolizing_stack_traces"><a class="link" href="#symbolizing_stack_traces">Symbolizing stack traces</a></h3>
<div class="paragraph">
<p>Sometimes you might see the following in the logs:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>0323 03:59:31.091198 (+607857us) spinlock_profiling.cc:243] Waited 492 ms on lock 0x4cb0960. stack: 0000000002398852 0000000000ad8c69 0000000000aa62ba 000000000221aaa8 000000000221b1a8 00000000023a8f83 00007fa8b818be24 00007fa8b646a34c</pre>
</div>
</div>
<div class="paragraph">
<p>That&#8217;s usually a sign of high contention among threads to acquire a lock, and
in this case the reported time shows how long a thread spent on a CPU before
acquiring the lock. The call stack addresses listed help to restore the stack
trace of the waiting thread and pinpoint the problem in the code.</p>
</div>
<div class="paragraph">
<p>Given the binary that produced the output (in this example, it&#8217;s
<code>kudu-master</code>), it&#8217;s possible to translate the addresses into function names
and line numbers in the code. If the binary is stripped of symbols and debug
information, it&#8217;s still possible to do so if separate debug information for the
binary is available.</p>
</div>
<div class="paragraph">
<p>Assuming both the stripped release binary and the debug information are
available as RPMs, unpack them into a directory (e.g., <code>sysroot</code>):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ mkdir sysroot &amp;&amp; cd sysroot
$ rpm2cpio ../kudu-1.10.0.el7.x86_64.rpm | cpio -idmv
$ rpm2cpio ../kudu-debuginfo-1.10.0.el7.x86_64.rpm | cpio -idmv</pre>
</div>
</div>
<div class="paragraph">
<p>Use <code>addr2line</code> to find the line in the code for the stack address (if
the binary is not stripped of debug information, supply the actual binary with
the <code>-e</code> option instead of the debug info file):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>addr2line -C -f -e usr/lib/debug/usr/lib/kudu/sbin-release/kudu-master.debug 0x0000000000aa62ba
kudu::master::MasterServiceImpl::ConnectToMaster(kudu::master::ConnectToMasterRequestPB const*, kudu::master::ConnectToMasterResponsePB*, kudu::rpc::RpcContext*)
/usr/src/debug/kudu-1.10.0/src/kudu/master/master_service.cc:504</pre>
</div>
</div>
<div class="paragraph">
<p>To achieve the same with <code>gdb</code>, first find the address of the <code>.text</code> section
in the symbol file (in the example, <code>0000000000a2cdb0</code>):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>$ readelf -S usr/lib/debug/usr/lib/kudu/sbin-release/kudu-master.debug | grep .text
  [13] .text             NOBITS           0000000000a2cdb0  000002c0</pre>
</div>
</div>
<div class="paragraph">
<p>Then start up <code>gdb</code>, pointing it to the <code>kudu-master</code> executable (that&#8217;s the
executable that produced the output in the log file):</p>
</div>
<div class="listingblock">
<div class="content">
<pre>gdb usr/lib/kudu/sbin-release/kudu-master</pre>
</div>
</div>
<div class="paragraph">
<p>Now load the <code>.debug</code> symbols into <code>gdb</code> using the address found above, tell
<code>gdb</code> where to find source files, and set the sysroot:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>(gdb) add-symbol-file usr/lib/debug/usr/lib/kudu/sbin-release/kudu-master.debug 0x0000000000a2cdb0
(gdb) set substitute-path /usr/src/debug/kudu-1.10.0 usr/src/debug/kudu-1.10.0
(gdb) set sysroot .</pre>
</div>
</div>
<div class="paragraph">
<p>To translate the address into line number and function information, use
<code>info line * &lt;address&gt;</code>:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>(gdb) info line * 0x0000000000aa62ba
Line 504 of "/usr/src/debug/kudu-1.10.0/src/kudu/master/master_service.cc"
   starts at address 0xaa62af &lt;kudu::master::MasterServiceImpl::ConnectToMaster(kudu::master::ConnectToMasterRequestPB const*, kudu::master::ConnectToMasterResponsePB*, kudu::rpc::RpcContext*)+47&gt;
   and ends at 0xaa62bb &lt;kudu::master::MasterServiceImpl::ConnectToMaster(kudu::master::ConnectToMasterRequestPB const*, kudu::master::ConnectToMasterResponsePB*, kudu::rpc::RpcContext*)+59&gt;.</pre>
</div>
</div>
</div>
</div>
</div>
    </div>
    <div class="col-md-3">

  <div id="toc" data-spy="affix" data-offset-top="70">
  <ul>

      <li>

          <a href="index.html">Introducing Kudu</a> 
      </li> 
      <li>

          <a href="release_notes.html">Kudu Release Notes</a> 
      </li> 
      <li>

          <a href="quickstart.html">Quickstart Guide</a> 
      </li> 
      <li>

          <a href="installation.html">Installation Guide</a> 
      </li> 
      <li>

          <a href="configuration.html">Configuring Kudu</a> 
      </li> 
      <li>

          <a href="hive_metastore.html">Using the Hive Metastore with Kudu</a> 
      </li> 
      <li>

          <a href="kudu_impala_integration.html">Using Impala with Kudu</a> 
      </li> 
      <li>

          <a href="administration.html">Administering Kudu</a> 
      </li> 
      <li>
<span class="active-toc">Troubleshooting Kudu</span>
            <ul class="sectlevel1">
<li><a href="#_startup_errors">Startup Errors</a>
<ul class="sectlevel2">
<li><a href="#req_hole_punching">Errors During Hole Punching Test</a></li>
<li><a href="#disk_issues">Already present: FS layout already exists</a></li>
<li><a href="#ntp">NTP Clock Synchronization</a></li>
</ul>
</li>
<li><a href="#disk_space_usage">Disk Space Usage</a></li>
<li><a href="#crash_reporting">Reporting Kudu Crashes</a></li>
<li><a href="#_performance_troubleshooting">Performance Troubleshooting</a>
<ul class="sectlevel2">
<li><a href="#kudu_tracing">Kudu Tracing</a></li>
<li><a href="#_rpc_timeout_traces">RPC Timeout Traces</a></li>
<li><a href="#_kernel_stack_watchdog_traces">Kernel Stack Watchdog Traces</a></li>
<li><a href="#memory_limits">Memory Limits</a></li>
<li><a href="#block_cache_size">Block Cache Size</a></li>
<li><a href="#heap_sampling">Heap Sampling</a></li>
<li><a href="#slow_dns_nscd">Slow DNS Lookups and <code>nscd</code></a></li>
</ul>
</li>
<li><a href="#_issues_using_kudu">Issues using Kudu</a>
<ul class="sectlevel2">
<li><a href="#hive_handler">ClassNotFoundException: com.cloudera.kudu.hive.KuduStorageHandler</a></li>
<li><a href="#too_many_threads">Runtime error: Could not create thread: Resource temporarily unavailable (error 11)</a></li>
<li><a href="#tombstoned_or_stopped_tablets">Tombstoned or STOPPED tablet replicas</a></li>
<li><a href="#cfile_corruption">Corruption: checksum error on CFile block</a></li>
<li><a href="#symbolizing_stack_traces">Symbolizing stack traces</a></li>
</ul>
</li>
</ul> 
      </li> 
      <li>

          <a href="developing.html">Developing Applications with Kudu</a> 
      </li> 
      <li>

          <a href="schema_design.html">Kudu Schema Design</a> 
      </li> 
      <li>

          <a href="scaling_guide.html">Kudu Scaling Guide</a> 
      </li> 
      <li>

          <a href="security.html">Kudu Security</a> 
      </li> 
      <li>

          <a href="transaction_semantics.html">Kudu Transaction Semantics</a> 
      </li> 
      <li>

          <a href="background_tasks.html">Background Maintenance Tasks</a> 
      </li> 
      <li>

          <a href="configuration_reference.html">Kudu Configuration Reference</a> 
      </li> 
      <li>

          <a href="command_line_tools_reference.html">Kudu Command Line Tools Reference</a> 
      </li> 
      <li>

          <a href="metrics_reference.html">Kudu Metrics Reference</a> 
      </li> 
      <li>

          <a href="known_issues.html">Known Issues and Limitations</a> 
      </li> 
      <li>

          <a href="contributing.html">Contributing to Kudu</a> 
      </li> 
      <li>

          <a href="export_control.html">Export Control Notice</a> 
      </li> 
  </ul>
  </div>
    </div>
  </div>
</div>
      <footer class="footer">
        <div class="row">
          <div class="col-md-9">
            <p class="small">
            Copyright &copy; 2020 The Apache Software Foundation.  Last updated 2022-04-11 16:50:08 +0200 
            </p>
            <p class="small">
            Apache Kudu, Kudu, Apache, the Apache feather logo, and the Apache Kudu
            project logo are either registered trademarks or trademarks of The
            Apache Software Foundation in the United States and other countries.
            </p>
          </div>
          <div class="col-md-3">
            <a class="pull-right" href="https://www.apache.org/events/current-event.html">
                <img src="https://www.apache.org/events/current-event-234x60.png" alt="Current Apache Software Foundation event"/>
            </a>
          </div>
        </div>
      </footer>
    </div>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
    <script>
      // Once the DOM is ready, tag the root element with a "touch" or "no-touch"
      // class. This is only a heuristic: many laptops have touch screens too.
      $(document).ready(function() {
        var rootElement = document.documentElement;
        var touchClass = ("ontouchstart" in rootElement) ? "touch" : "no-touch";
        $(rootElement).addClass(touchClass);
      });
    </script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"
            integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS"
            crossorigin="anonymous"></script>
    <script>
      // Google Analytics (analytics.js) loader. Installs a window.ga() stub that
      // queues calls in ga.q and records the load time in ga.l, then inserts an
      // async script element for analytics.js before the first script on the page.
      // The script is requested over an explicit https:// URL rather than a
      // protocol-relative one, so it never falls back to plain HTTP (or file://).
      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
      })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

      // Queue tracker creation for this property, then a pageview hit.
      ga('create', 'UA-68448017-1', 'auto');
      ga('send', 'pageview');
    </script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/3.1.0/anchor.js"></script>
    <script>
      // Configure AnchorJS: place the anchor link to the right of each heading
      // and use the 'touch' visibility mode.
      anchors.options = { placement: 'right', visible: 'touch' };
      // Attach the anchor links using the options set above (no selector is passed).
      anchors.add();
    </script>
  </body>
</html>

