blob: a4bb5315b8dbc6c038a16c2a929c747c34950680 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="The Apache Cassandra database is the right choice when you need scalability and high availability without compromising performance. Linear scalability and proven fault-tolerance on commodity hardware or cloud infrastructure make it the perfect platform for mission-critical data. Cassandra's support for replicating across multiple datacenters is best-in-class, providing lower latency for your users and the peace of mind of knowing that you can survive regional outages.
">
<meta name="keywords" content="cassandra, apache, apache cassandra, distributed storage, key value store, scalability, bigtable, dynamo" />
<meta name="robots" content="index,follow" />
<meta name="language" content="en" />
<title>Documentation</title>
<link rel="canonical" href="http://cassandra.apache.org/doc/latest/troubleshooting/finding_nodes.html">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
<link rel="stylesheet" href="./../../../css/style.css">
<link rel="stylesheet" href="./../../../css/sphinx.css">
<link rel="top" title="Apache Cassandra Documentation v4.0-alpha3" href="../index.html"/> <link rel="up" title="Troubleshooting" href="index.html"/> <link rel="next" title="Cassandra Logs" href="reading_logs.html"/> <link rel="prev" title="Troubleshooting" href="index.html"/>
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.2.0/css/all.css" integrity="sha384-hWVjflwFxL6sNzntih27bfxkr27PmbbK/iSvJ+a4+0owXq79v+lsFkW54bOGbiDQ" crossorigin="anonymous">
<link type="application/atom+xml" rel="alternate" href="http://cassandra.apache.org/feed.xml" title="Apache Cassandra Website" />
</head>
<body>
<!-- breadcrumbs -->
<div class="topnav">
<div class="container breadcrumb-container">
<ul class="breadcrumb">
<li>
<div class="dropdown">
<img class="asf-logo" src="./../../../img/asf_feather.png" alt="" />
<a id="dLabel" data-toggle="dropdown" href="#">Apache Software Foundation <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu" aria-labelledby="dLabel">
<li><a href="http://www.apache.org">Apache Homepage</a></li>
<li><a href="http://www.apache.org/licenses/">License</a></li>
<li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
<li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
<li><a href="http://www.apache.org/security/">Security</a></li>
</ul>
</div>
</li>
<li><a href="./../../../">Apache Cassandra</a></li>
<li><a href="./../../../doc">Documentation</a></li>
<li><a href="./">Troubleshooting</a></li>
<li>Find The Misbehaving Nodes</li>
</ul>
</div>
<!-- navbar -->
<nav class="navbar navbar-default navbar-static-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#cassandra-menu" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="./../../../"><img src="./../../../img/cassandra_logo.png" alt="Apache Cassandra logo" /></a>
</div><!-- /.navbar-header -->
<div id="cassandra-menu" class="collapse navbar-collapse">
<ul class="nav navbar-nav navbar-right">
<li><a href="./../../../">Home</a></li>
<li><a href="./../../../download/">Download</a></li>
<li><a href="./../../../doc/">Documentation</a></li>
<li><a href="./../../../community/">Community</a></li>
<li>
<a href="./../../../blog/">Blog</a>
</li>
</ul>
</div><!-- /#cassandra-menu -->
</div>
</nav><!-- /.navbar -->
</div><!-- /.topnav -->
<div class="container-fluid">
<div class="row">
<div class="col-md-3">
<div class="doc-navigation">
<div class="doc-menu" role="navigation">
<div class="navbar-header">
<button type="button" class="pull-left navbar-toggle" data-toggle="collapse" data-target=".sidebar-navbar-collapse">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
</div>
<div class="navbar-collapse collapse sidebar-navbar-collapse">
<form id="doc-search-form" class="navbar-form" action="../search.html" method="get" role="search">
<div class="form-group">
<input type="text" size="30" class="form-control input-sm" name="q" placeholder="Search docs">
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</div>
</form>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../getting_started/index.html">Getting Started</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/index.html">Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../data_modeling/index.html">Data Modeling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cql/index.html">The Cassandra Query Language (CQL)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../configuration/index.html">Configuring Cassandra</a></li>
<li class="toctree-l1"><a class="reference internal" href="../operating/index.html">Operating Cassandra</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tools/index.html">Cassandra Tools</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Troubleshooting</a><ul class="current">
<li class="toctree-l2 current"><a class="current reference internal" href="#">Find The Misbehaving Nodes</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#client-logs-and-errors">Client Logs and Errors</a></li>
<li class="toctree-l3"><a class="reference internal" href="#metrics">Metrics</a></li>
<li class="toctree-l3"><a class="reference internal" href="#next-step-investigate-the-node-s">Next Step: Investigate the Node(s)</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="reading_logs.html">Cassandra Logs</a></li>
<li class="toctree-l2"><a class="reference internal" href="use_nodetool.html">Use Nodetool</a></li>
<li class="toctree-l2"><a class="reference internal" href="use_tools.html">Diving Deep, Use External Tools</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../development/index.html">Contributing to Cassandra</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq/index.html">Frequently Asked Questions</a></li>
<li class="toctree-l1"><a class="reference internal" href="../plugins/index.html">Third-Party Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="../bugs.html">Reporting Bugs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contactus.html">Contact us</a></li>
</ul>
</div><!--/.nav-collapse -->
</div>
</div>
</div>
<div class="col-md-8">
<div class="content doc-content">
<div class="content-container">
<div class="section" id="find-the-misbehaving-nodes">
<h1>Find The Misbehaving Nodes<a class="headerlink" href="#find-the-misbehaving-nodes" title="Permalink to this headline"></a></h1>
<p>The first step to troubleshooting a Cassandra issue is to use error messages,
metrics and monitoring information to identify if the issue lies with the
clients or the server and if it does lie with the server find the problematic
nodes in the Cassandra cluster. The goal is to determine if this is a systemic
issue (e.g. a query pattern that affects the entire cluster) or isolated to a
subset of nodes (e.g. neighbors holding a shared token range or even a single
node with bad hardware).</p>
<p>There are many sources of information that help determine where the problem
lies. Some of the most common are mentioned below.</p>
<div class="section" id="client-logs-and-errors">
<h2>Client Logs and Errors<a class="headerlink" href="#client-logs-and-errors" title="Permalink to this headline"></a></h2>
<p>Clients of the cluster often leave the best breadcrumbs to follow. Perhaps
client latencies or error rates have increased in a particular datacenter
(likely eliminating other datacenters’ nodes), or clients are receiving a
particular kind of error code indicating a particular kind of problem.
Troubleshooters can often rule out many failure modes just by reading the error
messages. In fact, many Cassandra error messages include the last coordinator
contacted to help operators find nodes to start with.</p>
<p>Some common errors (likely culprit in parentheses) assuming the client has
similar error names as the Datastax <a class="reference internal" href="../getting_started/drivers.html#client-drivers"><span class="std std-ref">drivers</span></a>:</p>
<ul class="simple">
<li><code class="docutils literal notranslate"><span class="pre">SyntaxError</span></code> (<strong>client</strong>). This and other <code class="docutils literal notranslate"><span class="pre">QueryValidationException</span></code>
indicate that the client sent a malformed request. These are rarely server
issues and usually indicate bad queries.</li>
<li><code class="docutils literal notranslate"><span class="pre">UnavailableException</span></code> (<strong>server</strong>): This means that the Cassandra
coordinator node has rejected the query as it believes that insufficient
replica nodes are available. If many coordinators are throwing this error it
likely means that there really are (typically) multiple nodes down in the
cluster and you can identify them using <a class="reference internal" href="use_nodetool.html#nodetool-status"><span class="std std-ref">nodetool status</span></a>. If only a single coordinator is throwing this error it may
mean that node has been partitioned from the rest.</li>
<li><code class="docutils literal notranslate"><span class="pre">OperationTimedOutException</span></code> (<strong>server</strong>): This is the most frequent
timeout message raised when clients set timeouts and means that the query
took longer than the supplied timeout. This is a <em>client side</em> timeout
meaning that it took longer than the client specified timeout. The error
message will include the coordinator node that was last tried which is
usually a good starting point. This error usually indicates either
aggressive client timeout values or latent server coordinators/replicas.</li>
<li><code class="docutils literal notranslate"><span class="pre">ReadTimeoutException</span></code> or <code class="docutils literal notranslate"><span class="pre">WriteTimeoutException</span></code> (<strong>server</strong>): These
are raised when clients do not specify lower timeouts and there is a
<em>coordinator</em> timeout based on the values supplied in the <code class="docutils literal notranslate"><span class="pre">cassandra.yaml</span></code>
configuration file. They usually indicate a serious server side problem as
the default values are usually multiple seconds.</li>
</ul>
</div>
<div class="section" id="metrics">
<h2>Metrics<a class="headerlink" href="#metrics" title="Permalink to this headline"></a></h2>
<p>If you have Cassandra <a class="reference internal" href="../operating/metrics.html#monitoring-metrics"><span class="std std-ref">metrics</span></a> reporting to a
centralized location such as <a class="reference external" href="https://graphiteapp.org/">Graphite</a> or
<a class="reference external" href="https://grafana.com/">Grafana</a> you can typically use those to narrow down
the problem. At this stage narrowing down the issue to a particular
datacenter, rack, or even group of nodes is the main goal. Some helpful metrics
to look at are:</p>
<div class="section" id="errors">
<h3>Errors<a class="headerlink" href="#errors" title="Permalink to this headline"></a></h3>
<p>Cassandra refers to internode messaging errors as “drops”, and provides a
number of <a class="reference internal" href="../operating/metrics.html#dropped-metrics"><span class="std std-ref">Dropped Message Metrics</span></a> to help narrow
down errors. If particular nodes are dropping messages actively, they are
likely related to the issue.</p>
</div>
<div class="section" id="latency">
<h3>Latency<a class="headerlink" href="#latency" title="Permalink to this headline"></a></h3>
<p>For timeouts or latency related issues you can start with <a class="reference internal" href="../operating/metrics.html#table-metrics"><span class="std std-ref">Table
Metrics</span></a> by comparing Coordinator level metrics e.g.
<code class="docutils literal notranslate"><span class="pre">CoordinatorReadLatency</span></code> or <code class="docutils literal notranslate"><span class="pre">CoordinatorWriteLatency</span></code> with their associated
replica metrics e.g. <code class="docutils literal notranslate"><span class="pre">ReadLatency</span></code> or <code class="docutils literal notranslate"><span class="pre">WriteLatency</span></code>. Issues usually show
up on the <code class="docutils literal notranslate"><span class="pre">99th</span></code> percentile before they show up on the <code class="docutils literal notranslate"><span class="pre">50th</span></code> percentile or
the <code class="docutils literal notranslate"><span class="pre">mean</span></code>. While <code class="docutils literal notranslate"><span class="pre">maximum</span></code> coordinator latencies are not typically very
helpful due to the exponentially decaying reservoir used internally to produce
metrics, <code class="docutils literal notranslate"><span class="pre">maximum</span></code> replica latencies that correlate with increased <code class="docutils literal notranslate"><span class="pre">99th</span></code>
percentiles on coordinators can help narrow down the problem.</p>
<p>There are usually three main possibilities:</p>
<ol class="arabic simple">
<li>Coordinator latencies are high on all nodes, but only a few nodes’ local
read latencies are high. This points to slow replica nodes and the
coordinators’ latencies are just side-effects. This usually happens when clients are
not token aware.</li>
<li>Coordinator latencies and replica latencies increase at the
same time on a few nodes. If clients are token aware this is almost
always what happens and points to slow replicas of a subset of token
ranges (only part of the ring).</li>
<li>Coordinator and local latencies are high on many nodes. This usually
indicates either a tipping point in the cluster capacity (too many writes or
reads per second), or a new query pattern.</li>
</ol>
<p>It’s important to remember that depending on the client’s load balancing
behavior and consistency levels coordinator and replica metrics may or may
not correlate. In particular if you use <code class="docutils literal notranslate"><span class="pre">TokenAware</span></code> policies the same
node’s coordinator and replica latencies will often increase together, but if
you just use normal <code class="docutils literal notranslate"><span class="pre">DCAwareRoundRobin</span></code> coordinator latencies can increase
with unrelated replica nodes’ latencies. For example:</p>
<ul class="simple">
<li><code class="docutils literal notranslate"><span class="pre">TokenAware</span></code> + <code class="docutils literal notranslate"><span class="pre">LOCAL_ONE</span></code>: should always have coordinator and replica
latencies on the same node rise together</li>
<li><code class="docutils literal notranslate"><span class="pre">TokenAware</span></code> + <code class="docutils literal notranslate"><span class="pre">LOCAL_QUORUM</span></code>: should always have coordinator and
multiple replica latencies rise together in the same datacenter.</li>
<li><code class="docutils literal notranslate"><span class="pre">TokenAware</span></code> + <code class="docutils literal notranslate"><span class="pre">QUORUM</span></code>: replica latencies in other datacenters can
affect coordinator latencies.</li>
<li><code class="docutils literal notranslate"><span class="pre">DCAwareRoundRobin</span></code> + <code class="docutils literal notranslate"><span class="pre">LOCAL_ONE</span></code>: coordinator latencies and unrelated
replica nodes’ latencies will rise together.</li>
<li><code class="docutils literal notranslate"><span class="pre">DCAwareRoundRobin</span></code> + <code class="docutils literal notranslate"><span class="pre">LOCAL_QUORUM</span></code>: different coordinator and replica
latencies will rise together with little correlation.</li>
</ul>
</div>
<div class="section" id="query-rates">
<h3>Query Rates<a class="headerlink" href="#query-rates" title="Permalink to this headline"></a></h3>
<p>Sometimes the <a class="reference internal" href="../operating/metrics.html#table-metrics"><span class="std std-ref">Table</span></a> query rate metrics can help
narrow down load issues as a “small” increase in coordinator queries per second
(QPS) may correlate with a very large increase in replica level QPS. This most
often happens with <code class="docutils literal notranslate"><span class="pre">BATCH</span></code> writes, where a client may send a single <code class="docutils literal notranslate"><span class="pre">BATCH</span></code>
query that might contain 50 statements in it, which if you have 9 copies (RF=3,
three datacenters) means that every coordinator <code class="docutils literal notranslate"><span class="pre">BATCH</span></code> write turns into 450
replica writes! This is why keeping <code class="docutils literal notranslate"><span class="pre">BATCH</span></code>’s to the same partition is so
critical, otherwise you can exhaust significant CPU capacity with a “single”
query.</p>
</div>
</div>
<div class="section" id="next-step-investigate-the-node-s">
<h2>Next Step: Investigate the Node(s)<a class="headerlink" href="#next-step-investigate-the-node-s" title="Permalink to this headline"></a></h2>
<p>Once you have narrowed down the problem as much as possible (datacenter, rack,
node), log in to one of the nodes using SSH and proceed to debug using
<a class="reference internal" href="reading_logs.html#reading-logs"><span class="std std-ref">logs</span></a>, <a class="reference internal" href="use_nodetool.html#use-nodetool"><span class="std std-ref">nodetool</span></a>, and
<a class="reference internal" href="use_tools.html#use-os-tools"><span class="std std-ref">os tools</span></a>. If you are not able to log in you may still
have access to <a class="reference internal" href="reading_logs.html#reading-logs"><span class="std std-ref">logs</span></a> and <a class="reference internal" href="use_nodetool.html#use-nodetool"><span class="std std-ref">nodetool</span></a>
remotely.</p>
</div>
</div>
<div class="doc-prev-next-links" role="navigation" aria-label="footer navigation">
<a href="reading_logs.html" class="btn btn-default pull-right " role="button" title="Cassandra Logs" accesskey="n">Next <span class="glyphicon glyphicon-circle-arrow-right" aria-hidden="true"></span></a>
<a href="index.html" class="btn btn-default" role="button" title="Troubleshooting" accesskey="p"><span class="glyphicon glyphicon-circle-arrow-left" aria-hidden="true"></span> Previous</a>
</div>
</div>
</div>
</div>
</div>
</div>
<hr />
<footer>
<div class="container">
<div class="col-md-4 social-blk">
<span class="social">
<!-- Twitter "follow" and "hashtag" buttons. The loader script below is guarded
     by the "twitter-wjs" element id, so a single copy of it is sufficient for
     both anchors; a second copy would find the id already present and do nothing. -->
<a href="https://twitter.com/cassandra"
class="twitter-follow-button"
data-show-count="false" data-size="large">Follow @cassandra</a>
<a href="https://twitter.com/intent/tweet?button_hashtag=cassandra"
class="twitter-hashtag-button"
data-size="large"
data-related="ApacheCassandra">Tweet #cassandra</a>
<script>
// Inject platform.twitter.com/widgets.js ahead of the first <script> element in
// the document, unless an element with the given id already exists.
// The script is always fetched over https, even when this page is served over
// plain http, so the third-party code cannot be tampered with in transit.
!function (d, s, id) {
  var js, fjs = d.getElementsByTagName(s)[0];
  if (!d.getElementById(id)) {
    js = d.createElement(s);
    js.id = id;
    js.src = 'https://platform.twitter.com/widgets.js';
    fjs.parentNode.insertBefore(js, fjs);
  }
}(document, 'script', 'twitter-wjs');
</script>
</span>
<a class="subscribe-rss icon-link" href="/feed.xml" title="Subscribe to Blog via RSS">
<span><i class="fa fa-rss"></i></span>
</a>
</div>
<div class="col-md-8 trademark">
<p>&copy; 2016 <a href="http://apache.org">The Apache Software Foundation</a>.
Apache, the Apache feather logo, and Apache Cassandra are trademarks of The Apache Software Foundation.
</p>
</div>
</div><!-- /.container -->
</footer>
<!-- Javascript. Placed here so pages load faster -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<script src="./../../../js/underscore-min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
<script src="./../../../js/doctools.js"></script>
<script src="./../../../js/searchtools.js"></script>
<!-- Global options object; presumably consumed by the Sphinx doctools.js and
     searchtools.js helpers loaded on this page (confirm against those scripts). -->
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
  URL_ROOT: "",
  VERSION: "",
  COLLAPSE_INDEX: false,
  FILE_SUFFIX: ".html",
  HAS_SOURCE: false,
  SOURCELINK_SUFFIX: ".txt"
};
</script>
<!-- Google Analytics (legacy ga.js) page-view tracking. -->
<script type="text/javascript">
// Choose the ga.js host that matches the protocol of the current page and emit
// a <script> tag for it into the document being parsed.
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
// This has to live in its own script element: markup produced by
// document.write() is only parsed once the writing script has finished, so
// _gat is not defined until the previous element has completed and ga.js has run.
// Failures (e.g. ga.js blocked or unreachable) are deliberately ignored so that
// analytics can never break the page.
try {
var pageTracker = _gat._getTracker("UA-11583863-1");
pageTracker._trackPageview();
} catch(err) {}
</script>
</body>
</html>