blob: b962e8f9a3e4035a6f57bc90f2407314f6c4b0a2 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta name="description" content="A new open source Apache Hadoop ecosystem project, Apache Kudu completes Hadoop's storage layer to enable fast analytics on fast data" />
<meta name="author" content="Cloudera" />
<title>Apache Kudu - Kudu FAQ</title>
<!-- Bootstrap core CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css"
integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7"
crossorigin="anonymous">
<!-- Custom styles for this template -->
<link href="/css/kudu.css" rel="stylesheet"/>
<link href="/css/asciidoc.css" rel="stylesheet"/>
<link rel="shortcut icon" href="/img/logo-favicon.ico" />
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.1/css/font-awesome.min.css" />
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<div class="kudu-site container-fluid">
<!-- Static navbar -->
<nav class="navbar navbar-default">
<div class="container-fluid">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="logo" href="/"><img
src="//d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_80px.png"
srcset="//d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_80px.png 1x, //d3dr9sfxru4sde.cloudfront.net/i/k/apachekudu_logo_0716_160px.png 2x"
alt="Apache Kudu"/></a>
</div>
<div id="navbar" class="collapse navbar-collapse">
<ul class="nav navbar-nav navbar-right">
<li >
<a href="/">Home</a>
</li>
<li >
<a href="/overview.html">Overview</a>
</li>
<li >
<a href="/docs/">Documentation</a>
</li>
<li >
<a href="/releases/">Releases</a>
</li>
<li >
<a href="/blog/">Blog</a>
</li>
<!-- NOTE: this dropdown menu does not appear on Mobile, so don't add anything here
that doesn't also appear elsewhere on the site. -->
<li class="dropdown">
<a href="/community.html" role="button" aria-haspopup="true" aria-expanded="false">Community <span class="caret"></span></a>
<ul class="dropdown-menu">
<li class="dropdown-header">GET IN TOUCH</li>
<li><a class="icon email" href="/community.html">Mailing Lists</a></li>
<li><a class="icon slack" href="https://getkudu-slack.herokuapp.com/">Slack Channel</a></li>
<li role="separator" class="divider"></li>
<li><a href="/community.html#meetups-user-groups-and-conference-presentations">Events and Meetups</a></li>
<li><a href="/committers.html">Project Committers</a></li>
<!--<li><a href="/roadmap.html">Roadmap</a></li>-->
<li><a href="/community.html#contributions">How to Contribute</a></li>
<li role="separator" class="divider"></li>
<li class="dropdown-header">DEVELOPER RESOURCES</li>
<li><a class="icon github" href="https://github.com/apache/incubator-kudu">GitHub</a></li>
<li><a class="icon gerrit" href="http://gerrit.cloudera.org:8080/#/q/status:open+project:kudu">Gerrit Code Review</a></li>
<li><a class="icon jira" href="https://issues.apache.org/jira/browse/KUDU">JIRA Issue Tracker</a></li>
<li role="separator" class="divider"></li>
<li class="dropdown-header">SOCIAL MEDIA</li>
<li><a class="icon twitter" href="https://twitter.com/ApacheKudu">Twitter</a></li>
<li><a href="https://www.reddit.com/r/kudu/">Reddit</a></li>
<li role="separator" class="divider"></li>
<li class="dropdown-header">APACHE SOFTWARE FOUNDATION</li>
<li><a href="https://www.apache.org/security/" target="_blank">Security</a></li>
<li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Sponsorship</a></li>
<li><a href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a></li>
<li><a href="https://www.apache.org/licenses/" target="_blank">License</a></li>
</ul>
</li>
<li class="active">
<a href="/faq.html">FAQ</a>
</li>
</ul><!-- /.nav -->
</div><!-- /#navbar -->
</div><!-- /.container-fluid -->
</nav>
<div class="row-fluid">
<div class="col-lg-12 faq">
<h2 class="no_toc" id="frequently-asked-questions">Frequently Asked Questions</h2>
<ul id="markdown-toc">
<li><a href="#project-motivation" id="markdown-toc-project-motivation">Project Motivation</a></li>
<li><a href="#project-status" id="markdown-toc-project-status">Project Status</a></li>
<li><a href="#getting-started" id="markdown-toc-getting-started">Getting Started</a></li>
<li><a href="#storage-details" id="markdown-toc-storage-details">Storage Details</a></li>
<li><a href="#consistency-and-cap-theorem" id="markdown-toc-consistency-and-cap-theorem">Consistency and CAP Theorem</a></li>
<li><a href="#working-with-other-storage-systems" id="markdown-toc-working-with-other-storage-systems">Working With Other Storage Systems</a></li>
<li><a href="#hardware-and-operations" id="markdown-toc-hardware-and-operations">Hardware and Operations</a></li>
<li><a href="#security" id="markdown-toc-security">Security</a></li>
<li><a href="#schema-design" id="markdown-toc-schema-design">Schema Design</a></li>
<li><a href="#indexes" id="markdown-toc-indexes">Indexes</a></li>
<li><a href="#transactions" id="markdown-toc-transactions">Transactions</a></li>
</ul>
<h3 id="project-motivation">Project Motivation</h3>
<h4 id="why-use-column-storage-format-would-a-row-wise-format-increase-performance">Why use column storage format? Would a row-wise format increase performance?</h4>
<p>Analytic use-cases almost exclusively use a subset of the columns in the queried
table and generally aggregate values over a broad range of rows. This access pattern
is greatly accelerated by column oriented data. Operational use-cases are more
likely to access most or all of the columns in a row, and might be more appropriately
served by row oriented storage. A column oriented storage format was chosen for
Kudu because it’s primarily targeted at analytic use-cases.</p>
<p>There’s nothing that precludes Kudu from providing a row-oriented option, and it
could be included in a future release.</p>
<h4 id="why-build-a-new-storage-engine-why-not-just-improve-apache-hbase-to-increase-its-scan-speed">Why build a new storage engine? Why not just improve Apache HBase to increase its scan speed?</h4>
<p>Kudu shares some characteristics with HBase. Like HBase, it is a real-time store
that supports key-indexed record lookup and mutation.</p>
<p>However, Kudu’s design differs from HBase in some fundamental ways:</p>
<ul>
<li>Kudu’s data model is more traditionally relational, while HBase is schemaless.</li>
<li>Kudu’s on-disk representation is truly columnar and follows an entirely different
storage design than HBase/BigTable.</li>
</ul>
<p>Making these fundamental changes in HBase would require a massive redesign, as opposed
to a series of simple changes. HBase is the right design for many classes of
applications and use cases and will continue to be the best storage engine for those
workloads.</p>
<h3 id="project-status">Project Status</h3>
<h4 id="is-apache-kudu-ready-to-be-deployed-into-production-yet">Is Apache Kudu ready to be deployed into production yet?</h4>
<p>Yes! Kudu has been battle tested in production at many major corporations.</p>
<h4 id="is-kudu-open-source">Is Kudu open source?</h4>
<p>Yes, Kudu is open source and licensed under the Apache Software License, version 2.0.
Apache Kudu is a top level project (TLP) under the umbrella of the Apache Software Foundation.</p>
<h4 id="why-was-kudu-developed-internally-at-cloudera-before-its-release">Why was Kudu developed internally at Cloudera before its release?</h4>
<p>We believe strongly in the value of open source for the long-term sustainable
development of a project. We also believe that it is easier to work with a small
group of colocated developers when a project is very young. Being in the same
organization allowed us to move quickly during the initial design and development
of the system.</p>
<p>Now that Kudu is public and is part of the Apache Software Foundation, we look
forward to working with a larger community during its next phase of development.</p>
<h3 id="getting-started">Getting Started</h3>
<h4 id="is-training-available">Is training available?</h4>
<p>Training is not provided by the Apache Software Foundation, but may be provided
by third-party vendors.</p>
<p>As of January 2016, Cloudera offers an
<a href="https://university.cloudera.com/content/cloudera-university-ondemand-introduction-to-apache-kudu">on-demand training course</a>
entitled “Introduction to Apache Kudu”.
This training covers what Kudu is, and how it compares to other Hadoop-related
storage systems, use cases that will benefit from using Kudu, and how to create,
store, and access data in Kudu tables with Apache Impala.</p>
<p>Aside from training, you can also get help with using Kudu through
<a href="docs/index.html">documentation</a>,
the <a href="community.html">mailing lists</a>,
and the <a href="https://getkudu-slack.herokuapp.com/">Kudu chat room</a>.</p>
<h4 id="is-there-a-quickstart">Is there a quickstart?</h4>
<p>Yes. Instructions on getting up and running on Kudu via a Docker based quickstart are provided in Kudu’s
<a href="http://kudu.apache.org/docs/quickstart.html">quickstart guide</a>.</p>
<h3 id="storage-details">Storage Details</h3>
<h4 id="how-does-kudu-store-its-data-is-the-underlying-data-storage-readable-without-going-through-kudu">How does Kudu store its data? Is the underlying data storage readable without going through Kudu?</h4>
<p>Kudu accesses storage devices through the local filesystem, and works best with Ext4 or
XFS. Kudu handles striping across <abbr title="just a bunch of disks">JBOD</abbr> mount
points, and does not require <abbr title="redundant array of inexpensive disks">RAID</abbr>.
Kudu’s write-ahead logs (WALs) can be stored on separate locations from the data files,
which means that WALs can be stored on <abbr title="solid state drives">SSDs</abbr> to
enable lower-latency writes on systems with both SSDs and magnetic disks.</p>
<p>Kudu’s on-disk data format closely resembles Parquet, with a few differences to
support efficient random access as well as updates. The underlying data is not
directly queryable without using the Kudu client APIs. The Kudu developers have worked hard
to ensure that Kudu’s scans are performant, and have focused on storing data
efficiently without making the trade-offs that would be required to allow direct access
to the data files.</p>
<h4 id="is-kudu-an-in-memory-database">Is Kudu an in-memory database?</h4>
<p>Kudu is not an
<a href="https://en.wikipedia.org/wiki/In-memory_database">in-memory database</a>
since it primarily relies on disk storage. This should not be confused with Kudu’s
experimental use of
<a href="https://en.wikipedia.org/wiki/Non-volatile_memory">persistent memory</a>
which is integrated in the block cache. In the future, this integration will
allow the cache to survive tablet server restarts, so that it never starts “cold”.</p>
<p>In addition, Kudu’s C++ implementation can scale to very large heaps. Coupled
with its CPU-efficient design, Kudu’s heap scalability offers outstanding
performance for data sets that fit in memory.</p>
<h4 id="does-kudu-run-its-own-format-type-or-does-it-use-parquet-what-is-the-compression-recommendation">Does Kudu run its own format type or does it use Parquet? What is the compression recommendation?</h4>
<p>Kudu’s on-disk data format closely resembles Parquet, with a few differences to
support efficient random access as well as updates. The underlying data is not
directly queryable without using the Kudu client APIs. The Kudu developers have worked
hard to ensure that Kudu’s scans are performant, and have focused on
storing data efficiently without making the trade-offs that would be required to
allow direct access to the data files.</p>
<p>The recommended compression codec is dependent on the appropriate trade-off
between cpu utilization and storage efficiency and is therefore use-case dependent.</p>
<h4 id="should-compactions-be-managed">Should compactions be managed?</h4>
<p>Compactions in Kudu are designed to be small and to always be running in the
background. They operate under a (configurable) budget to prevent tablet servers
from unexpectedly attempting to rewrite tens of GB of data at a time. Since compactions
are so predictable, the only tuning knob available is the number of threads dedicated
to flushes and compactions in the <em>maintenance manager</em>.</p>
<h4 id="what-is-the-compaction-performance-like">What is the compaction performance like?</h4>
<p>Kudu runs a background compaction process that incrementally and constantly
compacts data. Constant small compactions provide predictable latency by avoiding
major compaction operations that could monopolize CPU and IO resources.</p>
<h4 id="is-there-a-time-to-live-property-as-in-hbase-to-delete-a-record-automatically">Is there a time-to-live property as in HBase to delete a record automatically?</h4>
<p>No, Kudu does not currently support such a feature.</p>
<h4 id="do-the-tablet-servers-require-a-linux-filesystem-or-control-the-storage-devices-directly">Do the tablet servers require a Linux filesystem or control the storage devices directly?</h4>
<p>The tablet servers store data on the Linux filesystem. We recommend ext4 or xfs
mount points for the storage directories. Typically, a Kudu tablet server will
share the same partitions as existing HDFS datanodes.</p>
<h4 id="are-there-chances-of-region-server-hotspotting-like-with-hbase-and-how-does-kudu-mitigate-this">Are there chances of region server hotspotting like with HBase and how does Kudu mitigate this?</h4>
<p>Hotspotting in HBase is an attribute inherited from the distribution strategy used.</p>
<p>By default, HBase uses range based distribution. Range based partitioning stores
ordered values that fit within a specified range of a provided key contiguously
on disk. Range based partitioning is efficient when there are large numbers of
concurrent small queries, as only servers in the cluster that have values within
the range specified by the query will be recruited to process that query. Range
partitioning is susceptible to hotspots, either because the key(s) used to
specify the range exhibits “data skew” (the number of rows within each range
is not uniform), or some data is queried more frequently creating “workload
skew”.</p>
<p>In contrast, hash based distribution specifies a certain number of “buckets”
and distribution keys are passed to a hash function that produces the value of
the bucket that the row is assigned to. If the distribution key is chosen
carefully (a unique key with no business meaning is ideal) hash distribution
will result in each server in the cluster having a uniform number of rows. Hash
based distribution protects against both data skew and workload skew.
Additionally, it provides the highest possible throughput for any individual
query because all servers are recruited in parallel as data will be evenly
spread across every server in the cluster. However, optimizing for throughput by
recruiting every server in the cluster for every query compromises the
maximum concurrency that the cluster can achieve. HBase can use hash based
distribution by “salting” the row key.</p>
<p>Kudu supports both approaches, giving you the ability to choose to emphasize
concurrency at the expense of potential data and workload skew with range
partitioning, or query throughput at the expense of concurrency through hash
partitioning.</p>
<h4 id="does-kudu-support-dynamic-partitioning">Does Kudu support dynamic partitioning?</h4>
<p>Kudu is a storage engine, not a SQL engine. Dynamic partitions are created at
execution time rather than at query time, but in either case the process will
look the same from Kudu’s perspective: the query engine will pass down
partition keys to Kudu.</p>
<h3 id="consistency-and-cap-theorem">Consistency and CAP Theorem</h3>
<h4 id="what-is-kudus-consistency-model-is-kudu-a-cp-or-ap-system">What is Kudu’s consistency model? Is Kudu a CP or AP system?</h4>
<p>In the parlance of the CAP theorem, Kudu is a
<abbr title="consistent (but not available) under network partitions">CP</abbr>
type of storage engine. Writing to a tablet will be delayed if the server that hosts that
tablet’s leader replica fails until a quorum of servers is able to elect a new leader and
acknowledge a given write request.</p>
<p>Kudu gains the following properties by using Raft consensus:</p>
<ul>
<li>Leader elections are fast. As soon as the leader misses 3 heartbeats (half a second each), the
remaining followers will elect a new leader which will start accepting operations right away.
This whole process usually takes less than 10 seconds.</li>
<li>Follower replicas don’t allow writes, but they do allow reads when fully up-to-date data is not
required. Thus, queries against historical data (even just a few minutes old) can be
sent to any of the replicas. If that replica fails, the query can be sent to another
replica immediately.</li>
</ul>
<p>In current releases, some of these properties are not fully implemented and
may suffer from some deficiencies. See the answer to
<a href="#is-kudus-consistency-level-tunable">Is Kudu’s consistency level tunable?</a>
for more information.</p>
<h4 id="is-kudus-consistency-level-tunable">Is Kudu’s consistency level tunable?</h4>
<p>Yes, Kudu’s consistency level is partially tunable, both for writes and reads (scans):</p>
<ul>
<li>Writes to a single tablet are always internally consistent. When writing to multiple tablets,
with multiple clients, the user has a choice between no consistency (the default) and
enforcing “external consistency” in two different ways: one that optimizes for latency
but requires the user to perform additional work, and another that requires no additional
work but can result in some additional latency.</li>
<li>Scans have “Read Committed” consistency by default. If the user requires strict-serializable
scans, they can choose the <code class="language-plaintext highlighter-rouge">READ_AT_SNAPSHOT</code> mode and, optionally, provide a timestamp. The default
option is non-blocking but the <code class="language-plaintext highlighter-rouge">READ_AT_SNAPSHOT</code> option may block when reading from non-leader
replicas.</li>
</ul>
<p>Kudu’s transactional semantics are a work in progress; see
<a href="docs/transaction_semantics.html">Kudu Transaction Semantics</a> for
further information and caveats.</p>
<h4 id="how-does-kudu-handle-dirty-reads">How does Kudu handle dirty reads?</h4>
<p>Neither “read committed” nor “READ_AT_SNAPSHOT” consistency modes permit dirty reads.</p>
<h4 id="where-is-kudus-jepsen-report">Where is Kudu’s Jepsen report?</h4>
<p>Kudu hasn’t been publicly tested with Jepsen but it is possible to run a set of tests following
<a href="https://github.com/apache/kudu/blob/master/java/kudu-jepsen/README.adoc">these instructions</a>.</p>
<h3 id="working-with-other-storage-systems">Working With Other Storage Systems</h3>
<h4 id="can-data-be-loaded-directly-into-kudu-what-ingest-tools-can-be-used">Can data be loaded directly into Kudu? What ingest tools can be used?</h4>
<p>Kudu provides direct access via Java and C++ APIs. An experimental Python API is
also available and is expected to be fully supported in the future. The easiest
way to load data into Kudu is to use a <code class="language-plaintext highlighter-rouge">CREATE TABLE ... AS SELECT * FROM ...</code>
statement in Impala. Additionally, data is commonly ingested into Kudu using
Spark, Nifi, and Flume.</p>
<h4 id="whats-the-most-efficient-way-to-bulk-load-data-into-kudu">What’s the most efficient way to bulk load data into Kudu?</h4>
<p>The easiest way to load data into Kudu is if the data is already managed by Impala.
In this case, a simple <code class="language-plaintext highlighter-rouge">INSERT INTO TABLE some_kudu_table SELECT * FROM some_csv_table</code>
does the trick.</p>
<p>You can also use Kudu’s Spark integration to load data from
any other Spark compatible data store.</p>
<p>No tool is provided to load data directly into Kudu’s on-disk data format. We
have found that for many workloads, the insert performance of Kudu is comparable
to bulk load performance of other systems.</p>
<h4 id="what-kinds-of-data-can-kudu-store-can-it-accept-json">What kinds of data can Kudu store? Can it accept JSON?</h4>
<p>Kudu uses typed storage and currently does not have a specific type for
semi-structured data such as JSON. Semi-structured data can be stored in a STRING or
BINARY column, but large values (10s of KB or more) are likely to cause
performance or stability problems in current versions.</p>
<p>Fuller support for semi-structured types like JSON and protobuf will be added in
the future, contingent on demand.</p>
<h4 id="is-there-a-jdbc-driver-available">Is there a JDBC driver available?</h4>
<p>Kudu is not a SQL engine. The availability of JDBC and ODBC drivers will be
dictated by the SQL engine used in combination with Kudu.</p>
<h4 id="do-you-need-hadoop-to-run-kudu">Do you need Hadoop to run Kudu?</h4>
<p>Kudu does not rely on any Hadoop components if it is accessed using its
programmatic APIs. However, most usage of Kudu will include at least one Hadoop
component such as MapReduce, Spark, or Impala. Components that have been
modified to take advantage of Kudu storage, such as Impala, might have Hadoop
dependencies.</p>
<h4 id="what-is-the-relationship-between-kudu-and-hdfs-does-kudu-require-hdfs">What is the relationship between Kudu and HDFS? Does Kudu require HDFS?</h4>
<p>Kudu is a separate storage system. It does not rely on or run on top of HDFS.
Kudu can coexist with HDFS on the same cluster.</p>
<h4 id="why-doesnt-kudu-store-its-data-in-hdfs">Why doesn’t Kudu store its data in HDFS?</h4>
<p>We considered a design which stored data on HDFS, but decided to go in a different
direction, for the following reasons:</p>
<ul>
<li>Kudu handles replication at the logical level using Raft consensus, which makes
HDFS replication redundant. We could have mandated a replication level of 1, but
that is not HDFS’s best use case.</li>
<li>Filesystem-level snapshots provided by HDFS do not directly translate to Kudu support for
snapshots, because it is hard to predict when a given piece of data will be flushed
from memory. In addition, snapshots only make sense if they are provided on a per-table
level, which would be difficult to orchestrate through a filesystem-level snapshot.</li>
<li>HDFS security doesn’t translate to table- or column-level ACLs. Similar to HBase
ACLs, Kudu would need to implement its own security system and would not get much
benefit from the HDFS security model.</li>
<li>Kudu’s scan performance is already within the same ballpark as Parquet files stored
on HDFS, so there’s no need to accommodate reading Kudu’s data files directly.</li>
</ul>
<h4 id="what-frameworks-are-integrated-with-kudu-for-data-access">What frameworks are integrated with Kudu for data access?</h4>
<p>Kudu is integrated with Impala, Spark, Nifi, MapReduce, and more. Additional
frameworks are expected, with Hive being the current highest priority addition.</p>
<h4 id="can-i-colocate-kudu-with-hdfs-on-the-same-servers">Can I colocate Kudu with HDFS on the same servers?</h4>
<p>Kudu can be colocated with HDFS on the same data disk mount points. This is similar
to colocating Hadoop and HBase workloads. Kudu has been extensively tested
in this type of configuration, with no stability issues. For latency-sensitive workloads,
consider dedicating an SSD to Kudu’s WAL files.</p>
<h3 id="hardware-and-operations">Hardware and Operations</h3>
<h4 id="what-are-kudus-runtime-dependencies">What are Kudu’s runtime dependencies?</h4>
<p>Kudu itself doesn’t have any service dependencies and can run on a cluster without Hadoop,
Impala, Spark, or any other project.</p>
<p>If you want to use Impala, note that Impala depends on Hive’s metadata server, which has
its own dependencies on Hadoop. It is not currently possible to have a pure Kudu+Impala
deployment.</p>
<h4 id="should-the-master-node-have-more-ram-than-worker-nodes">Should the master node have more RAM than worker nodes?</h4>
<p>For small clusters with fewer than 100 nodes, with reasonable numbers of tables
and tablets, the master node requires very little RAM, typically 1 GB or less.
For workloads with large numbers of tables or tablets, more RAM will be
required, but not more RAM than typical Hadoop worker nodes.</p>
<h4 id="is-the-master-node-a-single-point-of-failure">Is the master node a single point of failure?</h4>
<p>No. Kudu includes support for running multiple Master nodes, using the same Raft
consensus algorithm that is used for durability of data.</p>
<h4 id="does-kudu-require-the-use-of-ssds">Does Kudu require the use of SSDs?</h4>
<p>No, SSDs are not a requirement of Kudu. Kudu is designed to take full advantage
of fast storage and large amounts of memory if present, but neither is required.</p>
<h4 id="can-a-kudu-deployment-be-geo-distributed">Can a Kudu deployment be geo-distributed?</h4>
<p>We don’t recommend geo-distributing tablet servers at this time because of the possibility
of higher write latencies. In addition, Kudu is not currently aware of data placement.
This could lead to a situation where the master might try to put all replicas
in the same datacenter. We plan to implement the necessary features for geo-distribution
in a future release.</p>
<h4 id="where-is-the-kudu-shell">Where is the Kudu shell?</h4>
<p>Kudu doesn’t yet have a command-line shell. If the Kudu-compatible version of Impala is
installed on your cluster then you can use it as a replacement for a shell. See also the
docs for the <a href="docs/kudu_impala_integration.html">Kudu Impala Integration</a>.</p>
<h4 id="is-the-kudu-master-a-bottleneck">Is the Kudu Master a bottleneck?</h4>
<p>Although the Master is not sharded, it is not expected to become a bottleneck for
the following reasons.</p>
<ul>
<li>Like many other systems, the master is not on the hot path once the tablet
locations are cached.</li>
<li>The Kudu master process is extremely efficient at keeping everything in memory.
In our testing on an 80-node cluster, the 99.99th percentile latency for getting
tablet locations was on the order of hundreds of microseconds (not a typo).</li>
</ul>
<h4 id="what-operating-systems-does-kudu-support">What operating systems does Kudu support?</h4>
<p>Linux is required to run Kudu. See the <a href="docs/installation.html#prerequisites_and_requirements">installation
guide</a> for details. OSX
is supported as a development platform in Kudu 0.6.0 and newer. The Java client
can be used on any JVM 7+ platform.</p>
<h4 id="what-linux-based-operating-systems-are-known-not-to-work-with-kudu">What Linux-based operating systems are known NOT to work with Kudu?</h4>
<p><strong>RHEL 5</strong>: the kernel is missing critical features for handling disk space
reclamation (such as hole punching), and it is not possible to run applications
which use C++11 language features.</p>
<p><strong>Debian 7</strong>: ships with gcc 4.7.2 which produces broken Kudu optimized code,
and there is insufficient support for applications which use C++11 language
features.</p>
<p><strong>SLES 11</strong>: it is not possible to run applications which use C++11 language
features.</p>
<h4 id="how-can-i-back-up-my-kudu-data">How can I back up my Kudu data?</h4>
<p>As of Kudu 1.10.0, Kudu supports both full and incremental table backups via a
job implemented using Apache Spark. Additionally it supports restoring tables
from full and incremental backups via a restore job implemented using Apache Spark.
See the <a href="docs/administration.html">administration documentation</a> for details.</p>
<p>For older versions which do not have a built-in backup mechanism, Impala can
help if you have it available. You can use it to copy your data into Parquet
format using a statement like:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>INSERT INTO TABLE some_parquet_table SELECT * FROM kudu_table
</code></pre></div></div>
<p>then use <a href="https://hadoop.apache.org/docs/current/hadoop-distcp/DistCp.html">distcp</a>
to copy the Parquet data to another cluster.</p>
<h4 id="can-the-wal-transaction-logs-be-used-to-build-a-disaster-recovery-site">Can the WAL transaction logs be used to build a disaster recovery site?</h4>
<p>Currently, Kudu does not support any mechanism for shipping or replaying WALs
between sites.</p>
<h4 id="is-there-a-single-wal-per-tablet-or-per-table">Is there a single WAL per tablet or per table?</h4>
<p>There is one WAL per tablet.</p>
<h3 id="security">Security</h3>
<h4 id="how-is-security-handled-in-kudu">How is security handled in Kudu?</h4>
<p>Kudu supports strong authentication and is designed to interoperate with other
secure Hadoop components by utilizing Kerberos. It also supports coarse-grained
authorization of client requests and TLS encryption of communication among
servers and between clients and servers. To learn more, please refer to the
<a href="https://kudu.apache.org/docs/security.html">security guide</a>.</p>
<h3 id="schema-design">Schema Design</h3>
<h4 id="can-kudu-tolerate-changing-schemas">Can Kudu tolerate changing schemas?</h4>
<p>Yes, Kudu provides the ability to add, drop, and rename columns/tables.
Currently it is not possible to change the type of a column in-place, though
this is expected to be added to a subsequent Kudu release.</p>
<h4 id="are-there-best-practices-in-terms-of-data-modeling">Are there best practices in terms of data modeling?</h4>
<p>Kudu tables must have a unique primary key. Kudu has not been tested with
columns containing large values (10s of KB and higher) and performance problems
when using large values are anticipated. See
<a href="https://kudu.apache.org/docs/schema_design.html">Schema Design</a>.</p>
<h4 id="can-kudu-be-used-to-replace-lambda-architectures">Can Kudu be used to replace Lambda Architectures?</h4>
<p>In many cases Kudu’s combination of real-time and analytic performance will
allow the complexity inherent to Lambda architectures to be simplified through
the use of a single storage engine.</p>
<h4 id="is-there-a-way-to-force-the-order-of-execution-of-a-list-statement-ie-force-an-update-on-table-a-after-a-previous-insert-on-table-b">Is there a way to force the order of execution of a list of statements (i.e. force an update on table A after a previous insert on table B)?</h4>
<p>When using the Kudu API, users can choose to perform synchronous operations.
If a sequence of synchronous operations is made, Kudu guarantees that timestamps
are assigned in a corresponding order.</p>
<h4 id="should-i-use-kudu-for-oltp-type-workloads-how-does-kudu-relate-to-spanner-from-an-oltp-standpoint">Should I use Kudu for OLTP-type workloads? How does Kudu relate to Spanner from an OLTP standpoint?</h4>
<p>Kudu is inspired by Spanner in that it uses a consensus-based replication design and
timestamps for consistency control, but the on-disk layout is pretty different.</p>
<p>Kudu was designed and optimized for OLAP workloads and lacks features such as multi-row
transactions and secondary indexing typically needed to support OLTP.</p>
<p>As a true column store, Kudu is not as efficient for OLTP as a row store would be. There are also
currently some implementation issues that hurt Kudu’s performance on Zipfian distribution
updates (see the YCSB results in the performance evaluation of our <a href="kudu.pdf">draft paper</a>).</p>
<p>We anticipate that future releases will continue to improve performance for these workloads,
but Kudu is not designed to be a full replacement for OLTP stores for all workloads. Please
consider other storage engines such as Apache HBase or a traditional RDBMS.</p>
<h3 id="indexes">Indexes</h3>
<h4 id="can-multi-column-indexes-be-created">Can multi-column indexes be created?</h4>
<p>Kudu supports compound primary keys. Secondary indexes, compound or not, are not
currently supported.</p>
<h4 id="does-kudu-support-secondary-indexes">Does Kudu support secondary indexes?</h4>
<p>No, Kudu does not support secondary indexes. Random access is only possible through the
primary key. For analytic drill-down queries, Kudu has very fast single-column scans which
allow it to produce sub-second results when querying across billions of rows on small
clusters.</p>
<h4 id="are-index-updates-maintained-automatically">Are index updates maintained automatically?</h4>
<p>Kudu’s primary key is automatically maintained. Secondary indexes, manually or
automatically maintained, are not currently supported.</p>
<h4 id="is-there-a-concept-like-partitioning-keys-like-with-cassandra-primary-and-secondary-index-concepts">Is there a concept like partitioning keys like with Cassandra (primary and secondary index concepts)?</h4>
<p>Kudu’s primary key can be either simple (a single column) or compound
(multiple columns). Within any tablet, rows are written in the sort order of the
primary key. In the case of a compound key, sorting is determined by the order
that the columns in the key are declared. For hash-based distribution, a hash of
the entire key is used to determine the “bucket” that values will be placed in.</p>
<p>With either type of partitioning, it is possible to partition based on only a
subset of the primary key columns. For example, a primary key of “(host, timestamp)”
could be range-partitioned on only the timestamp column.</p>
<h4 id="does-kudu-have-relational-features-like-autoincrement-column-pkfk-constraints-or-built-in-indexes">Does Kudu have relational features like autoincrement column, PK/FK constraints, or built-in indexes?</h4>
<p>Kudu tables have a primary key that is used for uniqueness as well as providing
quick access to individual rows. Auto-incrementing columns, foreign key constraints,
and secondary indexes are not currently supported, but could be added in subsequent
Kudu releases.</p>
<h3 id="transactions">Transactions</h3>
<h4 id="does-kudu-support-multi-row-transactions">Does Kudu support multi-row transactions?</h4>
<p>No, Kudu does not support multi-row transactions at this time. However, single row
operations are atomic within that row.</p>
<h4 id="does-kudu-offer-acid-compliance">Does Kudu offer ACID compliance?</h4>
<p>Kudu is designed to eventually be fully ACID compliant. However, multi-row
transactions are not yet implemented. The single-row transaction guarantees it
currently provides are very similar to HBase.</p>
<h4 id="is-a-rollback-concept-supported">Is a rollback concept supported?</h4>
<p>Kudu does not currently support transaction rollback.</p>
</div>
</div>
<!-- Site footer: ASF copyright and trademark notices, plus a banner image linking to the current Apache event page. -->
<footer class="footer">
  <div class="row">
    <div class="col-md-9">
      <p class="small">
        Copyright &copy; 2019 The Apache Software Foundation.
      </p>
      <p class="small">
        Apache Kudu, Kudu, Apache, the Apache feather logo, and the Apache Kudu
        project logo are either registered trademarks or trademarks of The
        Apache Software Foundation in the United States and other countries.
      </p>
    </div>
    <div class="col-md-3">
      <a class="pull-right" href="https://www.apache.org/events/current-event.html">
        <!-- The image is the link's only content, so its alt text supplies the link's accessible name. -->
        <img src="https://www.apache.org/events/current-event-234x60.png" alt="Current Apache Software Foundation event"/>
      </a>
    </div>
  </div>
</footer>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<script>
  // When the DOM is ready, tag the root element with "touch" or "no-touch"
  // depending on whether it exposes an "ontouchstart" property.
  // This is only a heuristic: many laptops have touch screens too.
  $(function () {
    var root = document.documentElement;
    var hasTouch = "ontouchstart" in root;
    $(root).addClass(hasTouch ? "touch" : "no-touch");
  });
</script>
<!-- Bootstrap 3.3.6 JavaScript from the CDN. The integrity attribute carries a Subresource
     Integrity hash so the browser rejects the file if its contents differ, and
     crossorigin="anonymous" requests it without credentials. -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"
integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS"
crossorigin="anonymous"></script>
<script>
  // Google Analytics (analytics.js) async loader: installs a stub `ga` function that
  // queues its arguments, records the load time, then inserts an async <script> for the
  // library before the first <script> element on the page.
  // The library URL uses an explicit https:// scheme instead of a protocol-relative one,
  // so it is never requested over plain HTTP and still resolves when the page is opened
  // from a file:// URL.
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
  // Create the tracker for this property and record the page view.
  ga('create', 'UA-68448017-1', 'auto');
  ga('send', 'pageview');
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/3.1.0/anchor.js"></script>
<script>
  // Configure AnchorJS (placement and visibility of the generated links), then add the
  // anchors. No selector is passed to add(), so the library's default applies.
  anchors.options = { placement: 'right', visible: 'touch' };
  anchors.add();
</script>
</body>
</html>