|  | <!DOCTYPE html> | 
|  | <html lang="en"> | 
|  | <head> | 
|  | <meta charset="UTF-8" /> | 
|  | <meta name="viewport" content="width=device-width, initial-scale=1.0"> | 
|  | <meta name="description" content="Apache Druid"> | 
|  | <meta name="keywords" content="druid,kafka,database,analytics,streaming,real-time,real time,apache,open source"> | 
|  | <meta name="author" content="Apache Software Foundation"> | 
|  |  | 
|  | <title>Druid | Ingestion</title> | 
|  |  | 
|  | <link rel="alternate" type="application/atom+xml" href="/feed"> | 
|  | <link rel="shortcut icon" href="/img/favicon.png"> | 
|  |  | 
|  | <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.7.2/css/all.css" integrity="sha384-fnmOCqbTlWIlj8LyTjo7mOUStjsKC4pOpQbqyi7RrhN7udi9RwhKkMHpvLbHG9Sr" crossorigin="anonymous"> | 
|  |  | 
|  | <link href='//fonts.googleapis.com/css?family=Open+Sans+Condensed:300,700,300italic|Open+Sans:300italic,400italic,600italic,400,300,600,700' rel='stylesheet' type='text/css'> | 
|  |  | 
|  | <link rel="stylesheet" href="/css/bootstrap-pure.css?v=1.1"> | 
|  | <link rel="stylesheet" href="/css/base.css?v=1.1"> | 
|  | <link rel="stylesheet" href="/css/header.css?v=1.1"> | 
|  | <link rel="stylesheet" href="/css/footer.css?v=1.1"> | 
|  | <link rel="stylesheet" href="/css/syntax.css?v=1.1"> | 
|  | <link rel="stylesheet" href="/css/docs.css?v=1.1"> | 
|  |  | 
|  | <script> | 
|  | (function() { | 
|  | var cx = '000162378814775985090:molvbm0vggm'; | 
|  | var gcse = document.createElement('script'); | 
|  | gcse.type = 'text/javascript'; | 
|  | gcse.async = true; | 
|  | gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + | 
|  | '//cse.google.com/cse.js?cx=' + cx; | 
|  | var s = document.getElementsByTagName('script')[0]; | 
|  | s.parentNode.insertBefore(gcse, s); | 
|  | })(); | 
|  | </script> | 
|  |  | 
|  |  | 
|  | </head> | 
|  |  | 
|  | <body> | 
|  | <!-- Start page_header include --> | 
|  | <script src="//ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script> | 
|  |  | 
|  | <div class="top-navigator"> | 
|  | <div class="container"> | 
|  | <div class="left-cont"> | 
|  | <a class="logo" href="/"><span class="druid-logo"></span></a> | 
|  | </div> | 
|  | <div class="right-cont"> | 
|  | <ul class="links"> | 
|  | <li class=""><a href="/technology">Technology</a></li> | 
|  | <li class=""><a href="/use-cases">Use Cases</a></li> | 
|  | <li class=""><a href="/druid-powered">Powered By</a></li> | 
|  | <li class=""><a href="/docs/latest/design/">Docs</a></li> | 
|  | <li class=""><a href="/community/">Community</a></li> | 
|  | <li class="header-dropdown"> | 
|  | <a>Apache</a> | 
|  | <div class="header-dropdown-menu"> | 
|  | <a href="https://www.apache.org/" target="_blank">Foundation</a> | 
|  | <a href="https://www.apache.org/events/current-event" target="_blank">Events</a> | 
|  | <a href="https://www.apache.org/licenses/" target="_blank">License</a> | 
|  | <a href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a> | 
|  | <a href="https://www.apache.org/security/" target="_blank">Security</a> | 
|  | <a href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Sponsorship</a> | 
|  | </div> | 
|  | </li> | 
|  | <li class=" button-link"><a href="/downloads.html">Download</a></li> | 
|  | </ul> | 
|  | </div> | 
|  | </div> | 
|  | <div class="action-button menu-icon"> | 
|  | <span class="fa fa-bars"></span> MENU | 
|  | </div> | 
|  | <div class="action-button menu-icon-close"> | 
|  | <span class="fa fa-times"></span> MENU | 
|  | </div> | 
|  | </div> | 
|  |  | 
|  | <script type="text/javascript"> | 
|  | var $menu = $('.right-cont'); | 
|  | var $menuIcon = $('.menu-icon'); | 
|  | var $menuIconClose = $('.menu-icon-close'); | 
|  |  | 
|  | function showMenu() { | 
|  | $menu.fadeIn(100); | 
|  | $menuIcon.fadeOut(100); | 
|  | $menuIconClose.fadeIn(100); | 
|  | } | 
|  |  | 
|  | $menuIcon.click(showMenu); | 
|  |  | 
|  | function hideMenu() { | 
|  | $menu.fadeOut(100); | 
|  | $menuIconClose.fadeOut(100); | 
|  | $menuIcon.fadeIn(100); | 
|  | } | 
|  |  | 
|  | $menuIconClose.click(hideMenu); | 
|  |  | 
|  | $(window).resize(function() { | 
|  | if ($(window).width() >= 840) { | 
|  | $menu.fadeIn(100); | 
|  | $menuIcon.fadeOut(100); | 
|  | $menuIconClose.fadeOut(100); | 
|  | } | 
|  | else { | 
|  | $menu.fadeOut(100); | 
|  | $menuIcon.fadeIn(100); | 
|  | $menuIconClose.fadeOut(100); | 
|  | } | 
|  | }); | 
|  | </script> | 
|  |  | 
|  | <!-- Stop page_header include --> | 
|  |  | 
|  |  | 
|  | <div class="container doc-container"> | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | <p> Looking for the <a href="/docs/0.17.1/">latest stable documentation</a>?</p> | 
|  |  | 
|  |  | 
|  | <div class="row"> | 
|  | <div class="col-md-9 doc-content"> | 
|  | <p> | 
|  | <a class="btn btn-default btn-xs visible-xs-inline-block visible-sm-inline-block" href="#toc">Table of Contents</a> | 
|  | </p> | 
|  | <!-- | 
|  | ~ Licensed to the Apache Software Foundation (ASF) under one | 
|  | ~ or more contributor license agreements.  See the NOTICE file | 
|  | ~ distributed with this work for additional information | 
|  | ~ regarding copyright ownership.  The ASF licenses this file | 
|  | ~ to you under the Apache License, Version 2.0 (the | 
|  | ~ "License"); you may not use this file except in compliance | 
|  | ~ with the License.  You may obtain a copy of the License at | 
|  | ~ | 
|  | ~   http://www.apache.org/licenses/LICENSE-2.0 | 
|  | ~ | 
|  | ~ Unless required by applicable law or agreed to in writing, | 
|  | ~ software distributed under the License is distributed on an | 
|  | ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
|  | ~ KIND, either express or implied.  See the License for the | 
|  | ~ specific language governing permissions and limitations | 
|  | ~ under the License. | 
|  | --> | 
|  |  | 
|  | <h1 id="ingestion">Ingestion</h1> | 
|  |  | 
|  | <h2 id="overview">Overview</h2> | 
|  |  | 
|  | <p><a name="datasources"></a></p> | 
|  |  | 
|  | <h3 id="datasources-and-segments">Datasources and segments</h3> | 
|  |  | 
|  | <p>Apache Druid (incubating) data is stored in "datasources", which are similar to tables in a traditional RDBMS. Each datasource is | 
|  | partitioned by time and, optionally, further partitioned by other attributes. Each time range is called a "chunk" (for | 
|  | example, a single day, if your datasource is partitioned by day). Within a chunk, data is partitioned into one or more | 
|  | "segments". Each segment is a single file, typically comprising up to a few million rows of data. Since segments are | 
|  | organized into time chunks, it's sometimes helpful to think of segments as living on a timeline like the following:</p> | 
|  |  | 
|  | <p><img src="../../img/druid-timeline.png" alt="Diagram of Druid segments arranged on a timeline" width="800" /></p> | 
|  |  | 
|  | <p>A datasource may have anywhere from just a few segments, up to hundreds of thousands and even millions of segments. Each | 
|  | segment starts life off being created on a MiddleManager, and at that point, is mutable and uncommitted. The segment | 
|  | building process includes the following steps, designed to produce a data file that is compact and supports fast | 
|  | queries:</p> | 
|  |  | 
|  | <ul> | 
|  | <li>Conversion to columnar format</li> | 
|  | <li>Indexing with bitmap indexes</li> | 
|  | <li>Compression using various algorithms | 
|  |  | 
|  | <ul> | 
|  | <li>Dictionary encoding with id storage minimization for String columns</li> | 
|  | <li>Bitmap compression for bitmap indexes</li> | 
|  | <li>Type-aware compression for all columns</li> | 
|  | </ul></li> | 
|  | </ul> | 
|  |  | 
|  | <p>Periodically, segments are published (committed). At this point, they are written to deep storage, become immutable, and | 
|  | move from MiddleManagers to the Historical processes. An entry about the segment is also written to the metadata store. | 
|  | This entry is a self-describing bit of metadata about the segment, including things like the schema of the segment, its | 
|  | size, and its location on deep storage. These entries are what the Coordinator uses to know what data <em>should</em> be | 
|  | available on the cluster.</p> | 
|  |  | 
|  | <p>For details on the segment file format, please see <a href="../design/segments.html">segment files</a>.</p> | 
|  |  | 
|  | <p>For details on modeling your data in Druid, see <a href="schema-design.html">schema design</a>.</p> | 
|  |  | 
|  | <h4 id="segment-identifiers">Segment identifiers</h4> | 
|  |  | 
|  | <p>Segments all have a four-part identifier with the following components:</p> | 
|  |  | 
|  | <ul> | 
|  | <li>Datasource name.</li> | 
|  | <li>Time interval (for the time chunk containing the segment; this corresponds to the <code>segmentGranularity</code> specified | 
|  | at ingestion time).</li> | 
|  | <li>Version number (generally an ISO8601 timestamp corresponding to when the segment set was first started).</li> | 
|  | <li>Partition number (an integer, unique within a datasource+interval+version; may not necessarily be contiguous).</li> | 
|  | </ul> | 
|  |  | 
|  | <p>For example, this is the identifier for a segment in datasource <code>clarity-cloud0</code>, time chunk | 
|  | <code>2018-05-21T16:00:00.000Z/2018-05-21T17:00:00.000Z</code>, version <code>2018-05-21T15:56:09.909Z</code>, and partition number 1:</p> | 
|  | <div class="highlight"><pre><code class="language-text" data-lang="text"><span></span>clarity-cloud0_2018-05-21T16:00:00.000Z_2018-05-21T17:00:00.000Z_2018-05-21T15:56:09.909Z_1 | 
|  | </code></pre></div> | 
|  | <p>Segments with partition number 0 (the first partition in a chunk) omit the partition number, like the following | 
|  | example, which is a segment in the same time chunk as the previous one, but with partition number 0 instead of 1:</p> | 
|  | <div class="highlight"><pre><code class="language-text" data-lang="text"><span></span>clarity-cloud0_2018-05-21T16:00:00.000Z_2018-05-21T17:00:00.000Z_2018-05-21T15:56:09.909Z | 
|  | </code></pre></div> | 
|  | <h4 id="segment-versioning">Segment versioning</h4> | 
|  |  | 
|  | <p>You may be wondering what the "version number" described in the previous section is for. Or, you might not be, in which | 
|  | case good for you and you can skip this section!</p> | 
|  |  | 
|  | <p>It's there to support batch-mode overwriting. In Druid, if all you ever do is append data, then there will be just a | 
|  | single version for each time chunk. But when you overwrite data, what happens behind the scenes is that a new set of | 
|  | segments is created with the same datasource, same time interval, but a higher version number. This is a signal to the | 
|  | rest of the Druid system that the older version should be removed from the cluster, and the new version should replace | 
|  | it.</p> | 
|  |  | 
|  | <p>The switch appears to happen instantaneously to a user, because Druid handles this by first loading the new data (but | 
|  | not allowing it to be queried), and then, as soon as the new data is all loaded, switching all new queries to use those | 
|  | new segments. Then it drops the old segments a few minutes later.</p> | 
|  |  | 
|  | <h4 id="segment-states">Segment states</h4> | 
|  |  | 
|  | <p>Segments can be either <em>available</em> or <em>unavailable</em>, which refers to whether or not they are currently served by some | 
|  | Druid server process. They can also be <em>published</em> or <em>unpublished</em>, which refers to whether or not they have been | 
|  | written to deep storage and the metadata store. And published segments can be either <em>used</em> or <em>unused</em>, which refers to | 
|  | whether or not Druid considers them active segments that should be served.</p> | 
|  |  | 
|  | <p>Putting these together, there are five basic states that a segment can be in:</p> | 
|  |  | 
|  | <ul> | 
|  | <li><strong>Published, available, and used:</strong> These segments are published in deep storage and the metadata store, and they are | 
|  | served by Historical processes. They are the majority of active data in a Druid cluster (they include everything except | 
|  | in-flight realtime data).</li> | 
|  | <li><strong>Published, available, and unused:</strong> These segments are being served by Historicals, but won't be for very long. They | 
|  | may be segments that have recently been overwritten (see <a href="#segment-versioning">Segment versioning</a>) or dropped for | 
|  | other reasons (like drop rules, or being dropped manually).</li> | 
|  | <li><strong>Published, unavailable, and used:</strong> These segments are published in deep storage and the metadata store, and | 
|  | <em>should</em> be served, but are not actually being served. If segments stay in this state for more than a few minutes, it's | 
|  | usually because something is wrong. Some of the more common causes include: failure of a large number of Historicals, | 
|  | Historicals being out of capacity to download more segments, and some issue with coordination that prevents the | 
|  | Coordinator from telling Historicals to load new segments.</li> | 
|  | <li><strong>Published, unavailable, and unused:</strong> These segments are published in deep storage and the metadata store, but | 
|  | are inactive (because they have been overwritten or dropped). They lie dormant, and can potentially be resurrected | 
|  | by manual action if needed (in particular: setting the "used" flag to true).</li> | 
|  | <li><strong>Unpublished and available:</strong> This is the state that segments are in while they are being built by Druid ingestion | 
|  | tasks. This includes all "realtime" data that has not been handed off to Historicals yet. Segments in this state may or | 
|  | may not be replicated. If all replicas are lost, then the segment must be rebuilt from scratch. This may or may not be | 
|  | possible. (It is possible with Kafka, and happens automatically; it is possible with S3/HDFS by restarting the job; and | 
|  | it is <em>not</em> possible with Tranquility, so in that case, data will be lost.)</li> | 
|  | </ul> | 
|  |  | 
|  | <p>The sixth state in this matrix, "unpublished and unavailable," isn't possible. If a segment isn't published and isn't | 
|  | being served, then does it really exist?</p> | 
|  |  | 
|  | <h4 id="indexing-and-handoff">Indexing and handoff</h4> | 
|  |  | 
|  | <p><em>Indexing</em> is the mechanism by which new segments are created, and <em>handoff</em> is the mechanism by which they are published | 
|  | and begin being served by Historical processes. The mechanism works like this on the indexing side:</p> | 
|  |  | 
|  | <ol> | 
|  | <li>An <em>indexing task</em> starts running and building a new segment. It must determine the identifier of the segment before | 
|  | it starts building it. For a task that is appending (like a Kafka task, or an index task in append mode) this will be | 
|  | done by calling an "allocate" API on the Overlord to potentially add a new partition to an existing set of segments. For | 
|  | a task that is overwriting (like a Hadoop task, or an index task <em>not</em> in append mode) this is done by locking an | 
|  | interval and creating a new version number and new set of segments.</li> | 
|  | <li>If the indexing task is a realtime task (like a Kafka task) then the segment is immediately queryable at this point. | 
|  | It's available, but unpublished.</li> | 
|  | <li>When the indexing task has finished reading data for the segment, it pushes it to deep storage and then publishes it | 
|  | by writing a record into the metadata store.</li> | 
|  | <li>If the indexing task is a realtime task, at this point it waits for a Historical process to load the segment. If the | 
|  | indexing task is not a realtime task, it exits immediately.</li> | 
|  | </ol> | 
|  |  | 
|  | <p>And like this on the Coordinator / Historical side:</p> | 
|  |  | 
|  | <ol> | 
|  | <li>The Coordinator polls the metadata store periodically (by default, every 1 minute) for newly published segments.</li> | 
|  | <li>When the Coordinator finds a segment that is published and used, but unavailable, it chooses a Historical process | 
|  | to load that segment and instructs that Historical to do so.</li> | 
|  | <li>The Historical loads the segment and begins serving it.</li> | 
|  | <li>At this point, if the indexing task was waiting for handoff, it will exit.</li> | 
|  | </ol> | 
|  |  | 
|  | <h2 id="ingestion-methods">Ingestion methods</h2> | 
|  |  | 
|  | <p>In most ingestion methods, the work of loading data and building segments is done by Druid | 
|  | MiddleManager processes. One exception is Hadoop-based ingestion, where this work is instead done using a Hadoop MapReduce | 
|  | job on YARN (although MiddleManager processes are still involved in starting and monitoring the Hadoop jobs).</p> | 
|  |  | 
|  | <p>Once segments have been generated and stored in <a href="../dependencies/deep-storage.html">deep storage</a>, they will be loaded by Druid Historical processes. Some Druid | 
|  | ingestion methods additionally support <em>real-time queries</em>, meaning you can query in-flight data on MiddleManager processes | 
|  | before it is finished being converted and written to deep storage. In general, a small amount of data will be in-flight | 
|  | on MiddleManager processes relative to the larger amount of historical data being served from Historical processes.</p> | 
|  |  | 
|  | <p>See the <a href="../design/index.html">Design</a> page for more details on how Druid stores and manages your data.</p> | 
|  |  | 
|  | <p>The table below lists Druid's most common data ingestion methods, along with comparisons to help you choose | 
|  | the best one for your situation.</p> | 
|  |  | 
|  | <table><thead> | 
|  | <tr> | 
|  | <th>Method</th> | 
|  | <th>How it works</th> | 
|  | <th>Can append and overwrite?</th> | 
|  | <th>Can handle late data?</th> | 
|  | <th>Exactly-once ingestion?</th> | 
|  | <th>Real-time queries?</th> | 
|  | </tr> | 
|  | </thead><tbody> | 
|  | <tr> | 
|  | <td><a href="native_tasks.html">Native batch</a></td> | 
|  | <td>Druid loads data directly from S3, HTTP, NFS, or other networked storage.</td> | 
|  | <td>Append or overwrite</td> | 
|  | <td>Yes</td> | 
|  | <td>Yes</td> | 
|  | <td>No</td> | 
|  | </tr> | 
|  | <tr> | 
|  | <td><a href="hadoop.html">Hadoop</a></td> | 
|  | <td>Druid launches Hadoop Map/Reduce jobs to load data files.</td> | 
|  | <td>Overwrite</td> | 
|  | <td>Yes</td> | 
|  | <td>Yes</td> | 
|  | <td>No</td> | 
|  | </tr> | 
|  | <tr> | 
|  | <td><a href="../development/extensions-core/kafka-ingestion.html">Kafka indexing service</a></td> | 
|  | <td>Druid reads directly from Kafka.</td> | 
|  | <td>Append only</td> | 
|  | <td>Yes</td> | 
|  | <td>Yes</td> | 
|  | <td>Yes</td> | 
|  | </tr> | 
|  | <tr> | 
|  | <td><a href="stream-push.html">Tranquility</a></td> | 
|  | <td>You use Tranquility, a client side library, to push individual records into Druid.</td> | 
|  | <td>Append only</td> | 
|  | <td>No - late data is dropped</td> | 
|  | <td>No - may drop or duplicate data</td> | 
|  | <td>Yes</td> | 
|  | </tr> | 
|  | </tbody></table> | 
|  |  | 
|  | <h2 id="partitioning">Partitioning</h2> | 
|  |  | 
|  | <p>Druid is a distributed data store, and it partitions your data in order to process it in parallel. Druid | 
|  | <a href="../design/index.html">datasources</a> are always partitioned first by time based on the | 
|  | <a href="../ingestion/index.html#granularityspec">segmentGranularity</a> parameter of your ingestion spec. Each of these time partitions is called | 
|  | a <em>time chunk</em>, and each time chunk contains one or more <a href="../design/segments.html">segments</a>. The segments within a | 
|  | particular time chunk may be partitioned further using options that vary based on the ingestion method you have chosen.</p> | 
|  |  | 
|  | <ul> | 
|  | <li>With <a href="hadoop.html">Hadoop</a> you can do hash- or range-based partitioning on one or more columns.</li> | 
|  | <li>With <a href="native_tasks.html">Native batch</a> you can partition on a hash of dimension columns. This is useful when | 
|  | rollup is enabled, since it maximizes your space savings.</li> | 
|  | <li>With <a href="../development/extensions-core/kafka-ingestion.html">Kafka indexing</a>, partitioning is based on Kafka | 
|  | partitions, and is not configurable through Druid. You can configure it on the Kafka side by using the partitioning | 
|  | functionality of the Kafka producer.</li> | 
|  | <li>With <a href="stream-push.html">Tranquility</a>, partitioning is done by default on a hash of all dimension columns in order | 
|  | to maximize rollup. You can also provide a custom Partitioner class; see the | 
|  | <a href="https://github.com/druid-io/tranquility/blob/master/docs/overview.md#partitioning-and-replication">Tranquility documentation</a> | 
|  | for details.</li> | 
|  | </ul> | 
|  |  | 
|  | <p>All Druid datasources are partitioned by time. Each data ingestion method must acquire a write lock on a particular | 
|  | time range when loading data, so no two methods can operate on the same time range of the same datasource at the same | 
|  | time. However, two data ingestion methods <em>can</em> operate on different time ranges of the same datasource at the same | 
|  | time. For example, you can do a batch backfill from Hadoop while also doing a real-time load from Kafka, so long as | 
|  | the backfill data and the real-time data do not need to be written to the same time partitions. (If they do, the | 
|  | real-time load will take priority.)</p> | 
|  |  | 
|  | <p>For tips on how partitioning can affect performance and storage footprint, see the | 
|  | <a href="schema-design.html#partitioning">schema design</a> page.</p> | 
|  |  | 
|  | <h2 id="rollup">Rollup</h2> | 
|  |  | 
|  | <p>Druid is able to summarize raw data at ingestion time using a process we refer to as "roll-up". | 
|  | Roll-up is a first-level aggregation operation over a selected set of "dimensions", where a set of "metrics" are aggregated.</p> | 
|  |  | 
|  | <p>Suppose we have the following raw data, representing total packet/byte counts in particular seconds for traffic between a source and destination. The <code>srcIP</code> and <code>dstIP</code> fields are dimensions, while <code>packets</code> and <code>bytes</code> are metrics.</p> | 
|  | <div class="highlight"><pre><code class="language-text" data-lang="text"><span></span>timestamp                 srcIP         dstIP          packets     bytes | 
|  | 2018-01-01T01:01:35Z      1.1.1.1       2.2.2.2            100      1000 | 
|  | 2018-01-01T01:01:51Z      1.1.1.1       2.2.2.2            200      2000 | 
|  | 2018-01-01T01:01:59Z      1.1.1.1       2.2.2.2            300      3000 | 
|  | 2018-01-01T01:02:14Z      1.1.1.1       2.2.2.2            400      4000 | 
|  | 2018-01-01T01:02:29Z      1.1.1.1       2.2.2.2            500      5000 | 
|  | 2018-01-01T01:03:29Z      1.1.1.1       2.2.2.2            600      6000 | 
|  | 2018-01-02T21:33:14Z      7.7.7.7       8.8.8.8            100      1000 | 
|  | 2018-01-02T21:33:45Z      7.7.7.7       8.8.8.8            200      2000 | 
|  | 2018-01-02T21:35:45Z      7.7.7.7       8.8.8.8            300      3000 | 
|  | </code></pre></div> | 
|  | <p>If we ingest this data into Druid with a <code>queryGranularity</code> of <code>minute</code> (which will floor timestamps to minutes), the roll-up operation is equivalent to the following pseudocode:</p> | 
|  | <div class="highlight"><pre><code class="language-text" data-lang="text"><span></span>GROUP BY TRUNCATE(timestamp, MINUTE), srcIP, dstIP :: SUM(packets), SUM(bytes) | 
|  | </code></pre></div> | 
|  | <p>After the data above is aggregated during roll-up, the following rows will be ingested:</p> | 
|  | <div class="highlight"><pre><code class="language-text" data-lang="text"><span></span>timestamp                 srcIP         dstIP          packets     bytes | 
|  | 2018-01-01T01:01:00Z      1.1.1.1       2.2.2.2            600      6000 | 
|  | 2018-01-01T01:02:00Z      1.1.1.1       2.2.2.2            900      9000 | 
|  | 2018-01-01T01:03:00Z      1.1.1.1       2.2.2.2            600      6000 | 
|  | 2018-01-02T21:33:00Z      7.7.7.7       8.8.8.8            300      3000 | 
|  | 2018-01-02T21:35:00Z      7.7.7.7       8.8.8.8            300      3000 | 
|  | </code></pre></div> | 
|  | <p>The roll-up granularity is the minimum granularity at which you will be able to explore data, and events are floored to this granularity. | 
|  | Hence, Druid ingestion specs define this granularity as the <code>queryGranularity</code> of the data. The lowest supported <code>queryGranularity</code> is millisecond.</p> | 
|  |  | 
|  | <p>The following links may be helpful in further understanding dimensions and metrics:</p> | 
|  |  | 
|  | <ul> | 
|  | <li><p><a href="https://en.wikipedia.org/wiki/Dimension_(data_warehouse)">https://en.wikipedia.org/wiki/Dimension_(data_warehouse)</a></p></li> | 
|  | <li><p><a href="https://en.wikipedia.org/wiki/Measure_(data_warehouse)">https://en.wikipedia.org/wiki/Measure_(data_warehouse)</a></p></li> | 
|  | </ul> | 
|  |  | 
|  | <p>For tips on how to use rollup in your Druid schema designs, see the <a href="schema-design.html#rollup">schema design</a> page.</p> | 
|  |  | 
|  | <h3 id="roll-up-modes">Roll-up modes</h3> | 
|  |  | 
|  | <p>Druid supports two roll-up modes: <em>perfect roll-up</em> and <em>best-effort roll-up</em>. In perfect roll-up mode, Druid guarantees that input data are perfectly aggregated at ingestion time. In best-effort roll-up mode, input data might not be perfectly aggregated, so multiple segments can hold rows that would belong to the same segment under perfect roll-up because they have the same dimension values and their timestamps fall into the same interval.</p> | 
|  |  | 
|  | <p>The perfect roll-up mode encompasses an additional preprocessing step to determine intervals and shardSpecs before actual data ingestion if they are not specified in the ingestionSpec. This preprocessing step usually scans the entire input data which might increase the ingestion time. The <a href="../ingestion/hadoop.html">Hadoop indexing task</a> always runs with this perfect roll-up mode.</p> | 
|  |  | 
|  | <p>In contrast, the best-effort roll-up mode doesn't require any preprocessing step, but the size of ingested data might be larger than that of the perfect roll-up. All types of <a href="../ingestion/stream-ingestion.html">streaming indexing (e.g., Kafka indexing service)</a> run with this mode.</p> | 
|  |  | 
|  | <p>Finally, the <a href="../ingestion/native_tasks.html">native index task</a> supports both modes, and you can choose whichever one fits your application.</p> | 
|  |  | 
|  | <h2 id="data-maintenance">Data maintenance</h2> | 
|  |  | 
|  | <h3 id="inserts-and-overwrites">Inserts and overwrites</h3> | 
|  |  | 
|  | <p>Druid can insert new data into an existing datasource by appending new segments to existing segment sets. It can also add new data by merging an existing set of segments with new data and overwriting the original set.</p> | 
|  |  | 
|  | <p>Druid does not support single-record updates by primary key.</p> | 
|  |  | 
|  | <p>Updates are described further at <a href="../ingestion/update-existing-data.html">update existing data</a>.</p> | 
|  |  | 
|  | <h3 id="compaction">Compaction</h3> | 
|  |  | 
|  | <p>Compaction is a type of overwrite operation, which reads an existing set of segments, combines them into a new set with larger but fewer segments, and overwrites the original set with the new compacted set, without changing the data that is stored.</p> | 
|  |  | 
|  | <p>For performance reasons, it is sometimes beneficial to compact a set of segments into a set of larger but fewer segments, as there is some per-segment processing and memory overhead in both the ingestion and querying paths.</p> | 
|  |  | 
|  | <p>For compaction documentation, please see <a href="../ingestion/tasks.html">tasks</a>.</p> | 
|  |  | 
|  | <h3 id="retention-and-tiering">Retention and Tiering</h3> | 
|  |  | 
|  | <p>Druid supports retention rules, which are used to define intervals of time where data should be preserved, and intervals where data should be discarded.</p> | 
|  |  | 
|  | <p>Druid also supports separating Historical processes into tiers, and the retention rules can be configured to assign data for specific intervals to specific tiers.</p> | 
|  |  | 
|  | <p>These features are useful for performance/cost management; a common use case is separating Historical processes into a "hot" tier and a "cold" tier.</p> | 
|  |  | 
|  | <p>For more information, please see <a href="../operations/rule-configuration.html">Load rules</a>.</p> | 
|  |  | 
|  | <h3 id="deletes">Deletes</h3> | 
|  |  | 
|  | <p>Druid supports permanent deletion of segments that are in an "unused" state (see the <a href="#segment-states">Segment states</a> section above).</p> | 
|  |  | 
|  | <p>The Kill Task deletes unused segments within a specified interval from metadata storage and deep storage.</p> | 
|  |  | 
|  | <p>For more information, please see <a href="../ingestion/tasks.html#kill-task">Kill Task</a>.</p> | 
|  |  | 
|  | </div> | 
|  | <div class="col-md-3"> | 
|  | <div class="searchbox"> | 
|  | <gcse:searchbox-only></gcse:searchbox-only> | 
|  | </div> | 
|  | <div id="toc" class="nav toc hidden-print"> | 
|  | </div> | 
|  | </div> | 
|  | </div> | 
|  | </div> | 
|  |  | 
|  | <!-- Start page_footer include --> | 
|  | <footer class="druid-footer"> | 
|  | <div class="container"> | 
|  | <div class="text-center"> | 
|  | <p> | 
|  | <a href="/technology">Technology</a> ·  | 
|  | <a href="/use-cases">Use Cases</a> ·  | 
|  | <a href="/druid-powered">Powered by Druid</a> ·  | 
|  | <a href="/docs/latest">Docs</a> ·  | 
|  | <a href="/community/">Community</a> ·  | 
|  | <a href="/downloads.html">Download</a> ·  | 
|  | <a href="/faq">FAQ</a> | 
|  | </p> | 
|  | </div> | 
|  | <div class="text-center"> | 
|  | <a title="Join the user group" href="https://groups.google.com/forum/#!forum/druid-user" target="_blank"><span class="fa fa-comments"></span></a> ·  | 
|  | <a title="Follow Druid" href="https://twitter.com/druidio" target="_blank"><span class="fab fa-twitter"></span></a> ·  | 
|  | <a title="GitHub" href="https://github.com/apache/druid" target="_blank"><span class="fab fa-github"></span></a> | 
|  | </div> | 
|  | <div class="text-center license"> | 
|  | Copyright © 2020 <a href="https://www.apache.org/" target="_blank">Apache Software Foundation</a>.<br> | 
|  | Except where otherwise noted, licensed under <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</a>.<br> | 
|  | Apache Druid, Druid, and the Druid logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries. | 
|  | </div> | 
|  | </div> | 
|  | </footer> | 
|  |  | 
|  | <script async src="https://www.googletagmanager.com/gtag/js?id=UA-131010415-1"></script> | 
|  | <script> | 
|  | // Reuse the global dataLayer array if one already exists; otherwise start an empty one. | 
|  | window.dataLayer = window.dataLayer || []; | 
|  | // Queues a command by pushing the raw `arguments` object (not a copied array) onto dataLayer. | 
|  | function gtag(){dataLayer.push(arguments);} | 
|  | // Queue a 'js' command carrying the current time, then a 'config' command for the same ID | 
|  | // that appears in the gtag/js loader URL above. | 
|  | gtag('js', new Date()); | 
|  | gtag('config', 'UA-131010415-1'); | 
|  | </script> | 
|  | <script> | 
|  | function trackDownload(type, url) { | 
|  | ga('send', 'event', 'download', type, url); | 
|  | } | 
|  | </script> | 
|  | <script src="//code.jquery.com/jquery.min.js"></script> | 
|  | <script src="//maxcdn.bootstrapcdn.com/bootstrap/3.2.0/js/bootstrap.min.js"></script> | 
|  | <script src="/assets/js/druid.js"></script> | 
|  | <!-- stop page_footer include --> | 
|  |  | 
|  |  | 
|  | <script> | 
|  | $(function() { | 
|  | $(".toc").load("/docs/0.14.2-incubating/toc.html"); | 
|  |  | 
|  | // There is no way to tell when .gsc-input will be async loaded into the page so just try to set a placeholder until it works | 
|  | var tries = 0; | 
|  | var timer = setInterval(function() { | 
|  | tries++; | 
|  | if (tries > 300) clearInterval(timer); | 
|  | var searchInput = $('input.gsc-input'); | 
|  | if (searchInput.length) { | 
|  | searchInput.attr('placeholder', 'Search'); | 
|  | clearInterval(timer); | 
|  | } | 
|  | }, 100); | 
|  | }); | 
|  | </script> | 
|  | </body> | 
|  | </html> |