blob: 4d026fa2ca2062ccc539ef50f6112924d7e345a8 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="Apache Druid">
<meta name="keywords" content="druid,kafka,database,analytics,streaming,real-time,real time,apache,open source">
<meta name="author" content="Apache Software Foundation">
<title>Druid | Retaining or Automatically Dropping Data</title>
<link rel="alternate" type="application/atom+xml" href="/feed">
<link rel="shortcut icon" href="/img/favicon.png">
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.7.2/css/all.css" integrity="sha384-fnmOCqbTlWIlj8LyTjo7mOUStjsKC4pOpQbqyi7RrhN7udi9RwhKkMHpvLbHG9Sr" crossorigin="anonymous">
<link href='//fonts.googleapis.com/css?family=Open+Sans+Condensed:300,700,300italic|Open+Sans:300italic,400italic,600italic,400,300,600,700' rel='stylesheet' type='text/css'>
<link rel="stylesheet" href="/css/bootstrap-pure.css?v=1.1">
<link rel="stylesheet" href="/css/base.css?v=1.1">
<link rel="stylesheet" href="/css/header.css?v=1.1">
<link rel="stylesheet" href="/css/footer.css?v=1.1">
<link rel="stylesheet" href="/css/syntax.css?v=1.1">
<link rel="stylesheet" href="/css/docs.css?v=1.1">
<script>
// Asynchronously load the Google Custom Search script (cse.js), passing the
// `cx` value below as a query parameter, and insert the new <script> element
// ahead of the first <script> already present in the document.
(function() {
  var cx = '000162378814775985090:molvbm0vggm';
  var gcse = document.createElement('script');
  gcse.type = 'text/javascript';
  gcse.async = true;
  // Always fetch over HTTPS. Mirroring the page's protocol would pull this
  // third-party script over plain HTTP on http:// pages (and on file://),
  // where it could be tampered with in transit.
  gcse.src = 'https://cse.google.com/cse.js?cx=' + cx;
  var s = document.getElementsByTagName('script')[0];
  s.parentNode.insertBefore(gcse, s);
})();
</script>
</head>
<body>
<!-- Start page_header include -->
<script src="//ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
<div class="top-navigator">
<div class="container">
<div class="left-cont">
<a class="logo" href="/"><span class="druid-logo"></span></a>
</div>
<div class="right-cont">
<ul class="links">
<li class=""><a href="/technology">Technology</a></li>
<li class=""><a href="/use-cases">Use Cases</a></li>
<li class=""><a href="/druid-powered">Powered By</a></li>
<li class=""><a href="/docs/latest/design/">Docs</a></li>
<li class=""><a href="/community/">Community</a></li>
<li class="header-dropdown">
<a>Apache</a>
<div class="header-dropdown-menu">
<a href="https://www.apache.org/" target="_blank">Foundation</a>
<a href="https://www.apache.org/events/current-event" target="_blank">Events</a>
<a href="https://www.apache.org/licenses/" target="_blank">License</a>
<a href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a>
<a href="https://www.apache.org/security/" target="_blank">Security</a>
<a href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Sponsorship</a>
</div>
</li>
<li class=" button-link"><a href="/downloads.html">Download</a></li>
</ul>
</div>
</div>
<div class="action-button menu-icon">
<span class="fa fa-bars"></span> MENU
</div>
<div class="action-button menu-icon-close">
<span class="fa fa-times"></span> MENU
</div>
</div>
<script type="text/javascript">
// jQuery handles for the header link container and its two toggle buttons.
var $menu = $('.right-cont');
var $menuIcon = $('.menu-icon');
var $menuIconClose = $('.menu-icon-close');

// Fade the menu in and swap the "open" button for the "close" button.
function showMenu() {
  $menu.fadeIn(100);
  $menuIcon.fadeOut(100);
  $menuIconClose.fadeIn(100);
}

// Fade the menu out and swap the "close" button back for the "open" button.
function hideMenu() {
  $menu.fadeOut(100);
  $menuIconClose.fadeOut(100);
  $menuIcon.fadeIn(100);
}

$menuIcon.on('click', showMenu);
$menuIconClose.on('click', hideMenu);

// On every window resize, reset the header to its default state for the
// current width: at 840px and wider the menu is shown and both buttons are
// hidden; below 840px the menu is hidden and only the "open" button is shown.
$(window).on('resize', function() {
  var isWide = $(window).width() >= 840;
  $menu[isWide ? 'fadeIn' : 'fadeOut'](100);
  $menuIcon[isWide ? 'fadeOut' : 'fadeIn'](100);
  $menuIconClose.fadeOut(100);
});
</script>
<!-- Stop page_header include -->
<div class="container doc-container">
<p> Looking for the <a href="/docs/0.20.0/">latest stable documentation</a>?</p>
<div class="row">
<div class="col-md-9 doc-content">
<p>
<a class="btn btn-default btn-xs visible-xs-inline-block visible-sm-inline-block" href="#toc">Table of Contents</a>
</p>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<h1 id="retaining-or-automatically-dropping-data">Retaining or Automatically Dropping Data</h1>
<p>Coordinator nodes use rules to determine what data should be loaded to or dropped from the cluster. Rules are used for data retention and query execution, and are set on the coordinator console (http://coordinator_ip:port).</p>
<p>There are three types of rules, i.e., load rules, drop rules, and broadcast rules. Load rules indicate how segments should be assigned to different historical node tiers and how many replicas of a segment should exist in each tier.
Drop rules indicate when segments should be dropped entirely from the cluster. Finally, broadcast rules indicate how segments of different data sources should be co-located in historical nodes.</p>
<p>The coordinator loads a set of rules from the metadata storage. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The coordinator will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule.</p>
<p>Note: It is recommended that the coordinator console is used to configure rules. However, the coordinator node does have HTTP endpoints to programmatically configure rules.</p>
<p>When a rule is updated, the change may not be reflected until the next time the coordinator runs. This will be fixed in the near future.</p>
<h2 id="load-rules">Load Rules</h2>
<p>Load rules indicate how many replicas of a segment should exist in a server tier. <strong>Please note</strong>: If a Load rule is used to retain only data from a certain interval or period, it must be accompanied by a Drop rule. If a Drop rule is not included, data not within the specified interval or period will be retained by the default rule (loadForever).</p>
<h3 id="forever-load-rule">Forever Load Rule</h3>
<p>Forever load rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;loadForever&quot;</span><span class="p">,</span>
<span class="nt">&quot;tieredReplicants&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="nt">&quot;hot&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
<span class="nt">&quot;_default_tier&quot;</span> <span class="p">:</span> <span class="mi">1</span>
<span class="p">}</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;loadForever&quot;</li>
<li><code>tieredReplicants</code> - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.</li>
</ul>
<h3 id="interval-load-rule">Interval Load Rule</h3>
<p>Interval load rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;loadByInterval&quot;</span><span class="p">,</span>
<span class="nt">&quot;interval&quot;</span><span class="p">:</span> <span class="s2">&quot;2012-01-01/2013-01-01&quot;</span><span class="p">,</span>
<span class="nt">&quot;tieredReplicants&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="nt">&quot;hot&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
<span class="nt">&quot;_default_tier&quot;</span> <span class="p">:</span> <span class="mi">1</span>
<span class="p">}</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;loadByInterval&quot;</li>
<li><code>interval</code> - A JSON Object representing ISO-8601 Intervals</li>
<li><code>tieredReplicants</code> - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.</li>
</ul>
<h3 id="period-load-rule">Period Load Rule</h3>
<p>Period load rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;loadByPeriod&quot;</span><span class="p">,</span>
<span class="nt">&quot;period&quot;</span> <span class="p">:</span> <span class="s2">&quot;P1M&quot;</span><span class="p">,</span>
<span class="nt">&quot;tieredReplicants&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="nt">&quot;hot&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
<span class="nt">&quot;_default_tier&quot;</span> <span class="p">:</span> <span class="mi">1</span>
<span class="p">}</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;loadByPeriod&quot;</li>
<li><code>period</code> - A JSON Object representing ISO-8601 Periods</li>
<li><code>tieredReplicants</code> - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.</li>
</ul>
<p>The interval of a segment will be compared against the specified period. The rule matches if the period overlaps the interval.</p>
<h2 id="drop-rules">Drop Rules</h2>
<p>Drop rules indicate when segments should be dropped from the cluster.</p>
<h3 id="forever-drop-rule">Forever Drop Rule</h3>
<p>Forever drop rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;dropForever&quot;</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;dropForever&quot;</li>
</ul>
<p>All segments that match this rule are dropped from the cluster.</p>
<h3 id="interval-drop-rule">Interval Drop Rule</h3>
<p>Interval drop rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;dropByInterval&quot;</span><span class="p">,</span>
<span class="nt">&quot;interval&quot;</span> <span class="p">:</span> <span class="s2">&quot;2012-01-01/2013-01-01&quot;</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;dropByInterval&quot;</li>
<li><code>interval</code> - A JSON Object representing ISO-8601 Intervals</li>
</ul>
<p>A segment is dropped if the interval contains the interval of the segment.</p>
<h3 id="period-drop-rule">Period Drop Rule</h3>
<p>Period drop rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;dropByPeriod&quot;</span><span class="p">,</span>
<span class="nt">&quot;period&quot;</span> <span class="p">:</span> <span class="s2">&quot;P1M&quot;</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;dropByPeriod&quot;</li>
<li><code>period</code> - A JSON Object representing ISO-8601 Periods</li>
</ul>
<p>The interval of a segment will be compared against the specified period. The period is from some time in the past to the current time. The rule matches if the period contains the interval.</p>
<h2 id="broadcast-rules">Broadcast Rules</h2>
<p>Broadcast rules indicate how segments of different data sources should be co-located in historical nodes.
Once a broadcast rule is configured for a data source, all segments of the data source are broadcast to the servers holding <em>any segments</em> of the co-located data sources.</p>
<h3 id="forever-broadcast-rule">Forever Broadcast Rule</h3>
<p>Forever broadcast rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;broadcastForever&quot;</span><span class="p">,</span>
<span class="nt">&quot;colocatedDataSources&quot;</span> <span class="p">:</span> <span class="p">[</span> <span class="s2">&quot;target_source1&quot;</span><span class="p">,</span> <span class="s2">&quot;target_source2&quot;</span> <span class="p">]</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;broadcastForever&quot;</li>
<li><code>colocatedDataSources</code> - A JSON List containing data source names to be co-located. <code>null</code> or an empty list means broadcasting to every node in the cluster.</li>
</ul>
<h3 id="interval-broadcast-rule">Interval Broadcast Rule</h3>
<p>Interval broadcast rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;broadcastByInterval&quot;</span><span class="p">,</span>
<span class="nt">&quot;colocatedDataSources&quot;</span> <span class="p">:</span> <span class="p">[</span> <span class="s2">&quot;target_source1&quot;</span><span class="p">,</span> <span class="s2">&quot;target_source2&quot;</span> <span class="p">],</span>
<span class="nt">&quot;interval&quot;</span> <span class="p">:</span> <span class="s2">&quot;2012-01-01/2013-01-01&quot;</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;broadcastByInterval&quot;</li>
<li><code>colocatedDataSources</code> - A JSON List containing data source names to be co-located. <code>null</code> or an empty list means broadcasting to every node in the cluster.</li>
<li><code>interval</code> - A JSON Object representing ISO-8601 Intervals. Only the segments of the interval will be broadcast.</li>
</ul>
<h3 id="period-broadcast-rule">Period Broadcast Rule</h3>
<p>Period broadcast rules are of the form:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json"><span></span><span class="p">{</span>
<span class="nt">&quot;type&quot;</span> <span class="p">:</span> <span class="s2">&quot;broadcastByPeriod&quot;</span><span class="p">,</span>
<span class="nt">&quot;colocatedDataSources&quot;</span> <span class="p">:</span> <span class="p">[</span> <span class="s2">&quot;target_source1&quot;</span><span class="p">,</span> <span class="s2">&quot;target_source2&quot;</span> <span class="p">],</span>
<span class="nt">&quot;period&quot;</span> <span class="p">:</span> <span class="s2">&quot;P1M&quot;</span>
<span class="p">}</span>
</code></pre></div>
<ul>
<li><code>type</code> - this should always be &quot;broadcastByPeriod&quot;</li>
<li><code>colocatedDataSources</code> - A JSON List containing data source names to be co-located. <code>null</code> or an empty list means broadcasting to every node in the cluster.</li>
<li><code>period</code> - A JSON Object representing ISO-8601 Periods</li>
</ul>
<p>The interval of a segment will be compared against the specified period. The period is from some time in the past to the current time. The rule matches if the period contains the interval.</p>
<div class="note caution">
Broadcast rules don't guarantee that segments of the data sources are always co-located because segments for the colocated data sources are not loaded together atomically.
If you want to always co-locate the segments of some data sources together, it is recommended to leave colocatedDataSources empty.
</div>
<h1 id="permanently-deleting-data">Permanently Deleting Data</h1>
<p>Druid can fully drop data from the cluster, wipe the metadata store entry, and remove the data from deep storage for any segments that are
marked as unused (segments dropped from the cluster via rules are always marked as unused). You can submit a <a href="../ingestion/tasks.html">kill task</a> to the <a href="../design/indexing-service.html">indexing service</a> to do this.</p>
<h1 id="reloading-dropped-data">Reloading Dropped Data</h1>
<p>Data that has been dropped from a Druid cluster cannot be reloaded using only rules. To reload dropped data in Druid, you must first set your retention period (e.g., by changing the retention period from 1 month to 2 months), and
then enable the datasource in the Druid coordinator console, or through the Druid coordinator endpoints.</p>
</div>
<div class="col-md-3">
<div class="searchbox">
<gcse:searchbox-only></gcse:searchbox-only>
</div>
<div id="toc" class="nav toc hidden-print">
</div>
</div>
</div>
</div>
<!-- Start page_footer include -->
<footer class="druid-footer">
<div class="container">
<div class="text-center">
<p>
<a href="/technology">Technology</a>&ensp;·&ensp;
<a href="/use-cases">Use Cases</a>&ensp;·&ensp;
<a href="/druid-powered">Powered by Druid</a>&ensp;·&ensp;
<a href="/docs/latest/">Docs</a>&ensp;·&ensp;
<a href="/community/">Community</a>&ensp;·&ensp;
<a href="/downloads.html">Download</a>&ensp;·&ensp;
<a href="/faq">FAQ</a>
</p>
</div>
<div class="text-center">
<a title="Join the user group" href="https://groups.google.com/forum/#!forum/druid-user" target="_blank"><span class="fa fa-comments"></span></a>&ensp;·&ensp;
<a title="Follow Druid" href="https://twitter.com/druidio" target="_blank"><span class="fab fa-twitter"></span></a>&ensp;·&ensp;
<a title="GitHub" href="https://github.com/apache/druid" target="_blank"><span class="fab fa-github"></span></a>
</div>
<div class="text-center license">
Copyright © 2020 <a href="https://www.apache.org/" target="_blank">Apache Software Foundation</a>.<br>
Except where otherwise noted, licensed under <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</a>.<br>
Apache Druid, Druid, and the Druid logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
</div>
</div>
</footer>
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-131010415-1"></script>
<script>
// Reuse window.dataLayer if it already exists; otherwise start it as an empty array.
window.dataLayer = window.dataLayer || [];
// Queue a command by pushing the raw `arguments` object (not a copied array) onto dataLayer.
// NOTE(review): presumably the gtag.js library loaded by the preceding <script> consumes this queue — confirm.
function gtag(){dataLayer.push(arguments);}
// Queue the 'js' command with the current time.
gtag('js', new Date());
// Queue the 'config' command for ID UA-131010415-1.
gtag('config', 'UA-131010415-1');
</script>
<script>
/**
 * Report a download as an analytics event.
 *
 * This page loads gtag.js, so the event is sent through gtag(). The legacy
 * analytics.js ga() command queue is used only as a fallback when gtag is
 * not defined. If neither function exists (for example, when analytics
 * scripts are blocked), the call does nothing instead of throwing a
 * ReferenceError.
 *
 * @param {string} type - Sent as the event action.
 * @param {string} url  - Sent as the event label.
 */
function trackDownload(type, url) {
  if (typeof gtag === 'function') {
    // gtag.js equivalent of ga('send', 'event', 'download', type, url).
    gtag('event', type, {
      'event_category': 'download',
      'event_label': url
    });
  } else if (typeof ga === 'function') {
    ga('send', 'event', 'download', type, url);
  }
}
</script>
<script src="//code.jquery.com/jquery.min.js"></script>
<script src="//maxcdn.bootstrapcdn.com/bootstrap/3.2.0/js/bootstrap.min.js"></script>
<script src="/assets/js/druid.js"></script>
<!-- stop page_footer include -->
<script>
// Runs once the DOM is ready: fills the table-of-contents container and labels
// the search input that is injected into the page asynchronously.
$(function() {
  $(".toc").load("/docs/0.13.0-incubating/toc.html");

  // There is no signal for when input.gsc-input is added to the page, so poll
  // every 100ms until it appears, giving up once the attempt count passes 300.
  var attempts = 0;
  var pollId = setInterval(function() {
    attempts += 1;
    if (attempts > 300) {
      clearInterval(pollId);
    }

    var $searchField = $('input.gsc-input');
    if ($searchField.length === 0) {
      return;
    }

    $searchField.attr('placeholder', 'Search');
    clearInterval(pollId);
  }, 100);
});
</script>
</body>
</html>