blob: b5682dd3c6312b42ef14dce7e25055d612dc10fb [file] [log] [blame]
<!doctype html>
<html class="docs-version-0.11.0 docs-custom-styles" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Docusaurus v2.0.0-beta.14">
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Hudi: User-Facing Analytics RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Hudi: User-Facing Analytics Atom Feed">
<link rel="alternate" type="application/json" href="/blog/feed.json" title="Apache Hudi: User-Facing Analytics JSON Feed">
<link rel="search" type="application/opensearchdescription+xml" title="Apache Hudi" href="/opensearch.xml">
<link rel="alternate" type="application/rss+xml" href="/videos/rss.xml" title="Apache Hudi RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/videos/atom.xml" title="Apache Hudi Atom Feed">
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Comfortaa|Ubuntu|Roboto|Source+Code+Pro">
<link rel="stylesheet" href="https://at-ui.github.io/feather-font/css/iconfont.css"><title data-react-helmet="true">Clustering | Apache Hudi</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" property="og:url" content="https://hudi.apache.org/docs/0.11.0/clustering"><meta data-react-helmet="true" name="docsearch:language" content="en"><meta data-react-helmet="true" name="docsearch:version" content="0.11.0"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="docs-default-0.11.0"><meta data-react-helmet="true" name="keywords" content="apache hudi, data lake, lakehouse, big data, apache spark, apache flink, presto, trino, analytics, data engineering"><meta data-react-helmet="true" property="og:title" content="Clustering | Apache Hudi"><meta data-react-helmet="true" name="description" content="Background"><meta data-react-helmet="true" property="og:description" content="Background"><link data-react-helmet="true" rel="icon" href="/assets/images/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://hudi.apache.org/docs/0.11.0/clustering"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.11.0/clustering" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/cn/docs/0.11.0/clustering" hreflang="cn"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.11.0/clustering" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://BH4D9OD16A-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/assets/css/styles.052701b1.css">
<link rel="preload" href="/assets/js/runtime~main.46626068.js" as="script">
<link rel="preload" href="/assets/js/main.19b28340.js" as="script">
</head>
<body>
<script>
/*
 * Pre-render bootstrap: sets two attributes on the root <html> element from
 * values kept in localStorage.
 *   - data-theme: the stored "theme" value, or "light" when nothing is stored.
 *   - data-announcement-bar-initially-dismissed: whether the stored
 *     "docusaurus.announcement.dismiss" value is exactly "true".
 * NOTE(review): presumably runs inline, ahead of the app markup, so the page
 * is styled correctly on first paint - confirm against the generator.
 */
(function () {
  var rootElement = document.documentElement;

  // Reading localStorage can throw; in that case keep the null default.
  var savedTheme = null;
  try {
    savedTheme = localStorage.getItem("theme");
  } catch (ignored) {}

  if (savedTheme !== null) {
    rootElement.setAttribute("data-theme", savedTheme);
  } else {
    rootElement.setAttribute("data-theme", "light");
  }

  // Same defensive read for the announcement flag; defaults to false on error.
  var barDismissed = false;
  try {
    barDismissed = "true" === localStorage.getItem("docusaurus.announcement.dismiss");
  } catch (ignored) {}

  rootElement.setAttribute("data-announcement-bar-initially-dismissed", barDismissed);
})();
</script><div id="__docusaurus">
<div><a href="#" class="skipToContent_OuoZ">Skip to main content</a></div><div class="announcementBar_axC9" role="banner"><div class="announcementBarContent_6uhP">⭐️ If you like <b>Apache Hudi</b>, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/hudi"><b>GitHub!<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-github" viewBox="0 -2 16 16"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg></b></a></div></div><nav class="navbar navbar--fixed-top navbarWrapper_UIa0"><div class="navbar__inner navbarInnerStyle_FVs7"><img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8f594acf-9b77-44fb-9475-3e82ead1910c" width="0" height="0" alt=""><img referrerpolicy="no-referrer-when-downgrade" src="https://analytics.apache.org/matomo.php?idsite=47&amp;rec=1" width="0" height="0" alt=""><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo navbarLogo_Bz6n"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" 
alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><a class="navbar__item navbar__link navbarFontSize_x1wi" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3 navbarFontSize_x1wi"><div class="labelWrapperDropdown_Mqbj">Learn<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/talks"><div class="labelWrapperDropdown_Mqbj">Talks</div></a></li><li><a class="dropdown__link" href="/videos"><div class="labelWrapperDropdown_Mqbj">Video Guides</div></a></li><li><a class="dropdown__link" href="/docs/faq"><div class="labelWrapperDropdown_Mqbj">FAQ</div></a></li><li><a class="dropdown__link" href="/tech-specs"><div class="labelWrapperDropdown_Mqbj">Tech Specs</div></a></li><li><a class="dropdown__link" href="/tech-specs-1point0"><div class="labelWrapperDropdown_Mqbj">Tech Specs 1.0</div></a></li><li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Technical Wiki<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3 navbarFontSize_x1wi"><div class="labelWrapperDropdown_Mqbj">Contribute<svg width="10" height="6" viewBox="0 0 10 6" fill="none" 
xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/contribute/how-to-contribute"><div class="labelWrapperDropdown_Mqbj">How to Contribute</div></a></li><li><a class="dropdown__link" href="/contribute/developer-setup"><div class="labelWrapperDropdown_Mqbj">Developer Setup</div></a></li><li><a class="dropdown__link" href="/contribute/rfc-process"><div class="labelWrapperDropdown_Mqbj">RFC Process</div></a></li><li><a class="dropdown__link" href="/contribute/report-security-issues"><div class="labelWrapperDropdown_Mqbj">Report Security Issues</div></a></li><li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Report Issues<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3 navbarFontSize_x1wi"><div class="labelWrapperDropdown_Mqbj">Community<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/get-involved"><div class="labelWrapperDropdown_Mqbj">Get Involved</div></a></li><li><a class="dropdown__link" href="/community/syncs"><div class="labelWrapperDropdown_Mqbj">Community Syncs</div></a></li><li><a 
class="dropdown__link" href="/community/office_hours"><div class="labelWrapperDropdown_Mqbj">Office Hours</div></a></li><li><a class="dropdown__link" href="/community/team"><div class="labelWrapperDropdown_Mqbj">Team</div></a></li></ul></div><a class="navbar__item navbar__link navbarFontSize_x1wi" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a><a class="navbar__item navbar__link navbarFontSize_x1wi" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who&#x27;s Using</div></a><a class="navbar__item navbar__link navbarFontSize_x1wi" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a><a class="navbar__item navbar__link navbarFontSize_x1wi" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link downloadLinkDropdownHide_aDP3" href="/docs/0.11.0/overview"><div class="labelWrapperDropdown_Mqbj">0.11.0<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/clustering"><div class="labelWrapperDropdown_Mqbj">Current</div></a></li><li><a class="dropdown__link" href="/docs/clustering"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li><a class="dropdown__link" href="/docs/0.14.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li><a class="dropdown__link" href="/docs/0.13.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li><a class="dropdown__link" href="/docs/0.13.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li><a class="dropdown__link" href="/docs/0.12.3/clustering"><div 
class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li><a class="dropdown__link" href="/docs/0.12.2/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li><a class="dropdown__link" href="/docs/0.12.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li><a class="dropdown__link" href="/docs/0.12.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li><a class="dropdown__link" href="/docs/0.11.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/0.11.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li><a class="dropdown__link" href="/docs/0.10.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li><a class="dropdown__link" href="/docs/0.10.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li><a class="dropdown__link" href="/docs/0.9.0/overview"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li><a class="dropdown__link" href="/docs/0.8.0/overview"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li><a class="dropdown__link" href="/docs/0.7.0/overview"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li><a class="dropdown__link" href="/docs/0.6.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li><a class="dropdown__link" href="/docs/0.5.3/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li><a class="dropdown__link" href="/docs/0.5.2/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li><a class="dropdown__link" href="/docs/0.5.1/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li><a class="dropdown__link" href="/docs/0.5.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable 
dropdown--right"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>English</span></span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><g clip-path="url(#a)"><path d="M14 6.457a6.842 6.842 0 0 0-7-6.02 6.843 6.843 0 0 0-7 6.02v1.085a6.843 6.843 0 0 0 7 6.02 6.843 6.843 0 0 0 7-6.02V6.457Zm-1.094 0h-2.625a9.92 9.92 0 0 0-.376-2.222 6.65 6.65 0 0 0 1.531-.875 5.25 5.25 0 0 1 1.444 3.097h.026Zm-8.032 0a8.479 8.479 0 0 1 .324-1.872 7.376 7.376 0 0 0 3.63 0c.175.61.284 1.239.325 1.872h-4.28Zm4.305 1.085a8.391 8.391 0 0 1-.324 
1.873 7.464 7.464 0 0 0-3.658 0 8.479 8.479 0 0 1-.323-1.873h4.305Zm.35-4.375A10.342 10.342 0 0 0 8.75 1.75c.627.194 1.218.49 1.75.875a5.748 5.748 0 0 1-.998.577l.027-.035ZM7.254 1.54A8.75 8.75 0 0 1 8.46 3.552c-.48.11-.97.165-1.461.167-.492-.001-.982-.057-1.461-.167.308-.722.715-1.4 1.207-2.012h.508ZM4.498 3.202a5.748 5.748 0 0 1-.998-.577 6.029 6.029 0 0 1 1.75-.875c-.294.46-.546.947-.753 1.452Zm-1.873.15c.47.358.984.652 1.531.874A9.625 9.625 0 0 0 3.78 6.45H1.155a5.25 5.25 0 0 1 1.47-3.098ZM1.12 7.541h2.625c.038.753.164 1.5.376 2.223a6.649 6.649 0 0 0-1.531.875 5.25 5.25 0 0 1-1.47-3.098Zm3.377 3.255c.207.506.459.992.753 1.453a6.03 6.03 0 0 1-1.75-.875c.312-.226.646-.419.997-.578Zm2.25 1.663a8.594 8.594 0 0 1-1.208-2.013 6.501 6.501 0 0 1 2.922 0 8.54 8.54 0 0 1-1.207 2.013h-.508Zm2.755-1.663c.367.156.716.35 1.042.578a6.338 6.338 0 0 1-1.75.875c.275-.464.512-.95.708-1.453Zm1.873-.148a6.647 6.647 0 0 0-1.531-.875 9.45 9.45 0 0 0 .376-2.223h2.625a5.25 5.25 0 0 1-1.47 3.098Z" fill="#1C1E21"></path></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h14v14H0z"></path></clipPath></defs></svg></div></a><ul class="dropdown__menu"><li><a href="/docs/0.11.0/clustering" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active"><div class="labelWrapperDropdown_Mqbj">English</div></a></li><li><a href="/cn/docs/0.11.0/clustering" target="_self" rel="noopener noreferrer" class="dropdown__link"><div class="labelWrapperDropdown_Mqbj">Chinese</div></a></li></ul></div><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a><a 
href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a><div class="searchBox_fBfG"><div role="button" class="searchButton_g9-U" aria-label="Search"><span class="searchText_RI6l">Search</span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><circle cx="6.864" cy="6.864" r="5.243" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></circle><path d="m10.51 10.783 2.056 2.05" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><button type="button" class="clean-btn navbar-sidebar__close"><svg viewBox="0 0 15 15" width="21" height="21"><g stroke="var(--ifm-color-emphasis-600)" stroke-width="1.2"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><div class="navbar-sidebar__items"><div 
class="navbar-sidebar__item menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Learn</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Contribute</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Community</div></a></li><li class="menu__list-item"><a class="menu__link" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a></li><li class="menu__list-item"><a class="menu__link" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who&#x27;s Using</div></a></li><li class="menu__list-item"><a class="menu__link" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a></li><li class="menu__list-item"><a class="menu__link" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></li><li class="menu__list-item"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Versions</div></a><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/next/clustering"><div class="labelWrapperDropdown_Mqbj">Current</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/clustering"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.14.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.13.1/clustering"><div 
class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.13.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.3/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.2/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.11.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active" href="/docs/0.11.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.10.1/clustering"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.10.0/clustering"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.9.0/overview"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.8.0/overview"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.7.0/overview"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.6.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li class="menu__list-item"><a class="menu__link" 
href="/docs/0.5.3/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.2/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.1/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 
.078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>Languages</span></span></div></a></li><li class="menu__list-item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="menu__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="menu__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="menu__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="menu__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="menu__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a></li></ul></div><div class="navbar-sidebar__item menu"><button type="button" class="clean-btn navbar-sidebar__back">← Back to main menu</button></div></div></div></nav><div class="main-wrapper docs-wrapper docs-doc-page"><div class="docPage_GMj9"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_i9tI" type="button"></button><aside class="docSidebarContainer_k0Pq"><div class="sidebar_a3j0"><nav class="menu thin-scrollbar menu_cyFh menuWithAnnouncementBar_+O1J"><ul class="theme-doc-sidebar-menu 
menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.11.0/overview">Overview</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.11.0/quick-start-guide">Quick Start</a></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/quick-start-guide">Spark Guide</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/flink-quick-start-guide">Flink Guide</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/docker_demo">Docker Demo</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.11.0/timeline">Concepts</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.11.0/table_management">How To</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--active hasHref_TwRn" href="/docs/0.11.0/migration_guide">Services</a></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li 
class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/migration_guide">Bootstrapping</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/compaction">Compaction</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/docs/0.11.0/clustering">Clustering</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/metadata_indexing">Metadata Indexing</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/hoodie_cleaner">Cleaning</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/transforms">Transformers</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/markers">Marker Mechanism</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/file_sizing">File Sizing</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/disaster_recovery">Disaster Recovery</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/0.11.0/snapshot_exporter">Exporter</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" 
href="/docs/0.11.0/precommit_validator">Data Quality</a></li></ul></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.11.0/configurations">Configurations</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.11.0/query_engine_setup">Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.11.0/use_cases">Use Cases</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.11.0/faq">FAQs</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.11.0/privacy">Privacy Policy</a></li></ul></nav></div></aside><main class="docMainContainer_Q970"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_zHA2"><div class="theme-doc-version-banner alert alert--warning margin-bottom--md" role="alert"><div>This is documentation for <!-- -->Apache Hudi<!-- --> <b>0.11.0</b>, which is no longer actively maintained.</div><div class="margin-top--md">For up-to-date documentation, see the <b><a href="/docs/clustering">latest version</a></b> (<!-- -->0.14.1<!-- -->).</div></div><div class="docItemContainer_oiyr"><article><span class="theme-doc-version-badge badge badge--secondary">Version: <!-- -->0.11.0</span><div class="tocCollapsible_aw-L theme-doc-toc-mobile tocMobile_Tx6Y"><button type="button" class="clean-btn tocCollapsibleButton_zr6a">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Clustering</h1></header><h2 class="anchor anchorWithStickyNavbar_y2LR" id="background">Background<a 
class="hash-link" href="#background" title="Direct link to heading"></a></h2><p>Apache Hudi brings stream processing to big data, providing fresh data while being an order of magnitude more efficient than traditional batch processing. In a data lake/warehouse, one of the key trade-offs is between ingestion speed and query performance. Data ingestion typically prefers small files to improve parallelism and make data available to queries as soon as possible. However, query performance degrades with a lot of small files. Also, during ingestion, data is typically co-located based on arrival time. However, the query engines perform better when the data frequently queried is co-located together. In most architectures each of these systems tends to add optimizations independently to improve performance which hits limitations due to un-optimized data layouts. This doc introduces a new kind of table service called clustering <a href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance" target="_blank" rel="noopener noreferrer">[RFC-19]</a> to reorganize data for improved query performance without compromising on ingestion speed.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="clustering-architecture">Clustering Architecture<a class="hash-link" href="#clustering-architecture" title="Direct link to heading"></a></h2><p>At a high level, Hudi provides different operations such as insert/upsert/bulk_insert through its write client API to be able to write data to a Hudi table. To be able to choose a trade-off between file size and ingestion speed, Hudi provides a knob <code>hoodie.parquet.small.file.limit</code> to be able to configure the smallest allowable file size. 
Users are able to configure the small file <a href="https://hudi.apache.org/docs/configurations/#hoodieparquetsmallfilelimit" target="_blank" rel="noopener noreferrer">soft limit</a> to <code>0</code> to force new data to go into a new set of filegroups or set it to a higher value to ensure new data gets “padded” to existing files until it meets that limit that adds to ingestion latencies.</p><p>To be able to support an architecture that allows for fast ingestion without compromising query performance, we have introduced a ‘clustering’ service to rewrite the data to optimize Hudi data lake file layout.</p><p>Clustering table service can run asynchronously or synchronously adding a new action type called “REPLACE”, that will mark the clustering action in the Hudi metadata timeline.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="overall-there-are-2-parts-to-clustering">Overall, there are 2 parts to clustering<a class="hash-link" href="#overall-there-are-2-parts-to-clustering" title="Direct link to heading"></a></h3><ol><li>Scheduling clustering: Create a clustering plan using a pluggable clustering strategy.</li><li>Execute clustering: Process the plan using an execution strategy to create new files and replace old files.</li></ol><h3 class="anchor anchorWithStickyNavbar_y2LR" id="scheduling-clustering">Scheduling clustering<a class="hash-link" href="#scheduling-clustering" title="Direct link to heading"></a></h3><p>Following steps are followed to schedule clustering.</p><ol><li>Identify files that are eligible for clustering: Depending on the clustering strategy chosen, the scheduling logic will identify the files eligible for clustering.</li><li>Group files that are eligible for clustering based on specific criteria. Each group is expected to have data size in multiples of ‘targetFileSize’. Grouping is done as part of ‘strategy’ defined in the plan. 
Additionally, there is an option to put a cap on group size to improve parallelism and avoid shuffling large amounts of data.</li><li>Finally, the clustering plan is saved to the timeline in an avro <a href="https://github.com/apache/hudi/blob/master/hudi-common/src/main/avro/HoodieClusteringPlan.avsc" target="_blank" rel="noopener noreferrer">metadata format</a>.</li></ol><h3 class="anchor anchorWithStickyNavbar_y2LR" id="running-clustering">Running clustering<a class="hash-link" href="#running-clustering" title="Direct link to heading"></a></h3><ol><li>Read the clustering plan and get the ‘clusteringGroups’ that mark the file groups that need to be clustered.</li><li>For each group, we instantiate appropriate strategy class with strategyParams (example: sortColumns) and apply that strategy to rewrite the data.</li><li>Create a “REPLACE” commit and update the metadata in <a href="https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java" target="_blank" rel="noopener noreferrer">HoodieReplaceCommitMetadata</a>.</li></ol><p>Clustering Service builds on Hudi’s MVCC based design to allow for writers to continue to insert new data while clustering action runs in the background to reformat data layout, ensuring snapshot isolation between concurrent readers and writers.</p><p>NOTE: Clustering can only be scheduled for tables / partitions not receiving any concurrent updates. In the future, concurrent updates use-case will be supported as well.</p><p><img alt="Clustering example" src="/assets/images/example_perf_improvement-acd223093d7c84fb6f0a896dcb571737.png">
<em>Figure: Illustrating query performance improvements by clustering</em></p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="setting-up-clustering">Setting up clustering<a class="hash-link" href="#setting-up-clustering" title="Direct link to heading"></a></h3><p>Inline clustering can be setup easily using spark dataframe options. See sample below</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.hudi.QuickstartUtils._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import scala.collection.JavaConversions._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.spark.sql.SaveMode._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.hudi.DataSourceReadOptions._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.hudi.DataSourceWriteOptions._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.hudi.config.HoodieWriteConfig._</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">val df = //generate data frame</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">df.write.format(&quot;org.apache.hudi&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"> options(getQuickstartWriteConfigs).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(PRECOMBINE_FIELD_OPT_KEY, &quot;ts&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(RECORDKEY_FIELD_OPT_KEY, &quot;uuid&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(PARTITIONPATH_FIELD_OPT_KEY, &quot;partitionpath&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(TABLE_NAME, &quot;tableName&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.parquet.small.file.limit&quot;, &quot;0&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.clustering.inline&quot;, &quot;true&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.clustering.inline.max.commits&quot;, &quot;4&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.clustering.plan.strategy.target.file.max.bytes&quot;, &quot;1073741824&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.clustering.plan.strategy.small.file.limit&quot;, &quot;629145600&quot;).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> option(&quot;hoodie.clustering.plan.strategy.sort.columns&quot;, &quot;column1,column2&quot;). 
//optional, if sorting is needed as part of rewriting data</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> mode(Append).</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> save(&quot;dfs://location&quot;);</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2 class="anchor anchorWithStickyNavbar_y2LR" id="async-clustering---strategies">Async Clustering - Strategies<a class="hash-link" href="#async-clustering---strategies" title="Direct link to heading"></a></h2><p>For more advanced usecases, async clustering pipeline can also be setup. See an example <a href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob" target="_blank" rel="noopener noreferrer">here</a>.</p><p>On a high level, clustering creates a plan based on a configurable strategy, groups eligible files based on specific
criteria and then executes the plan. Hudi supports <a href="https://hudi.apache.org/docs/concurrency_control#enabling-multi-writing" target="_blank" rel="noopener noreferrer">multi-writers</a> which provides
snapshot isolation between multiple table services, thus allowing writers to continue with ingestion while clustering
runs in the background.</p><p>As mentioned before, the clustering plan as well as its execution depends on a configurable strategy. These strategies can be
broadly classified into three types: clustering plan strategy, execution strategy and update strategy.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="plan-strategy">Plan Strategy<a class="hash-link" href="#plan-strategy" title="Direct link to heading"></a></h3><p>This strategy comes into play while creating the clustering plan. It helps to decide which file groups should be clustered.
Let&#x27;s look at different plan strategies that are available with Hudi. Note that these strategies are easily pluggable
using this <a href="/docs/configurations#hoodieclusteringplanstrategyclass">config</a>.</p><ol><li><code>SparkSizeBasedClusteringPlanStrategy</code>: It selects file slices based on
the <a href="/docs/configurations/#hoodieclusteringplanstrategysmallfilelimit">small file limit</a>
of base files and creates clustering groups up to the max file size allowed per group. The max size can be specified using
this <a href="/docs/configurations/#hoodieclusteringplanstrategymaxbytespergroup">config</a>. This
strategy is useful for stitching together medium-sized files into larger ones to reduce the large number of files spread across
cold partitions.</li><li><code>SparkRecentDaysClusteringPlanStrategy</code>: It looks back over the previous &#x27;N&#x27; days&#x27; partitions and creates a plan that will
cluster the &#x27;small&#x27; file slices within those partitions. This is the default strategy. It could be useful when the
workload is predictable and data is partitioned by time.</li><li><code>SparkSelectedPartitionsClusteringPlanStrategy</code>: In case you want to cluster only specific partitions within a range,
no matter how old or new those partitions are, then this strategy could be useful. To use this strategy, one needs
to set below two configs additionally (both begin and end partitions are inclusive):</li></ol><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.cluster.begin.partition</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.cluster.end.partition</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>All the strategies are partition-aware and the latter two are still bound by the size limits of the first strategy.</p></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="execution-strategy">Execution Strategy<a class="hash-link" href="#execution-strategy" title="Direct link to heading"></a></h3><p>After building the clustering groups in the planning phase, Hudi applies execution strategy, 
for each group, primarily
based on sort columns and size. The strategy can be specified using this <a href="/docs/configurations/#hoodieclusteringexecutionstrategyclass">config</a>.</p><p><code>SparkSortAndSizeExecutionStrategy</code> is the default strategy. Users can specify the columns to sort the data by, when
clustering using
this <a href="/docs/configurations/#hoodieclusteringplanstrategysortcolumns">config</a>. Apart from
that, we can also set <a href="/docs/configurations/#hoodieparquetmaxfilesize">max file size</a>
for the parquet files produced due to clustering. The strategy uses bulk insert to write data into new files, in which
case, Hudi implicitly uses a partitioner that does sorting based on specified columns. In this way, the strategy changes
the data layout in a way that not only improves query performance but also balances rewrite overhead automatically.</p><p>Now this strategy can be executed either as a single spark job or multiple jobs depending on the number of clustering groups
created in the planning phase. By default, Hudi will submit multiple spark jobs and union the results. In case you want
to force Hudi to use a single spark job, set the execution strategy
class <a href="/docs/configurations/#hoodieclusteringexecutionstrategyclass">config</a>
to <code>SingleSparkJobExecutionStrategy</code>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="update-strategy">Update Strategy<a class="hash-link" href="#update-strategy" title="Direct link to heading"></a></h3><p>Currently, clustering can only be scheduled for tables/partitions not receiving any concurrent updates. By default,
the <a href="/docs/configurations/#hoodieclusteringupdatesstrategy">config for update strategy</a> is
set to <strong><em>SparkRejectUpdateStrategy</em></strong>. If some file group has updates during clustering then it will reject updates and
throw an exception. However, in some use-cases updates are very sparse and do not touch most file groups. The default
strategy to simply reject updates does not seem fair. In such use-cases, users can set the config to <strong><em>SparkAllowUpdateStrategy</em></strong>.</p><p>We discussed the critical strategy configurations. All other configurations related to clustering are
listed <a href="/docs/configurations/#Clustering-Configs">here</a>. Out of this list, a few
configurations that will be very useful are:</p><table><thead><tr><th>Config key</th><th>Remarks</th><th>Default</th></tr></thead><tbody><tr><td><code>hoodie.clustering.async.enabled</code></td><td>Enable running of clustering service, asynchronously as writes happen on the table.</td><td>False</td></tr><tr><td><code>hoodie.clustering.async.max.commits</code></td><td>Control frequency of async clustering by specifying after how many commits clustering should be triggered.</td><td>4</td></tr><tr><td><code>hoodie.clustering.preserve.commit.metadata</code></td><td>When rewriting data, preserves existing _hoodie_commit_time. This means users can run incremental queries on clustered data without any side-effects.</td><td>False</td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_y2LR" id="asynchronous-clustering">Asynchronous Clustering<a class="hash-link" href="#asynchronous-clustering" title="Direct link to heading"></a></h2><p>Users can leverage <a href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob" target="_blank" rel="noopener noreferrer">HoodieClusteringJob</a>
to set up 2-step asynchronous clustering.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="hoodieclusteringjob">HoodieClusteringJob<a class="hash-link" href="#hoodieclusteringjob" title="Direct link to heading"></a></h3><p>By specifying the <code>scheduleAndExecute</code> mode, both scheduling and clustering can be achieved in the same step.
The appropriate mode can be specified using <code>-mode</code> or <code>-m</code> option. There are three modes:</p><ol><li><code>schedule</code>: Make a clustering plan. This gives an instant which can be passed in execute mode.</li><li><code>execute</code>: Execute a clustering plan at a particular instant. If no instant-time is specified, HoodieClusteringJob will execute for the earliest instant on the Hudi timeline.</li><li><code>scheduleAndExecute</code>: Make a clustering plan first and execute that plan immediately.</li></ol><p>Note that to run this job while the original writer is still running, please enable multi-writing:</p><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.write.concurrency.mode=optimistic_concurrency_control</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>A sample spark-submit command to setup HoodieClusteringJob is as below:</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">--class org.apache.hudi.utilities.HoodieClusteringJob </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/path/to/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.9.0-SNAPSHOT.jar </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--props /path/to/config/clusteringjob.properties </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--mode scheduleAndExecute </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--base-path /path/to/hudi_table/basePath </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--table-name hudi_table_schedule_clustering </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--spark-memory 1g</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>A sample <code>clusteringjob.properties</code> file:</p><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">hoodie.clustering.async.enabled=true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.async.max.commits=4</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.small.file.limit=629145600</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.sort.columns=column1,column2</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="hoodiedeltastreamer">HoodieDeltaStreamer<a class="hash-link" href="#hoodiedeltastreamer" title="Direct link to heading"></a></h3><p>This brings us to our users&#x27; favorite utility in Hudi. Now, we can trigger asynchronous clustering with DeltaStreamer.
Just set the <code>hoodie.clustering.async.enabled</code> config to true and specify the other clustering configs in a properties file
whose location can be passed as <code>--props</code> when starting the deltastreamer (just like in the case of HoodieClusteringJob).</p><p>A sample spark-submit command to set up HoodieDeltaStreamer is as below:</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/path/to/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.9.0-SNAPSHOT.jar </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--props /path/to/config/clustering_kafka.properties </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--source-class org.apache.hudi.utilities.sources.AvroKafkaSource </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token 
plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--source-ordering-field impresssiontime </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--table-type COPY_ON_WRITE </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--target-base-path /path/to/hudi_table/basePath </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--target-table impressions_cow_cluster </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--op INSERT </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--hoodie-conf hoodie.clustering.async.enabled</span><span class="token operator">=</span><span class="token plain">true </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--continuous</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="spark-structured-streaming">Spark Structured Streaming<a class="hash-link" href="#spark-structured-streaming" title="Direct link to heading"></a></h3><p>We can also enable asynchronous clustering with Spark structured 
streaming sink as shown below.</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">val commonOpts = Map(</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &quot;hoodie.insert.shuffle.parallelism&quot; -&gt; &quot;4&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &quot;hoodie.upsert.shuffle.parallelism&quot; -&gt; &quot;4&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.RECORDKEY_FIELD.key -&gt; &quot;_row_key&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.PARTITIONPATH_FIELD.key -&gt; &quot;partition&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.PRECOMBINE_FIELD.key -&gt; &quot;timestamp&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieWriteConfig.TBL_NAME.key -&gt; &quot;hoodie_test&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def getAsyncClusteringOpts(isAsyncClustering: String, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> clusteringNumCommit: String, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> executionStrategy: String):Map[String, String] = 
{</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> commonOpts + (DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE.key -&gt; isAsyncClustering,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMITS.key -&gt; clusteringNumCommit,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME.key -&gt; executionStrategy</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> )</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def initStreamingWriteFuture(hudiOptions: Map[String, String]): Future[Unit] = {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val streamingInput = // define the source of streaming</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Future {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> println(&quot;streaming starting&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> streamingInput</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .writeStream</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .options(hudiOptions)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;checkpointLocation&quot;, basePath + 
&quot;/checkpoint&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .mode(Append)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .start()</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .awaitTermination(10000)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> println(&quot;streaming ends&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def structuredStreamingWithClustering(): Unit = {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val df = //generate data frame</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val hudiOptions = getAsyncClusteringOpts(&quot;true&quot;, &quot;1&quot;, &quot;org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val f1 = initStreamingWriteFuture(hudiOptions)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Await.result(f1, Duration.Inf)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a 
href="https://github.com/apache/hudi/tree/asf-site/website/versioned_docs/version-0.11.0/clustering.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_mt2f"></div></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/docs/0.11.0/compaction"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Compaction</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/docs/0.11.0/metadata_indexing"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Metadata Indexing</div></a></div></nav></div></div><div class="col col--3"><div class="tableOfContents_vrFS thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#background" class="table-of-contents__link toc-highlight">Background</a></li><li><a href="#clustering-architecture" class="table-of-contents__link toc-highlight">Clustering Architecture</a><ul><li><a href="#overall-there-are-2-parts-to-clustering" class="table-of-contents__link toc-highlight">Overall, there are 2 parts to clustering</a></li><li><a href="#scheduling-clustering" class="table-of-contents__link toc-highlight">Scheduling clustering</a></li><li><a href="#running-clustering" class="table-of-contents__link toc-highlight">Running clustering</a></li><li><a href="#setting-up-clustering" class="table-of-contents__link toc-highlight">Setting up clustering</a></li></ul></li><li><a href="#async-clustering---strategies" 
class="table-of-contents__link toc-highlight">Async Clustering - Strategies</a><ul><li><a href="#plan-strategy" class="table-of-contents__link toc-highlight">Plan Strategy</a></li><li><a href="#execution-strategy" class="table-of-contents__link toc-highlight">Execution Strategy</a></li><li><a href="#update-strategy" class="table-of-contents__link toc-highlight">Update Strategy</a></li></ul></li><li><a href="#asynchronous-clustering" class="table-of-contents__link toc-highlight">Asynchronous Clustering</a><ul><li><a href="#hoodieclusteringjob" class="table-of-contents__link toc-highlight">HoodieClusteringJob</a></li><li><a href="#hoodiedeltastreamer" class="table-of-contents__link toc-highlight">HoodieDeltaStreamer</a></li><li><a href="#spark-structured-streaming" class="table-of-contents__link toc-highlight">Spark Structured Streaming</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/blog/2021/07/21/streaming-data-lake-platform">Our Vision</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/concepts">Concepts</a></li><li class="footer__item"><a class="footer__link-item" href="/community/team">Team</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/release-0.14.1">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/powered-by">Who&#x27;s Using</a></li></ul></div><div class="col footer__col"><div class="footer__title">Learn</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/quick-start-guide">Quick Start</a></li><li class="footer__item"><a class="footer__link-item" 
href="/docs/docker_demo">Docker Demo</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/talks">Talks</a></li><li class="footer__item"><a class="footer__link-item" href="/videos">Video Guides</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Technical Wiki<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li></ul></div><div class="col footer__col"><div class="footer__title">Hudi On Cloud</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/s3_hoodie">AWS</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/gcs_hoodie">Google Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/oss_hoodie">Alibaba Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/azure_hoodie">Microsoft Azure</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/cos_hoodie">Tencent Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/ibm_cos_hoodie">IBM Cloud</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/community/get-involved">Get Involved</a></li><li class="footer__item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Slack<svg width="13.5" height="13.5" aria-hidden="true" 
viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>LinkedIn<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a 
href="mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><div class="footer__title">Apache</div><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://hudi.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_SRtH"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footer__copyright">Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br>Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation.</div></div></div></footer></div>
<script src="/assets/js/runtime~main.46626068.js"></script>
<script src="/assets/js/main.19b28340.js"></script>
</body>
</html>