| <!doctype html> |
| <html lang="en" dir="ltr"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Docusaurus v2.0.0-beta.14"> |
| <link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Hudi: User-Facing Analytics RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Hudi: User-Facing Analytics Atom Feed"> |
| <link rel="alternate" type="application/json" href="/blog/feed.json" title="Apache Hudi: User-Facing Analytics JSON Feed"> |
| <link rel="search" type="application/opensearchdescription+xml" title="Apache Hudi" href="/opensearch.xml"> |
| <link rel="alternate" type="application/rss+xml" href="/videos/rss.xml" title="Apache Hudi RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/videos/atom.xml" title="Apache Hudi Atom Feed"> |
| <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Comfortaa|Ubuntu|Roboto|Source+Code+Pro"> |
| <link rel="stylesheet" href="https://at-ui.github.io/feather-font/css/iconfont.css"><title data-react-helmet="true">Asynchronous Clustering using Hudi | Apache Hudi</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" property="og:url" content="https://hudi.apache.org/blog/2021/08/23/async-clustering"><meta data-react-helmet="true" name="docsearch:language" content="en"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="default"><meta data-react-helmet="true" name="keywords" content="apache hudi, data lake, lakehouse, big data, apache spark, apache flink, presto, trino, analytics, data engineering"><meta data-react-helmet="true" property="og:title" content="Asynchronous Clustering using Hudi | Apache Hudi"><meta data-react-helmet="true" name="description" content="In one of the previous blog posts, we introduced a new"><meta data-react-helmet="true" property="og:description" content="In one of the previous blog posts, we introduced a new"><meta data-react-helmet="true" property="og:image" content="https://hudi.apache.org/assets/images/blog/clustering/example_perf_improvement.png"><meta data-react-helmet="true" name="twitter:image" content="https://hudi.apache.org/assets/images/blog/clustering/example_perf_improvement.png"><meta data-react-helmet="true" property="og:type" content="article"><meta data-react-helmet="true" property="article:published_time" content="2021-08-23T00:00:00.000Z"><meta data-react-helmet="true" property="article:tag" content="design,clustering,apache hudi"><link data-react-helmet="true" rel="icon" href="/assets/images/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://hudi.apache.org/blog/2021/08/23/async-clustering"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/blog/2021/08/23/async-clustering" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/cn/blog/2021/08/23/async-clustering" hreflang="cn"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/blog/2021/08/23/async-clustering" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://BH4D9OD16A-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/assets/css/styles.ea681a30.css"> |
| <link rel="preload" href="/assets/js/runtime~main.2cab5691.js" as="script"> |
| <link rel="preload" href="/assets/js/main.bd020950.js" as="script"> |
| </head> |
| <body> |
| <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"> |
| <div><a href="#" class="skipToContent_OuoZ">Skip to main content</a></div><div class="announcementBar_axC9" role="banner"><div class="announcementBarPlaceholder_xYHE"></div><div class="announcementBarContent_6uhP">⭐️ If you like Apache Hudi, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/hudi">GitHub</a>! ⭐</div><button type="button" class="clean-btn close announcementBarClose_A3A1" aria-label="Close"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav class="navbar navbar--fixed-top navbarWrapper_UIa0"><div class="navbar__inner"><img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8f594acf-9b77-44fb-9475-3e82ead1910c" width="0" height="0" alt=""><img referrerpolicy="no-referrer-when-downgrade" src="https://analytics.apache.org/matomo.php?idsite=47&rec=1" width="0" height="0" alt=""><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo navbarLogo_Bz6n"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><a class="navbar__item navbar__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Learn<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/talks"><div class="labelWrapperDropdown_Mqbj">Talks</div></a></li><li><a class="dropdown__link" href="/videos"><div class="labelWrapperDropdown_Mqbj">Video Guides</div></a></li><li><a class="dropdown__link" href="/docs/faq"><div class="labelWrapperDropdown_Mqbj">FAQ</div></a></li><li><a class="dropdown__link" href="/tech-specs"><div class="labelWrapperDropdown_Mqbj">Tech Specs</div></a></li><li><a class="dropdown__link" href="/tech-specs-1point0"><div class="labelWrapperDropdown_Mqbj">Tech Specs 1.0</div></a></li><li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Technical Wiki<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Contribute<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/contribute/how-to-contribute"><div class="labelWrapperDropdown_Mqbj">How to Contribute</div></a></li><li><a class="dropdown__link" href="/contribute/developer-setup"><div class="labelWrapperDropdown_Mqbj">Developer Setup</div></a></li><li><a class="dropdown__link" href="/contribute/rfc-process"><div class="labelWrapperDropdown_Mqbj">RFC Process</div></a></li><li><a class="dropdown__link" href="/contribute/report-security-issues"><div class="labelWrapperDropdown_Mqbj">Report Security Issues</div></a></li><li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Report Issues<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Community<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/get-involved"><div class="labelWrapperDropdown_Mqbj">Get Involved</div></a></li><li><a class="dropdown__link" href="/community/syncs"><div class="labelWrapperDropdown_Mqbj">Community Syncs</div></a></li><li><a class="dropdown__link" href="/community/office_hours"><div class="labelWrapperDropdown_Mqbj">Office Hours</div></a></li><li><a class="dropdown__link" href="/community/team"><div class="labelWrapperDropdown_Mqbj">Team</div></a></li></ul></div><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a><a class="navbar__item navbar__link" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who's Using</div></a><a class="navbar__item navbar__link" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a><a class="navbar__item navbar__link" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link downloadLinkDropdownHide_aDP3" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">0.14.1<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/overview"><div class="labelWrapperDropdown_Mqbj">Current</div></a></li><li><a class="dropdown__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li><a class="dropdown__link" href="/docs/0.14.0/overview"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li><a class="dropdown__link" href="/docs/0.13.1/overview"><div class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li><a class="dropdown__link" href="/docs/0.13.0/overview"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li><a class="dropdown__link" href="/docs/0.12.3/overview"><div class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li><a class="dropdown__link" href="/docs/0.12.2/overview"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li><a class="dropdown__link" href="/docs/0.12.1/overview"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li><a class="dropdown__link" href="/docs/0.12.0/overview"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li><a class="dropdown__link" href="/docs/0.11.1/overview"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li><a class="dropdown__link" href="/docs/0.11.0/overview"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li><a class="dropdown__link" href="/docs/0.10.1/overview"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li><a class="dropdown__link" href="/docs/0.10.0/overview"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li><a class="dropdown__link" href="/docs/0.9.0/overview"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li><a class="dropdown__link" href="/docs/0.8.0/overview"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li><a class="dropdown__link" href="/docs/0.7.0/overview"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li><a class="dropdown__link" href="/docs/0.6.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li><a class="dropdown__link" href="/docs/0.5.3/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li><a class="dropdown__link" href="/docs/0.5.2/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li><a class="dropdown__link" href="/docs/0.5.1/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li><a class="dropdown__link" href="/docs/0.5.0/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>English</span></span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><g clip-path="url(#a)"><path d="M14 6.457a6.842 6.842 0 0 0-7-6.02 6.843 6.843 0 0 0-7 6.02v1.085a6.843 6.843 0 0 0 7 6.02 6.843 6.843 0 0 0 7-6.02V6.457Zm-1.094 0h-2.625a9.92 9.92 0 0 0-.376-2.222 6.65 6.65 0 0 0 1.531-.875 5.25 5.25 0 0 1 1.444 3.097h.026Zm-8.032 0a8.479 8.479 0 0 1 .324-1.872 7.376 7.376 0 0 0 3.63 0c.175.61.284 1.239.325 1.872h-4.28Zm4.305 1.085a8.391 8.391 0 0 1-.324 1.873 7.464 7.464 0 0 0-3.658 0 8.479 8.479 0 0 1-.323-1.873h4.305Zm.35-4.375A10.342 10.342 0 0 0 8.75 1.75c.627.194 1.218.49 1.75.875a5.748 5.748 0 0 1-.998.577l.027-.035ZM7.254 1.54A8.75 8.75 0 0 1 8.46 3.552c-.48.11-.97.165-1.461.167-.492-.001-.982-.057-1.461-.167.308-.722.715-1.4 1.207-2.012h.508ZM4.498 3.202a5.748 5.748 0 0 1-.998-.577 6.029 6.029 0 0 1 1.75-.875c-.294.46-.546.947-.753 1.452Zm-1.873.15c.47.358.984.652 1.531.874A9.625 9.625 0 0 0 3.78 6.45H1.155a5.25 5.25 0 0 1 1.47-3.098ZM1.12 7.541h2.625c.038.753.164 1.5.376 2.223a6.649 6.649 0 0 0-1.531.875 5.25 5.25 0 0 1-1.47-3.098Zm3.377 3.255c.207.506.459.992.753 1.453a6.03 6.03 0 0 1-1.75-.875c.312-.226.646-.419.997-.578Zm2.25 1.663a8.594 8.594 0 0 1-1.208-2.013 6.501 6.501 0 0 1 2.922 0 8.54 8.54 0 0 1-1.207 2.013h-.508Zm2.755-1.663c.367.156.716.35 1.042.578a6.338 6.338 0 0 1-1.75.875c.275-.464.512-.95.708-1.453Zm1.873-.148a6.647 6.647 0 0 0-1.531-.875 9.45 9.45 0 0 0 .376-2.223h2.625a5.25 5.25 0 0 1-1.47 3.098Z" fill="#1C1E21"></path></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h14v14H0z"></path></clipPath></defs></svg></div></a><ul class="dropdown__menu"><li><a href="/blog/2021/08/23/async-clustering" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active"><div class="labelWrapperDropdown_Mqbj">English</div></a></li><li><a href="/cn/blog/2021/08/23/async-clustering" target="_self" rel="noopener noreferrer" class="dropdown__link"><div class="labelWrapperDropdown_Mqbj">Chinese</div></a></li></ul></div><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a><div class="searchBox_fBfG"><div role="button" class="searchButton_g9-U" aria-label="Search"><span class="searchText_RI6l">Search</span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><circle cx="6.864" cy="6.864" r="5.243" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></circle><path d="m10.51 10.783 2.056 2.05" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><button type="button" class="clean-btn navbar-sidebar__close"><svg viewBox="0 0 15 15" width="21" height="21"><g stroke="var(--ifm-color-emphasis-600)" stroke-width="1.2"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><div class="navbar-sidebar__items"><div class="navbar-sidebar__item menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Learn</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Contribute</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Community</div></a></li><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a></li><li class="menu__list-item"><a class="menu__link" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who's Using</div></a></li><li class="menu__list-item"><a class="menu__link" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a></li><li class="menu__list-item"><a class="menu__link" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Versions</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>Languages</span></span></div></a></li><li class="menu__list-item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="menu__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="menu__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="menu__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="menu__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="menu__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a></li></ul></div><div class="navbar-sidebar__item menu"><button type="button" class="clean-btn navbar-sidebar__back">← Back to main menu</button></div></div></div></nav><div class="main-wrapper blog-wrapper blog-post-page"><div class="container margin-vert--lg"><div class="row"><main class="col col--9 col--offset-2" itemscope="" itemtype="http://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header class="postHeader_Ipb1"><div><h1 class="blogPostTitle_RC3s" itemprop="headline"><h1 class="blogPostPageTitle_bKZt" itemprop="headline">Asynchronous Clustering using Hudi</h1></h1><div class="blogInfo_1FPd margin-top--sm margin-bottom--sm"><div class="blogPostText_jBA8 row"><time datetime="2021-08-23T00:00:00.000Z" itemprop="datePublished">August 23, 2021</time><div><div><div><a itemprop="url"><span class="blogPostAuthorsList_dlEG" itemprop="name">codope</span></a></div></div></div></div><div class="blogPostData_A2Le">7 min read</div></div></div><ul class="authorTimeTags_oN88 padding--none margin-left--sm tagsWrapperPostPage_VdId"><li class="tag_MgfY tagPostPage_gnvv"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/design">design</a></li><li class="tag_MgfY tagPostPage_gnvv"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/clustering">clustering</a></li><li class="tag_MgfY tagPostPage_gnvv"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/apache-hudi">apache hudi</a></li></ul></header><div class="markdown" itemprop="articleBody"><p>In one of the <a href="/blog/2021/01/27/hudi-clustering-intro">previous blog</a> posts, we introduced a new |
| kind of table service called clustering to reorganize data for improved query performance without compromising on |
| ingestion speed. We learnt how to setup inline clustering. In this post, we will discuss what has changed since then and |
| see how asynchronous clustering can be setup using HoodieClusteringJob as well as DeltaStreamer utility.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="introduction">Introduction<a class="hash-link" href="#introduction" title="Direct link to heading"></a></h2><p>On a high level, clustering creates a plan based on a configurable strategy, groups eligible files based on specific |
| criteria and then executes the plan. Hudi supports <a href="https://hudi.apache.org/docs/concurrency_control#enabling-multi-writing" target="_blank" rel="noopener noreferrer">multi-writers</a> which provides |
| snapshot isolation between multiple table services, thus allowing writers to continue with ingestion while clustering |
| runs in the background. For a more detailed overview of the clustering architecture please check out the previous blog |
| post.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="clustering-strategies">Clustering Strategies<a class="hash-link" href="#clustering-strategies" title="Direct link to heading"></a></h2><p>As mentioned before, clustering plan as well as execution depends on configurable strategy. These strategies can be |
| broadly classified into three types: clustering plan strategy, execution strategy and update strategy.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="plan-strategy">Plan Strategy<a class="hash-link" href="#plan-strategy" title="Direct link to heading"></a></h3><p>This strategy comes into play while creating clustering plan. It helps to decide what file groups should be clustered. |
| Let's look at different plan strategies that are available with Hudi. Note that these strategies are easily pluggable |
| using this <a href="/docs/next/configurations#hoodieclusteringplanstrategyclass">config</a>.</p><ol><li><code>SparkSizeBasedClusteringPlanStrategy</code>: It selects file slices based on |
| the <a href="/docs/next/configurations/#hoodieclusteringplanstrategysmallfilelimit">small file limit</a> |
| of base files and creates clustering groups upto max file size allowed per group. The max size can be specified using |
| this <a href="/docs/next/configurations/#hoodieclusteringplanstrategymaxbytespergroup">config</a>. This |
| strategy is useful for stitching together medium-sized files into larger ones to reduce lot of files spread across |
| cold partitions.</li><li><code>SparkRecentDaysClusteringPlanStrategy</code>: It looks back previous 'N' days partitions and creates a plan that will |
| cluster the 'small' file slices within those partitions. This is the default strategy. It could be useful when the |
| workload is predictable and data is partitioned by time.</li><li><code>SparkSelectedPartitionsClusteringPlanStrategy</code>: In case you want to cluster only specific partitions within a range, |
| no matter how old or new are those partitions, then this strategy could be useful. To use this strategy, one needs |
| to set below two configs additionally (both begin and end partitions are inclusive):</li></ol><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.cluster.begin.partition</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.cluster.end.partition</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>All the strategies are partition-aware and the latter two are still bound by the size limits of the first strategy.</p></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="execution-strategy">Execution Strategy<a class="hash-link" href="#execution-strategy" title="Direct link to heading"></a></h3><p>After building the clustering groups in the planning phase, Hudi applies execution strategy, for each group, primarily |
| based on sort columns and size. The strategy can be specified using this <a href="/docs/next/configurations/#hoodieclusteringexecutionstrategyclass">config</a>.</p><p><code>SparkSortAndSizeExecutionStrategy</code> is the default strategy. Users can specify the columns to sort the data by, when |
| clustering using |
| this <a href="/docs/next/configurations/#hoodieclusteringplanstrategysortcolumns">config</a>. Apart from |
| that, we can also set <a href="/docs/next/configurations/#hoodieparquetmaxfilesize">max file size</a> |
| for the parquet files produced due to clustering. The strategy uses bulk insert to write data into new files, in which |
| case, Hudi implicitly uses a partitioner that does sorting based on specified columns. In this way, the strategy changes |
| the data layout in a way that not only improves query performance but also balance rewrite overhead automatically.</p><p>Now this strategy can be executed either as a single spark job or multiple jobs depending on number of clustering groups |
| created in the planning phase. By default, Hudi will submit multiple spark jobs and union the results. In case you want |
| to force Hudi to use single spark job, set the execution strategy |
| class <a href="/docs/next/configurations/#hoodieclusteringexecutionstrategyclass">config</a> |
| to <code>SingleSparkJobExecutionStrategy</code>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="update-strategy">Update Strategy<a class="hash-link" href="#update-strategy" title="Direct link to heading"></a></h3><p>Currently, clustering can only be scheduled for tables/partitions not receiving any concurrent updates. By default, |
| the <a href="/docs/next/configurations/#hoodieclusteringupdatesstrategy">config for update strategy</a> is |
| set to <strong><em>SparkRejectUpdateStrategy</em></strong>. If some file group has updates during clustering then it will reject updates and |
| throw an exception. However, in some use-cases updates are very sparse and do not touch most file groups. The default |
| strategy to simply reject updates does not seem fair. In such use-cases, users can set the config to <strong><em>SparkAllowUpdateStrategy</em></strong>.</p><p>We discussed the critical strategy configurations. All other configurations related to clustering are |
| listed <a href="/docs/next/configurations/#Clustering-Configs">here</a>. Out of this list, a few |
| configurations that will be very useful are:</p><table><thead><tr><th>Config key</th><th>Remarks</th><th>Default</th></tr></thead><tbody><tr><td><code>hoodie.clustering.async.enabled</code></td><td>Enable running of clustering service, asynchronously as writes happen on the table.</td><td>False</td></tr><tr><td><code>hoodie.clustering.async.max.commits</code></td><td>Control frequency of async clustering by specifying after how many commits clustering should be triggered.</td><td>4</td></tr><tr><td><code>hoodie.clustering.preserve.commit.metadata</code></td><td>When rewriting data, preserves existing _hoodie_commit_time. This means users can run incremental queries on clustered data without any side-effects.</td><td>False</td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_y2LR" id="asynchronous-clustering">Asynchronous Clustering<a class="hash-link" href="#asynchronous-clustering" title="Direct link to heading"></a></h2><p>Previously, we have seen how users |
| can <a href="/blog/2021/01/27/hudi-clustering-intro#setting-up-clustering">setup inline clustering</a>. |
| Additionally, users can |
| leverage <a href="https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob" target="_blank" rel="noopener noreferrer">HoodieClusteringJob</a> |
| to setup 2-step asynchronous clustering.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="hoodieclusteringjob">HoodieClusteringJob<a class="hash-link" href="#hoodieclusteringjob" title="Direct link to heading"></a></h3><p>With the release of Hudi version 0.9.0, we can schedule as well as execute clustering in the same step. We just need to |
| specify the <code>—mode</code> or <code>-m</code> option. There are three modes:</p><ol><li><code>schedule</code>: Make a clustering plan. This gives an instant which can be passed in execute mode.</li><li><code>execute</code>: Execute a clustering plan at given instant which means --instant-time is required here.</li><li><code>scheduleAndExecute</code>: Make a clustering plan first and execute that plan immediately.</li></ol><p>Note that to run this job while the original writer is still running, please enable multi-writing:</p><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.write.concurrency.mode=optimistic_concurrency_control</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>A sample spark-submit command to setup HoodieClusteringJob is as below:</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--class org.apache.hudi.utilities.HoodieClusteringJob </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/path/to/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.9.0-SNAPSHOT.jar </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--props /path/to/config/clusteringjob.properties </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--mode scheduleAndExecute </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--base-path /path/to/hudi_table/basePath </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--table-name hudi_table_schedule_clustering </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--spark-memory 1g</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>A sample <code>clusteringjob.properties</code> file:</p><div class="codeBlockContainer_J+bg theme-code-block"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.async.enabled=true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.async.max.commits=4</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.small.file.limit=629145600</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.clustering.plan.strategy.sort.columns=column1,column2</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="hoodiedeltastreamer">HoodieDeltaStreamer<a class="hash-link" href="#hoodiedeltastreamer" title="Direct link to heading"></a></h3><p>This brings us to our users' favorite utility in Hudi. Now, we can trigger asynchronous clustering with DeltaStreamer. |
| Just set the <code>hoodie.clustering.async.enabled</code> config to true and specify other clustering config in properties file |
| whose location can be pased as <code>—props</code> when starting the deltastreamer (just like in the case of HoodieClusteringJob).</p><p>A sample spark-submit command to setup HoodieDeltaStreamer is as below:</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/path/to/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.9.0-SNAPSHOT.jar </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--props /path/to/config/clustering_kafka.properties </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--source-class org.apache.hudi.utilities.sources.AvroKafkaSource </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--source-ordering-field impresssiontime </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--table-type COPY_ON_WRITE </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--target-base-path /path/to/hudi_table/basePath </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--target-table impressions_cow_cluster </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--op INSERT </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--hoodie-conf hoodie.clustering.async.enabled</span><span class="token operator">=</span><span class="token plain">true </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--continuous</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="spark-structured-streaming">Spark Structured Streaming<a class="hash-link" href="#spark-structured-streaming" title="Direct link to heading"></a></h3><p>We can also enable asynchronous clustering with Spark structured streaming sink as shown below. </p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">val commonOpts = Map(</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "hoodie.insert.shuffle.parallelism" -> "4",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "hoodie.upsert.shuffle.parallelism" -> "4",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieWriteConfig.TBL_NAME.key -> "hoodie_test"</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def getAsyncClusteringOpts(isAsyncClustering: String, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> clusteringNumCommit: String, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> executionStrategy: String):Map[String, String] = {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> commonOpts + (DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE.key -> isAsyncClustering,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMITS.key -> clusteringNumCommit,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> HoodieClusteringConfig.EXECUTION_STRATEGY_CLASS_NAME.key -> executionStrategy</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> )</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def initStreamingWriteFuture(hudiOptions: Map[String, String]): Future[Unit] = {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val streamingInput = // define the source of streaming</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Future {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> println("streaming starting")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> streamingInput</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .writeStream</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .format("org.apache.hudi")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .options(hudiOptions)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option("checkpointLocation", basePath + "/checkpoint")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .mode(Append)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .start()</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .awaitTermination(10000)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> println("streaming ends")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">def structuredStreamingWithClustering(): Unit = {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val df = //generate data frame</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val hudiOptions = getClusteringOpts("true", "1", "org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> val f1 = initStreamingWriteFuture(hudiOptions)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Await.result(f1, Duration.Inf)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2 class="anchor anchorWithStickyNavbar_y2LR" id="conclusion-and-future-work">Conclusion and Future Work<a class="hash-link" href="#conclusion-and-future-work" title="Direct link to heading"></a></h2><p>In this post, we discussed different clustering strategies and how to setup asynchronous clustering. The story is not |
| over yet and future work entails:</p><ul><li>Support clustering with updates.</li><li>CLI tools to support clustering.</li></ul><p>Please follow this <a href="https://issues.apache.org/jira/browse/HUDI-1042" target="_blank" rel="noopener noreferrer">JIRA</a> to learn more about active development on |
| this issue. We look forward to contributions from the community. Hope you enjoyed this post. Put your Hudi on and keep |
| streaming!</p></div><footer0 class="row docusaurus-mt-lg blogPostDetailsFull_2lop"><div class="col margin-top--sm"><a href="https://github.com/apache/hudi/edit/asf-site/website/blog/blog/2021-08-23-async-clustering.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></footer0></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/blog/2021/09/01/building-eb-level-data-lake-using-hudi-at-bytedance"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">« <!-- -->Building an ExaByte-level Data Lake Using Apache Hudi at ByteDance</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/blog/2021/08/23/s3-events-source"><div class="pagination-nav__sublabel">Older Post</div><div class="pagination-nav__label">Reliable ingestion from AWS S3 using Hudi<!-- --> »</div></a></div></nav></main><div class="col col--2"><div class="tableOfContents_vrFS thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#introduction" class="table-of-contents__link toc-highlight">Introduction</a></li><li><a href="#clustering-strategies" class="table-of-contents__link toc-highlight">Clustering Strategies</a><ul><li><a href="#plan-strategy" class="table-of-contents__link toc-highlight">Plan Strategy</a></li><li><a href="#execution-strategy" class="table-of-contents__link toc-highlight">Execution Strategy</a></li><li><a href="#update-strategy" class="table-of-contents__link toc-highlight">Update Strategy</a></li></ul></li><li><a href="#asynchronous-clustering" class="table-of-contents__link toc-highlight">Asynchronous Clustering</a><ul><li><a href="#hoodieclusteringjob" class="table-of-contents__link toc-highlight">HoodieClusteringJob</a></li><li><a href="#hoodiedeltastreamer" class="table-of-contents__link toc-highlight">HoodieDeltaStreamer</a></li><li><a href="#spark-structured-streaming" class="table-of-contents__link toc-highlight">Spark Structured Streaming</a></li></ul></li><li><a href="#conclusion-and-future-work" class="table-of-contents__link toc-highlight">Conclusion and Future Work</a></li></ul></div></div></div></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/blog/2021/07/21/streaming-data-lake-platform">Our Vision</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/concepts">Concepts</a></li><li class="footer__item"><a class="footer__link-item" href="/community/team">Team</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/release-0.14.1">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/powered-by">Who's Using</a></li></ul></div><div class="col footer__col"><div class="footer__title">Learn</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/quick-start-guide">Quick Start</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/docker_demo">Docker Demo</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/talks">Talks</a></li><li class="footer__item"><a class="footer__link-item" href="/videos">Video Guides</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Technical Wiki<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li></ul></div><div class="col footer__col"><div class="footer__title">Hudi On Cloud</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/s3_hoodie">AWS</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/gcs_hoodie">Google Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/oss_hoodie">Alibaba Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/azure_hoodie">Microsoft Azure</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/cos_hoodie">Tencent Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/ibm_cos_hoodie">IBM Cloud</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/community/get-involved">Get Involved</a></li><li class="footer__item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Linkedin<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><div class="footer__title">Apache</div><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://hudi.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_SRtH"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footer__copyright">Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. <br>Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation.</div></div></div></footer></div> |
| <script src="/assets/js/runtime~main.2cab5691.js"></script> |
| <script src="/assets/js/main.bd020950.js"></script> |
| </body> |
| </html> |