hasHref_TwRn" href="/docs/0.14.0/migration_guide">Services</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.14.0/basic_configurations">Configurations</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.14.0/performance">Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.14.0/use_cases">Use Cases</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" href="/docs/0.14.0/faq">FAQs</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.14.0/privacy">Privacy Policy</a></li></ul></nav></div></aside><main class="docMainContainer_Q970"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_zHA2"><div class="theme-doc-version-banner alert alert--warning margin-bottom--md" role="alert"><div>This is documentation for <!-- -->Apache Hudi<!-- --> <b>0.14.0</b>, which is no longer actively maintained.</div><div class="margin-top--md">For up-to-date documentation, see the <b><a href="/docs/faq">latest version</a></b> (<!-- -->0.14.1<!-- -->).</div></div><div class="docItemContainer_oiyr"><article><span class="theme-doc-version-badge badge badge--secondary">Version: <!-- -->0.14.0</span><div class="tocCollapsible_aw-L theme-doc-toc-mobile tocMobile_Tx6Y"><button type="button" class="clean-btn tocCollapsibleButton_zr6a">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>FAQs</h1></header><h2 class="anchor anchorWithStickyNavbar_y2LR" id="general">General<a class="hash-link" href="#general" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="when-is-hudi-useful-for-me-or-my-organization">When is Hudi useful for me or my organization?<a class="hash-link" href="#when-is-hudi-useful-for-me-or-my-organization" title="Direct link to heading"></a></h3><p>If you are looking to quickly ingest data onto HDFS or cloud storage, Hudi can provide you tools to <a href="https://hudi.apache.org/docs/writing_data/" target="_blank" rel="noopener noreferrer">help</a>. 
Also, if you have ETL/Hive/Spark jobs that are slow or taking up a lot of resources, Hudi can potentially help by providing an incremental approach to reading and writing data.

As an organization, Hudi can help you build an [efficient data lake](https://docs.google.com/presentation/d/1FHhsvh70ZP6xXlHdVsAI0g__B_6Mpto5KQFlZ0b8-mM/edit#slide=id.p), solving some of the most complex, low-level storage management problems, while putting data into the hands of your data analysts, engineers and scientists much more quickly.

### What are some non-goals for Hudi?

Hudi is not designed for OLTP use cases, where you would typically use existing NoSQL/RDBMS data stores. Hudi cannot replace your in-memory analytical database (at least not yet!). Hudi supports near-real-time ingestion on the order of a few minutes, trading off latency for efficient batching. If you truly need sub-minute processing delays, stick with your favorite stream processing solution.

### What is incremental processing? Why do Hudi docs/talks keep talking about it?

Incremental processing was first introduced by Vinoth Chandar in the O'Reilly [blog](https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/) that set off much of this effort. In purely technical terms, incremental processing merely refers to writing mini-batch programs in a stream processing style. Typical batch jobs consume **all input** and recompute **all output**, every few hours. Typical stream processing jobs consume some **new input** and recompute **new/changes to output**, continuously/every few seconds. While recomputing all output in batch fashion can be simpler, it is wasteful and resource-expensive. Hudi brings the ability to author the same batch pipelines in a streaming fashion, run every few minutes.

While we could merely refer to this as stream processing, we call it *incremental processing* to distinguish it from purely stream processing pipelines built using Apache Flink or Apache Kafka Streams.

### How is Hudi optimized for CDC and streaming use cases?

One of the core use cases for Apache Hudi is enabling seamless, efficient database ingestion to your lake, and change data capture is a direct application of that. Hudi's core design primitives support fast upserts and deletes of data, which are suitable for CDC and streaming use cases. Here is a glimpse of some of the challenges accompanying streaming and CDC workloads that Hudi handles efficiently out of the box.
- **Processing of deletes:** Deletes are treated no differently than updates and are logged to the same file groups where the corresponding keys exist. This lets deletes be processed as fast as regular inserts and updates, and Hudi processes deletes at the file-group level using compaction in MOR tables. This can be very expensive in other open source systems that store deletes in separate files from data files and incur an N(data files) x N(delete files) merge cost to process deletes every time, soon turning into a complex graph problem whose planning alone is expensive. This gets worse with volume, especially when dealing with CDC-style workloads that stream changes to records frequently.
- **Operational overhead of merging deletes at scale:** When deletes are stored as separate files without any notion of data locality, merging data and deletes can become a runaway job that cannot complete in time for various reasons (Spark retries, executor failures, OOMs, etc.). As more data files and delete files are added, the merge becomes even more expensive and complex later on, making it hard to manage in practice and causing operational overhead. Hudi removes this complexity from users by treating deletes similarly to any other write operation.
- **File sizing with updates:** Other open source systems process updates by generating new data files that insert the new records after deletion, so both data files and delete files get introduced with every batch of updates. This leads to the small-file problem and requires explicit file sizing. Hudi, in contrast, embraces mutations to the data and manages the table automatically, keeping file sizes in check without passing the burden of file sizing to users as manual maintenance.
- **Support for partial updates and payload ordering:** Hudi supports partial updates, where an already existing record can be updated for the specific fields that are non-null in newer records (with newer timestamps). Similarly, Hudi supports timestamp-based payload ordering through specific payload implementations, where late-arriving data with older timestamps is ignored or dropped. Users can even plug in custom logic to handle exactly what they want; see the configuration sketch after this list.
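As an illustrative sketch of the last point, ordering-aware merging and partial updates are typically enabled through the payload class and precombine field configs. The config keys below are real Hudi options; the field name `ts` is just an example:

```properties
# Field used to pick a winner among records with the same key (newest wins)
hoodie.datasource.write.precombine.field=ts
# Honors the ordering field when merging against storage, so late-arriving
# records with an older "ts" do not overwrite newer data:
hoodie.datasource.write.payload.class=org.apache.hudi.common.model.DefaultHoodieRecordPayload
# Or, for partial-update semantics (merge only non-null fields from newer records):
# hoodie.datasource.write.payload.class=org.apache.hudi.common.model.PartialUpdateAvroPayload
```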
### How do I choose a storage type for my workload?

A key goal of Hudi is to provide **upsert functionality** that is orders of magnitude faster than rewriting entire tables or partitions.

Choose copy-on-write (COW) storage if:

- You are looking for a simple alternative that replaces your existing parquet tables without any need for real-time data.
- Your current job rewrites an entire table/partition to deal with updates, while only a few files actually change in each partition.
- You are happy keeping things operationally simpler (no compaction etc.), with the ingestion/write performance bound by the [parquet file size](https://hudi.apache.org/docs/configurations#hoodieparquetmaxfilesize) and the number of such files affected/dirtied by updates.
- Your workload is fairly well understood and does not have sudden bursts of large amounts of updates or inserts to older partitions. COW absorbs all the merging cost on the writer side, so such sudden changes can clog up your ingestion and interfere with meeting normal-mode ingest latency targets.

Choose merge-on-read (MOR) storage if:

- You want the data to be ingested and queryable as quickly as possible.
- Your workload can have sudden spikes/changes in pattern (e.g. bulk updates to older transactions in an upstream database causing lots of updates to old partitions on DFS). Asynchronous compaction helps amortize the write amplification caused by such scenarios, while normal ingestion keeps up with the incoming stream of changes.

Regardless of what you choose, Hudi provides:

- Snapshot isolation and atomic writes of batches of records
- Incremental pulls
- The ability to de-duplicate data

Find more [here](https://hudi.apache.org/docs/concepts/).

### Is Hudi an analytical database?

A typical database has long-running storage servers that are always up, taking writes and serving reads. Hudi's architecture is very different, and for good reasons. It is highly decoupled, so writes and queries/reads can be scaled independently to handle scale challenges. As a result, it may not always seem like a database.

Nonetheless, Hudi is designed very much like a database and provides similar functionality (upserts, change capture) and semantics (transactional writes, snapshot-isolated reads).

### How do I model the data stored in Hudi?

When writing data into Hudi, you model the records like you would on a key-value store: specify a key field (unique within a single partition/across the table), a partition field (denoting the partition to place the key into) and preCombine/combine logic that specifies how to handle duplicates within a batch of records written. This model enables Hudi to enforce primary key constraints like you would get on a database table. See [here](https://hudi.apache.org/docs/writing_data/) for an example, and the sketch below.

When querying/reading data, Hudi simply presents itself as a json-like hierarchical table that everyone is used to querying using Hive/Spark/Presto over Parquet/Json/Avro.
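As a minimal sketch of that modeling through the Spark datasource (the table name, column names and paths below are hypothetical; the `hoodie.*` option keys are real Hudi configs):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class UpsertExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("hudi-upsert").getOrCreate();
    // Hypothetical incoming batch with columns: uuid, rider, fare, ts, city
    Dataset<Row> df = spark.read().format("parquet").load("/tmp/incoming_batch");

    df.write().format("hudi")
        .option("hoodie.table.name", "trips")
        .option("hoodie.datasource.write.recordkey.field", "uuid")      // key field
        .option("hoodie.datasource.write.partitionpath.field", "city")  // partition field
        .option("hoodie.datasource.write.precombine.field", "ts")       // de-duplicate within a batch
        .option("hoodie.datasource.write.operation", "upsert")
        .mode(SaveMode.Append)
        .save("/tmp/hudi/trips");
  }
}
```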
### Why does Hudi require a key field to be configured?

Hudi was designed to support fast record-level upserts and thus requires a key to identify whether an incoming record is an insert, update or delete, and to process it accordingly. Additionally, Hudi automatically maintains indexes on this primary key, and for many use cases like CDC, ensuring such primary key constraints is crucial for data quality. In this context, the preCombine key helps reconcile multiple records with the same key within a single batch of input records. Even for append-only data streams, Hudi supports key-based de-duplication before inserting records. For example, you may have at-least-once data integration systems like Kafka MirrorMaker that can introduce duplicates during failures. Even for plain old batch pipelines, keys help eliminate duplication that could be caused by backfill pipelines, where it is commonly unclear what set of records need to be rewritten. We are actively working on making keys easier, by requiring them only for upserts and/or automatically generating the key internally (much like RDBMS row_ids).

### How does Hudi actually store data inside a table?

At a high level, Hudi is based on an MVCC design that writes data to versioned parquet/base files, and to log files that contain changes to a base file. All the files are stored under a partitioning scheme for the table, which closely resembles how Apache Hive tables are laid out on DFS. Please refer [here](https://hudi.apache.org/docs/concepts/) for more details.
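For illustration, a hypothetical partitioned MOR table might be laid out as follows. The file names are simplified (real names also carry write tokens and version suffixes), but the grouping of a versioned base file plus its log files is the key idea:

```text
/data/hudi/trips/                                <- table base path
|-- .hoodie/                                     <- timeline and table metadata
`-- 2023/09/05/                                  <- partition path, e.g. date(ts)
    |-- fileId1_20230905123305123.parquet        <- versioned base file
    `-- .fileId1_20230905123305123.log.1         <- log file with changes to that base file
```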
### How does Hudi handle partition evolution requirements?

Hudi recommends keeping coarse-grained top-level partition paths, e.g. date(ts), and clustering within each such partition in a flexible way to z-order or sort the data on the columns of interest. This provides excellent performance by minimizing the number of files in each partition, while still physically packing together data that will be queried together (what partitioning aims to achieve).

Let's take the example of a table where we store log_events with two fields, `ts` (time at which the event was produced) and `cust_id` (user for which the event was produced); a common option is to partition by both date(ts) and cust_id. Some users may want to start granular with hour(ts) and later evolve to a new partitioning scheme, say date(ts). But this means the number of partitions in the table could be very high: 365 days x 1K customers = at least 365K potentially small parquet files, which can significantly slow down queries and run into throttling on the actual S3/DFS reads.

For the aforementioned reasons, we don't recommend mixing different partitioning schemes within the same table, since it adds operational complexity and unpredictable performance. Old data stays in old partitions and only new data gets into newly evolved partitions; if you want to tidy up the table, you have to rewrite all partitions/data anyway! This is why we suggest starting with coarse-grained partitions and leaning on clustering techniques to optimize for query performance.

We find that most datasets have at least one high-fidelity field that can be used as a coarse partition. Clustering strategies in Hudi provide a lot of power: you can alter which partitions to cluster, which fields to cluster each by, and so on (see the sketch below). Unlike Hive partitioning, Hudi does not remove the partition field from the data files, i.e. if you write new partition paths, it does not mean old partitions need to be rewritten. Partitioning by itself is a relic of the Hive era; Hudi is working on replacing partitioning with database-like indexing schemes/functions, for even more flexibility, and to get away from the Hive-style partition evolution route.
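For illustration, here is a minimal sketch of inline clustering configs that pack small files and sort data within those coarse partitions. The config keys are real Hudi options; the values and sort columns are examples only, and async clustering is equally possible:

```properties
hoodie.clustering.inline=true
hoodie.clustering.inline.max.commits=4
# Candidate files below this size get rewritten into larger, sorted files:
hoodie.clustering.plan.strategy.small.file.limit=314572800
hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824
# Sort by the columns your queries filter on:
hoodie.clustering.plan.strategy.sort.columns=cust_id,ts
```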
## Concepts

### How does Hudi ensure atomicity?

Hudi writers atomically move an inflight write operation to a "completed" state by writing an object/file to the [timeline](https://hudi.apache.org/docs/timeline) folder, identifying the write operation with an instant time that denotes the time at which the action is deemed to have occurred. This is achieved on the underlying DFS (in the case of S3/Cloud Storage, by an atomic PUT operation) and can be observed as files of the pattern `<instant>.<action>.<state>` in Hudi's timeline.
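For illustration, the timeline folder of a hypothetical MOR table might contain a triplet like the one below for a single write; it is the completed file (no state suffix) whose appearance atomically publishes the write:

```text
.hoodie/20230905123305123.deltacommit.requested   <- action planned
.hoodie/20230905123305123.deltacommit.inflight    <- action in progress
.hoodie/20230905123305123.deltacommit             <- action completed (atomically published)
```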
### Does Hudi extend the Hive table layout?

Hudi is very different from Hive in important aspects, described below. However, based on practical considerations, it chooses to be compatible with the Hive table layout by adopting partitioning and schema evolution, and by being queryable through the Hive query engine. Here are the key aspects where Hudi differs:

- Unlike Hive, Hudi does not remove the partition columns from the data files. Hudi in fact adds record-level [meta fields](https://hudi.apache.org/tech-specs#meta-fields), including instant time, primary record key and partition path, to the data to support efficient upserts and [incremental queries/ETL](https://hudi.apache.org/learn/use_cases/#incremental-processing-pipelines). Hudi tables can also be non-partitioned, and the Hudi metadata table adds rich indexes on Hudi tables, which are beyond simple Hive extensions.
- Hive advocates partitioning as the main remedy for most performance-based issues. Features like partition evolution and hidden partitioning are primarily based on this Hive-style principle of partitioning and only partially tackle the metadata problem. Hudi, by contrast, biases towards coarse-grained partitioning and emphasizes [clustering](https://hudi.apache.org/docs/clustering) for finer-grained data layout. Further, users can strategize and evolve the clustering asynchronously, which actually helps users experiencing performance issues with too-granular partitions.
- Hudi considers partition evolution an anti-pattern and avoids such schemes due to the inconsistent query performance that comes to depend on which part of the table is being queried. Hudi's design favors consistent performance, and acknowledges the need to redesign partitioning/tables to achieve the same.

### What concurrency control approaches does Hudi adopt?

Hudi provides snapshot isolation between all three types of processes - writers, readers and table services - meaning they all operate on a consistent snapshot of the table. Hudi provides optimistic concurrency control (OCC) between writers, while providing lock-free, non-blocking MVCC-based concurrency control between writers and table services, and between different table services. Widely accepted database literature like "[Architecture of a database system, pg 81](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf)" clearly lays out 2-phase locking, OCC and MVCC as the different concurrency control approaches. Purely OCC-based approaches assume conflicts rarely occur, and suffer significant retries and penalties on continuous/incremental workloads, which are the norm for modern lake workloads. Hudi is cognizant of this, takes a less enthusiastic view on [OCC](https://hudi.apache.org/blog/2021/12/16/lakehouse-concurrency-control-are-we-too-optimistic/), and has built out things like MVCC-based non-blocking async compaction (the commit-time decision significantly aids this) that let writers work non-stop while table services such as compaction run in the background. A minimal multi-writer configuration sketch follows.
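For reference, here is a minimal sketch of enabling OCC for multi-writer setups. The config keys and the ZooKeeper-based lock provider class are real Hudi options; the endpoint, key and base path are placeholders:

```properties
hoodie.write.concurrency.mode=optimistic_concurrency_control
hoodie.cleaner.policy.failed.writes=LAZY
hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
hoodie.write.lock.zookeeper.url=zk-host
hoodie.write.lock.zookeeper.port=2181
hoodie.write.lock.zookeeper.lock_key=trips
hoodie.write.lock.zookeeper.base_path=/hudi/locks
```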
### Hudi's commits are based on transaction start time instead of completed time. Does this cause data loss or inconsistency in case of incremental and time travel queries?

Let's take a closer look at the scenario: two commits C1 and C2 (with C2 starting later than C1) begin, and the later commit (C2) finishes first, leaving the inflight transaction of the earlier commit (C1) before the completed write of the later transaction (C2) in Hudi's timeline. This is not an uncommon scenario, especially with various ingestion needs such as backfilling, deleting, bootstrapping, etc. alongside regular writes. When and whether the first job commits will depend on factors such as conflicts between concurrent commits, inflight compactions, and other actions on the table's timeline. If the first job fails for some reason, Hudi aborts the earlier inflight commit (C1), and the writer has to retry with a new instant time > C2, much like other OCC implementations. First of all, for snapshot queries the order of commits does not matter at all, since any incomplete writes on the active timeline are ignored by queries and cause no side effects.

In these scenarios, it might be tempting to suspect data inconsistency/data loss with Hudi's incremental queries. However, Hudi applies special handling (examples [1](https://github.com/apache/hudi/blob/aea5bb6f0ab824247f5e3498762ad94f643a2cb6/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java#L76), [2](https://github.com/apache/hudi/blame/7a6543958368540d221ddc18e0c12b8d526b6859/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java#L173)) in incremental queries to ensure that no data is served beyond a point where there is an inflight instant in the timeline, so no data loss or drops happen. This detection is possible because Hudi writes first request a transaction on the timeline, before planning/executing the write, as explained in the [timeline](https://hudi.apache.org/docs/timeline#states) section.

In this case, on seeing C1's inflight commit (publishing to the timeline is atomic), C2's data (which is > C1 in the timeline) is not served until C1's inflight transitions to a terminal state, such as completed or marked as failed. This [test](https://github.com/apache/hudi/blob/master/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java#L137) demonstrates how the Hudi incremental source stops proceeding until C1 completes. Hudi favors [safety and sacrifices liveness](https://en.wikipedia.org/wiki/Safety_and_liveness_properties) in such a case. For a single writer, the start times of the transactions match the order of their completion, and both incremental and time-travel queries work as expected. In the multi-writer case, incremental queries still work as expected, but time-travel queries don't. Since most time-travel queries are on historical snapshots with a stable, continuous timeline, this has not been implemented up to Hudi 0.13. However, a similar approach to the above could easily be applied to failing time-travel queries in this window. An incremental read sketch follows.
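For context, an incremental query through the Spark datasource looks like the sketch below; the path and begin instant are illustrative, while the option keys are real Hudi configs:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IncrementalReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("incr-read").getOrCreate();
    // Pull only records committed after the given instant time.
    Dataset<Row> changes = spark.read().format("hudi")
        .option("hoodie.datasource.query.type", "incremental")
        .option("hoodie.datasource.read.begin.instanttime", "20230905123305123")
        .load("/tmp/hudi/trips");
    changes.show();
  }
}
```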
### How does Hudi plan to address the liveness issue above for incremental queries?

Hudi 0.14 improves the liveness aspects by enabling change streams, incremental queries and time travel based on the file/object's timestamp (similar to [Delta Lake](https://docs.delta.io/latest/delta-batch.html#query-an-older-snapshot-of-a-table-time-travel)).

To expand on the long-term approach, Hudi has had a proposal to streamline/improve this experience by adding a transition time to its timeline, which will remove the [liveness sacrifice](https://en.wikipedia.org/wiki/Safety_and_liveness_properties) and make it easier to understand. This has been delayed for a few reasons:

- Large hosted query engines and users not upgrading fast enough.
- The issues brought up ([1](https://hudi.apache.org/docs/faq#does-hudis-use-of-wall-clock-timestamp-for-instants-pose-any-clock-skew-issues), [2](https://hudi.apache.org/docs/faq#hudis-commits-are-based-on-transaction-start-time-instead-of-completed-time-does-this-cause-data-loss-or-inconsistency-in-case-of-incremental-and-time-travel-queries)) that are relevant to this are, beyond good pedantic discussions, not practically very important to users.
- Wanting to do it alongside [non-blocking concurrency control](https://github.com/apache/hudi/pull/7907) in Hudi version 1.x.

It's planned to be addressed in the first 1.x release.

### Does Hudi's use of wall clock timestamp for instants pose any clock skew issues?

Theoretically speaking, clock skew between two writers can result in different notions of time and order the timeline differently. But current NTP implementations and regions standardizing on UTC make this very unlikely in practice. Even many popular OLTP systems such as DynamoDB and Cassandra use timestamps for record-level conflict detection, and cloud providers/OSS NTP are moving towards atomic/synchronized clocks all the time ([1](https://aws.amazon.com/about-aws/whats-new/2017/11/introducing-the-amazon-time-sync-service/), [2](https://engineering.fb.com/2020/03/18/production-engineering/ntp-service/)). We haven't had these raised as practical issues over the last several years, across several large-scale data lakes.

Further, Hudi's commit time can be a logical time and need not strictly be a timestamp. If there are still uniqueness concerns over clock skew, it is easy for Hudi to extend the timestamp implementation with salts, or to employ [TrueTime](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/) approaches that have been proven at planet scale. In short, this is not a design issue, but a pragmatic implementation choice that allows us to implement unique features like async compaction in the face of updates to the same file group, by scheduling actions on a discrete timestamp space.

## Writing Tables

### What are some ways to write a Hudi table?

Typically, you obtain a set of partial updates/inserts from your source and issue [write operations](https://hudi.apache.org/docs/write_operations/) against a Hudi table. If you are ingesting data from any of the standard sources like Kafka, or tailing DFS, the [delta streamer](https://hudi.apache.org/docs/hoodie_streaming_ingestion#deltastreamer) tool is invaluable and provides an easy, self-managed solution for getting data written into Hudi; a typical invocation is sketched below.
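For example, a typical (illustrative) invocation ingesting JSON records from Kafka might look like this; the bundle jar version, paths and properties file are placeholders:

```bash
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  hudi-utilities-bundle_2.12-0.14.0.jar \
  --table-type MERGE_ON_READ \
  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
  --source-ordering-field ts \
  --target-base-path /data/hudi/trips \
  --target-table trips \
  --props /path/to/kafka-source.properties
```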
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-is-a-hudi-writer-job-deployed">How is a Hudi writer job deployed?<a class="hash-link" href="#how-is-a-hudi-writer-job-deployed" title="Direct link to heading"></a></h3><p>The nice thing about Hudi writing is that it just runs like any other Spark job would on a YARN/Mesos or even a K8S cluster. So you can simply use the Spark UI to get visibility into write operations.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage">Can I implement my own logic for how input records are merged with records on storage?<a class="hash-link" href="#can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage" title="Direct link to heading"></a></h3><p>Here is the payload interface that is used in Hudi to represent any Hudi record.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">public interface HoodieRecordPayload&lt;T extends HoodieRecordPayload&gt; extends Serializable {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> /**</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * When more than one HoodieRecord share the same HoodieKey, this function combines them before attempting to insert/upsert, by taking in a property map.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * Implementations can leverage the properties to decide their business logic for preCombine.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param another instance of another {@link HoodieRecordPayload} to be combined with.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @return the combined value</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> */</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> T preCombine(T another, Properties properties);</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/**</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * This method lets you write custom merging/combining logic to produce new values as a function of the current value on storage and what's contained</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * in this object. 
Implementations can leverage properties if required.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * &lt;p&gt;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * eg:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * 1) You are updating counters: you may want to add counts to currentValue and write back updated counts.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * 2) You may be reading DB redo logs, and merging them with the current image for a database row on storage.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * &lt;/p&gt;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param currentValue Current value in storage, to merge/combine this payload with</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param schema Schema used for record</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param properties Payload related properties. For example pass the ordering field(s) name to extract from value in storage.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @return new combined/merged value to be written back to storage. EMPTY to skip writing this record.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> */</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Option&lt;IndexedRecord&gt; combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/**</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * Generates an avro record out of the given HoodieRecordPayload, to be written out to storage. Called when writing a new value for the given</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * HoodieKey, wherein there is no existing record in storage to be combined against (i.e. insert). Return EMPTY to skip writing this record.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * Implementations can leverage properties if required.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param schema Schema used for record</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @param properties Payload related properties. 
For example pass the ordering field(s) name to extract from value in storage.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @return the {@link IndexedRecord} to be inserted.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> */</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Option&lt;IndexedRecord&gt; getInsertValue(Schema schema, Properties properties) throws IOException;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/**</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * This method can be used to extract some metadata from HoodieRecordPayload. The metadata is passed to {@code WriteStatus.markSuccess()} and</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * {@code WriteStatus.markFailure()} in order to compute some aggregate metrics using the metadata in the context of a write success or failure.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * @return the metadata in the form of Map&lt;String, String&gt; if any.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> */</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> default Option&lt;Map&lt;String, String&gt;&gt; getMetadata() {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> return Option.empty();</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>As you can see, the two methods (<a href="https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java" target="_blank" rel="noopener noreferrer">combineAndGetUpdateValue(), getInsertValue()</a>) control how the record on storage is combined with the incoming update/insert to generate the final value to be written back to storage, while preCombine() is used to merge records within the same incoming batch.</p>
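<p>To plug a payload implementation into a write, set <code>hoodie.datasource.write.payload.class</code>. Below is a minimal sketch; the class name is a hypothetical custom implementation, and Hudi also ships with ready-made payloads such as <code>OverwriteWithLatestAvroPayload</code> and <code>DefaultHoodieRecordPayload</code>:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Wire a payload class into the writer; com.example.MyMergePayload is illustrative,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">// any HoodieRecordPayload implementation on the classpath works</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.payload.class&quot;, &quot;com.example.MyMergePayload&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.precombine.field&quot;, &quot;ts&quot;) // ordering field used by preCombine()</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>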
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-delete-records-in-the-dataset-using-hudi">How do I delete records in the dataset using Hudi?<a class="hash-link" href="#how-do-i-delete-records-in-the-dataset-using-hudi" title="Direct link to heading"></a></h3><p>GDPR has made deletes a must-have tool in everyone&#x27;s data management toolbox. Hudi supports both soft and hard deletes. For details on how to actually perform them, see <a href="https://hudi.apache.org/docs/writing_data/#deletes" target="_blank" rel="noopener noreferrer">here</a>.</p>
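<p>As a quick sketch (field names are illustrative), a hard delete via the Spark datasource issues the <code>delete</code> write operation with a DataFrame containing the keys of the records to remove:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Hard delete: remove all records matching the keys in dfToDelete</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">dfToDelete.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.operation&quot;, &quot;delete&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.recordkey.field&quot;, &quot;uuid&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.partitionpath.field&quot;, &quot;ds&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .mode(&quot;append&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .save(basePath)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>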
<a href="https://hudi.apache.org/docs/faq#can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage" target="_blank" rel="noopener noreferrer">This FAQ entry</a> shows the interface for HoodieRecordPayload if you are interested.</p><p>For an insert or bulk<!-- -->_<!-- -->insert operation, no such pre-combining is performed. Thus, if your input contains duplicates, the table would also contain duplicates. If you don&#x27;t want duplicate records either issue an <strong>upsert</strong> or consider specifying option to de-duplicate input in either datasource using <a href="https://hudi.apache.org/docs/configurations#hoodiedatasourcewriteinsertdropduplicates" target="_blank" rel="noopener noreferrer"><code>hoodie.datasource.write.insert.drop.duplicates</code></a> &amp; <a href="https://hudi.apache.org/docs/configurations/#hoodiecombinebeforeinsert" target="_blank" rel="noopener noreferrer"><code>hoodie.combine.before.insert</code></a> or in deltastreamer using <a href="https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L229" target="_blank" rel="noopener noreferrer"><code>--filter-dupes</code></a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-can-i-pass-hudi-configurations-to-my-spark-writer-job">How can I pass hudi configurations to my spark writer job?<a class="hash-link" href="#how-can-i-pass-hudi-configurations-to-my-spark-writer-job" title="Direct link to heading"></a></h3><p>Hudi configuration options covering the datasource and low level Hudi write client (which both deltastreamer &amp; datasource internally call) are <a href="https://hudi.apache.org/docs/configurations/" target="_blank" rel="noopener noreferrer">here</a>. Invoking <em>-<!-- -->-help</em> on any tool such as DeltaStreamer would print all the usage options. 
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-can-i-pass-hudi-configurations-to-my-spark-writer-job">How can I pass hudi configurations to my spark writer job?<a class="hash-link" href="#how-can-i-pass-hudi-configurations-to-my-spark-writer-job" title="Direct link to heading"></a></h3><p>Hudi configuration options covering the datasource and the low-level Hudi write client (which both deltastreamer &amp; datasource internally call) are listed <a href="https://hudi.apache.org/docs/configurations/" target="_blank" rel="noopener noreferrer">here</a>. Invoking <em>--help</em> on any tool such as DeltaStreamer will print all the usage options. Many of the options that control upsert and file sizing behavior are defined at the write client level; below is how to pass them to the different ways of writing data.</p><ul><li>For the Spark DataSource, you can use the &quot;options&quot; API of DataFrameWriter to pass in these configs.</li></ul><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .options(clientOpts) // any of the Hudi client opts can be passed in as well</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, &quot;_row_key&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><ul><li>When using <code>HoodieWriteClient</code> directly, you can simply construct a HoodieWriteConfig object with the configs linked above.</li><li>When using the HoodieDeltaStreamer tool to ingest, you can set the configs in a properties file and pass the file as the command-line argument &quot;<em>--props</em>&quot;.</li></ul><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-to-create-hive-style-partition-folder-structure">How to create Hive style partition folder structure?<a class="hash-link" href="#how-to-create-hive-style-partition-folder-structure" title="Direct link to heading"></a></h3><p>By default, Hudi creates partition folders with just the partition values. You may instead want partition folders structured the way Hive generates them, with paths that contain key=value pairs, like country=us/… or datestr=2021-04-20. This is Hive style (or format) partitioning: the paths include both the names of the partition keys and the values that each path represents.</p><p>To enable hive style partitioning, you need to add this hoodie config when you write your data:</p><div class="codeBlockContainer_J+bg language-plain theme-code-block"><div class="codeBlockContent_csEI plain"><pre tabindex="0" class="prism-code language-plain codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.datasource.write.hive_style_partitioning: true</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>
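<p>When writing through the Spark datasource, the same config can simply be passed as a writer option; a sketch, with an illustrative partition field:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Produce Hive style partition paths like country=us/ instead of just us/</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.partitionpath.field&quot;, &quot;country&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.datasource.write.hive_style_partitioning&quot;, &quot;true&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>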
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-i-register-my-hudi-table-with-apache-hive-metastore">Can I register my Hudi table with Apache Hive metastore?<a class="hash-link" href="#can-i-register-my-hudi-table-with-apache-hive-metastore" title="Direct link to heading"></a></h3><p>Yes. This can be performed either via the standalone <a href="https://hudi.apache.org/docs/syncing_metastore#hive-sync-tool" target="_blank" rel="noopener noreferrer">Hive Sync tool</a> or using options in the <a href="https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/docker/demo/sparksql-incremental.commands#L50" target="_blank" rel="noopener noreferrer">Hudi Streamer</a> tool or <a href="https://hudi.apache.org/docs/configurations#hoodiedatasourcehive_syncenable" target="_blank" rel="noopener noreferrer">datasource</a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="whats-hudis-schema-evolution-story">What&#x27;s Hudi&#x27;s schema evolution story?<a class="hash-link" href="#whats-hudis-schema-evolution-story" title="Direct link to heading"></a></h3><p>Hudi uses Avro as the internal canonical representation for records, primarily due to its nice <a href="https://docs.confluent.io/platform/current/schema-registry/avro.html" target="_blank" rel="noopener noreferrer">schema compatibility &amp; evolution</a> properties. This is a key aspect of having reliability in your ingestion or ETL pipelines. As long as the schema passed to Hudi (either explicitly in Hudi Streamer schema provider configs or implicitly by Spark Datasource&#x27;s Dataset schemas) is backwards compatible (e.g., no field deletes, only appending new fields to the schema), Hudi will seamlessly handle read/write of old and new data and also keep the Hive schema up-to-date.</p><p>Starting with 0.11.0, Spark SQL DDL support (experimental) was added for Spark 3.1.x and Spark 3.2.1 via ALTER TABLE syntax. Please refer to the <a href="https://hudi.apache.org/docs/schema_evolution" target="_blank" rel="noopener noreferrer">schema evolution guide</a> for more details on schema-on-read for Spark.</p>
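<p>For example, a backwards-compatible change like appending a nullable column can be expressed with that DDL; a sketch, assuming the table is registered in the catalog as <code>hudi_table</code> (the column name is illustrative):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Append a new nullable field to the table schema via Spark SQL DDL</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark.sql(&quot;ALTER TABLE hudi_table ADD COLUMNS (new_col STRING)&quot;)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>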
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="what-performanceingest-latency-can-i-expect-for-hudi-writing">What performance/ingest latency can I expect for Hudi writing?<a class="hash-link" href="#what-performanceingest-latency-can-i-expect-for-hudi-writing" title="Direct link to heading"></a></h3><p>The speed at which you can write into Hudi depends on the <a href="https://hudi.apache.org/docs/write_operations" target="_blank" rel="noopener noreferrer">write operation</a> and some trade-offs you make along the way, like file sizing. Just like how databases incur overhead over direct/raw file I/O on disks, Hudi operations may have overhead from supporting database-like features compared to reading/writing raw DFS files. That said, Hudi implements advanced techniques from the database literature to keep these minimal. Users are encouraged to keep this perspective when trying to reason about Hudi performance. As the saying goes: there is no free lunch (not yet, at least).</p><table><thead><tr><th>Storage Type</th><th>Type of workload</th><th>Performance</th><th>Tips</th></tr></thead><tbody><tr><td>copy on write</td><td>bulk_insert</td><td>Should match vanilla spark writing + an additional sort to properly size files</td><td>Properly size <a href="https://hudi.apache.org/docs/configurations#hoodiebulkinsertshuffleparallelism" target="_blank" rel="noopener noreferrer">bulk insert parallelism</a> to get the right number of files; use insert if you want this auto-tuned. Configure <a href="https://hudi.apache.org/docs/configurations#hoodiebulkinsertsortmode" target="_blank" rel="noopener noreferrer">hoodie.bulkinsert.sort.mode</a> for better file sizes at the cost of memory. The default value NONE offers the fastest performance and matches <code>spark.write.parquet()</code> in terms of number of files and overheads.</td></tr><tr><td>copy on write</td><td>insert</td><td>Similar to bulk insert, except the file sizes are auto-tuned, requiring input to be cached into memory and custom partitioned.</td><td>Performance would be bound by how parallel you can write the ingested data. Tune <a href="https://hudi.apache.org/docs/configurations#hoodieinsertshuffleparallelism" target="_blank" rel="noopener noreferrer">this limit</a> up if you see that writes are happening from only a few executors.</td></tr><tr><td>copy on write</td><td>upsert/ de-duplicate &amp; insert</td><td>Both of these involve an index lookup. Compared to naively using Spark (or a similar framework)&#x27;s JOIN to identify the affected records, Hudi indexing is often 7-10x faster as long as you have ordered keys (discussed below) or &lt;50% updates. Compared to naively overwriting entire partitions, a Hudi write can be several magnitudes faster depending on how many files in a given partition are actually updated. For example, if a partition has 1000 files out of which only 100 are dirtied every ingestion run, then Hudi would only read/merge a total of 100 files and thus be 10x faster than naively rewriting the entire partition.</td><td>Ultimately performance would be bound by how quickly we can read and write a parquet file, and that depends on the size of the parquet file, configured <a href="https://hudi.apache.org/docs/configurations#hoodieparquetmaxfilesize" target="_blank" rel="noopener noreferrer">here</a>. Also be sure to properly tune your <a href="https://hudi.apache.org/docs/configurations#INDEX" target="_blank" rel="noopener noreferrer">bloom filters</a>. <a href="https://issues.apache.org/jira/browse/HUDI-56" target="_blank" rel="noopener noreferrer">HUDI-56</a> will auto-tune this.</td></tr><tr><td>merge on read</td><td>bulk insert</td><td>Currently new data only goes to parquet files, and thus performance here should be similar to copy_on_write bulk insert. This has the nice side-effect of getting data into parquet directly for query performance. <a href="https://issues.apache.org/jira/browse/HUDI-86" target="_blank" rel="noopener noreferrer">HUDI-86</a> will add support for logging inserts directly and will speed this up drastically.</td><td></td></tr><tr><td>merge on read</td><td>insert</td><td>Similar to above</td><td></td></tr><tr><td>merge on read</td><td>upsert/ de-duplicate &amp; insert</td><td>Indexing performance would remain the same as copy-on-write, while the costliest I/O operation in copy_on_write, the update, is sent to log files instead; thus, with asynchronous compaction, this provides very good ingest performance with low write amplification.</td><td></td></tr></tbody></table><p>Like many typical systems that manage time-series data, Hudi performs much better if your keys have a timestamp prefix or are monotonically increasing/decreasing. You can almost always achieve this. Even if you have UUID keys, you can follow tricks like <a href="https://www.percona.com/blog/2014/12/19/store-uuid-optimized-way/" target="_blank" rel="noopener noreferrer">this</a> to get keys that are ordered. 
See also the <a href="https://hudi.apache.org/docs/tuning-guide" target="_blank" rel="noopener noreferrer">Tuning Guide</a> for more tips on JVM and other configurations.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="what-performance-can-i-expect-for-hudi-readingqueries">What performance can I expect for Hudi reading/queries?<a class="hash-link" href="#what-performance-can-i-expect-for-hudi-readingqueries" title="Direct link to heading"></a></h3><ul><li>For read-optimized views, you can expect the same best-in-class columnar query performance as a standard parquet table in Hive/Spark/Presto.</li><li>For incremental views, you can expect a speedup relative to how much data usually changes in a given time window and how much time your entire scan takes. For example, if only 100 files changed in the last hour in a partition of 1000 files, then you can expect a speedup of 10x using incremental pull in Hudi, compared to fully scanning the partition to find new data.</li><li>For real-time views, you can expect performance similar to the same avro-backed table in Hive/Spark/Presto.</li></ul><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-to-avoid-creating-tons-of-small-files">How do I avoid creating tons of small files?<a class="hash-link" href="#how-do-i-to-avoid-creating-tons-of-small-files" title="Direct link to heading"></a></h3><p>A key design decision in Hudi was to avoid creating small files and always write properly sized files.</p><p>There are 2 ways to avoid creating tons of small files in Hudi, and they have different trade-offs:</p><p>a) <strong>Auto-size small files during ingestion</strong>: This solution trades ingest/writing time to keep queries always efficient. Common approaches that write very small files and later stitch them together only solve the system scalability issues posed by small files, while still slowing queries down by exposing small files to them in the interim.</p><p>Hudi has the ability to maintain a configured target file size when performing <strong>upsert/insert</strong> operations. (Note: the <strong>bulk_insert</strong> operation does not provide this functionality and is designed as a simpler replacement for normal <code>spark.write.parquet</code>.)</p><p>For <strong>copy-on-write</strong>, this is as simple as configuring the <a href="https://hudi.apache.org/docs/configurations#hoodieparquetmaxfilesize" target="_blank" rel="noopener noreferrer">maximum size for a base/parquet file</a> and the <a href="https://hudi.apache.org/docs/configurations#hoodieparquetsmallfilelimit" target="_blank" rel="noopener noreferrer">soft limit</a> below which a file should be considered a small file. For the initial bootstrap of a Hudi table, tuning the record size estimate is also important to ensure sufficient records are bin-packed into a parquet file. For subsequent writes, Hudi automatically uses the average record size based on the previous commit. Hudi will try to add enough records to a small file at write time to get it to the configured maximum limit. For example, with <code>compactionSmallFileSize=100MB</code> and limitFileSize=120MB, Hudi will pick all files &lt; 100MB and try to get them up to 120MB.</p>
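<p>A sketch of those two knobs together, mirroring the example values above:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Target ~120MB base files; treat files under 100MB as small files to be padded on subsequent writes</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.parquet.max.file.size&quot;, (120 * 1024 * 1024L).toString)   // limitFileSize</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.parquet.small.file.limit&quot;, (100 * 1024 * 1024L).toString) // compactionSmallFileSize</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>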
<p>For <strong>merge-on-read</strong>, there are a few more configs to set. MergeOnRead works differently for different INDEX choices.</p><ul><li>Indexes with <strong>canIndexLogFiles = true</strong>: Inserts of new data go directly to log files. In this case, you can configure the <a href="https://hudi.apache.org/docs/configurations#hoodielogfilemaxsize" target="_blank" rel="noopener noreferrer">maximum log size</a> and a <a href="https://hudi.apache.org/docs/configurations#hoodielogfiletoparquetcompressionratio" target="_blank" rel="noopener noreferrer">factor</a> that denotes the reduction in size when data moves from avro to parquet files.</li><li>Indexes with <strong>canIndexLogFiles = false</strong>: Inserts of new data go only to parquet files. In this case, the same configurations as above for the COPY_ON_WRITE case apply.</li></ul><p>NOTE: In either case, small files will be auto-sized only if there is no PENDING compaction or associated log file for that particular file slice. For example, for case 1: if you had a log file and a compaction C1 was scheduled to convert that log file to parquet, no more inserts can go into that log file. For case 2: if you had a parquet file and an update ended up creating an associated delta log file, no more inserts can go into that parquet file. Only after the compaction has been performed and there are NO log files associated with the base parquet file can new inserts be sent to auto-size that parquet file.</p><p>b) <a href="https://hudi.apache.org/blog/2021/01/27/hudi-clustering-intro" target="_blank" rel="noopener noreferrer"><strong>Clustering</strong></a>: This is a feature in Hudi to group small files into larger ones, either synchronously or asynchronously. Since the first solution of auto-sizing small files trades off ingestion speed (the small files are sized during ingestion), clustering comes to the rescue if your use case is very sensitive to ingestion latency and you don&#x27;t want to compromise on ingestion speed, which may otherwise end up creating a lot of small files. Clustering can be scheduled through the ingestion job, and an asynchronous job can stitch small files together in the background to generate larger files. NOTE that during this, ingestion can continue to run concurrently.</p><p><em>Please note that Hudi always creates immutable files on disk. To be able to do auto-sizing or clustering, Hudi will always create a newer version of the smaller file, resulting in 2 versions of the same file. The cleaner service will later kick in and delete the older version small file and keep the latest one.</em></p>
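<p>A sketch of inline clustering configs (the commit frequency and size thresholds here are illustrative, not recommendations):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Schedule and execute clustering inline every 4 commits, stitching files under 100MB toward ~1GB targets</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.clustering.inline&quot;, &quot;true&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.clustering.inline.max.commits&quot;, &quot;4&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.clustering.plan.strategy.small.file.limit&quot;, (100 * 1024 * 1024L).toString)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.clustering.plan.strategy.target.file.max.bytes&quot;, (1024L * 1024 * 1024).toString)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>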
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-use-deltastreamer-or-spark-datasource-api-to-write-to-a-non-partitioned-hudi-table-">How do I use DeltaStreamer or Spark DataSource API to write to a non-partitioned Hudi table?<a class="hash-link" href="#how-do-i-use-deltastreamer-or-spark-datasource-api-to-write-to-a-non-partitioned-hudi-table-" title="Direct link to heading"></a></h3><p>Hudi supports writing to non-partitioned tables. For writing to a non-partitioned Hudi table and performing hive table syncing, you need to set the below configurations in the properties passed:</p><div class="codeBlockContainer_J+bg language-plain theme-code-block"><div class="codeBlockContent_csEI plain"><pre tabindex="0" class="prism-code language-plain codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.NonPartitionedExtractor</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-can-i-reduce-table-versions-created-by-hudi-in-aws-glue-data-catalog-metastore">How can I reduce table versions created by Hudi in AWS Glue Data Catalog/metastore?<a class="hash-link" href="#how-can-i-reduce-table-versions-created-by-hudi-in-aws-glue-data-catalog-metastore" title="Direct link to heading"></a></h3><p>With each commit, Hudi creates a new table version in the metastore. This can be reduced by setting the option <a href="https://hudi.apache.org/docs/configurations#hoodiedatasourcemeta_syncconditionsync" target="_blank" rel="noopener noreferrer">hoodie.datasource.meta_sync.condition.sync</a> to true. This ensures that hive sync is triggered only on schema or partition changes.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="if-there-are-failed-writes-in-my-timeline-do-i-see-duplicates">If there are failed writes in my timeline, do I see duplicates?<a class="hash-link" href="#if-there-are-failed-writes-in-my-timeline-do-i-see-duplicates" title="Direct link to heading"></a></h3><p>No, Hudi does not expose uncommitted files/blocks to the readers. Further, Hudi strives to automatically manage the table for the user, by actively cleaning up files created from failed/aborted writes. See the <a href="https://hudi.apache.org/blog/2021/08/18/improving-marker-mechanism/" target="_blank" rel="noopener noreferrer">marker mechanism</a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-are-conflicts-detected-in-hudi-between-multiple-writers">How are conflicts detected in Hudi between multiple writers?<a class="hash-link" href="#how-are-conflicts-detected-in-hudi-between-multiple-writers" title="Direct link to heading"></a></h3><p>Hudi employs <a href="https://hudi.apache.org/docs/concurrency_control#supported-concurrency-controls" target="_blank" rel="noopener noreferrer">optimistic concurrency control</a> between writers, while implementing MVCC-based concurrency control between writers and the table services. Concurrent writers to the same table need to be configured with the same lock provider configuration to safely perform writes. 
By default (implemented in “<a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/SimpleConcurrentFileWritesConflictResolutionStrategy.java" target="_blank" rel="noopener noreferrer">SimpleConcurrentFileWritesConflictResolutionStrategy</a>”), Hudi allows multiple writers to concurrently write data and commit to the timeline if there are no conflicting writes to the same underlying file group IDs. This is achieved by holding a lock and checking for changes that modified the same file IDs. Hudi then supports a pluggable interface “<a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java" target="_blank" rel="noopener noreferrer">ConflictResolutionStrategy</a>” that determines how conflicts are handled. By default, the later conflicting write is aborted. Hudi also supports eager conflict detection to help speed up conflict detection and release cluster resources back early to reduce costs.</p>
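<p>As a sketch, enabling optimistic concurrency control with the ZooKeeper-based lock provider could look like the following; the ZooKeeper endpoint and paths are illustrative, and every writer to the table must carry the same lock configuration:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Enable OCC; all concurrent writers must share this lock provider configuration</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.concurrency.mode&quot;, &quot;optimistic_concurrency_control&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.cleaner.policy.failed.writes&quot;, &quot;LAZY&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.lock.provider&quot;, &quot;org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.lock.zookeeper.url&quot;, &quot;zk-host&quot;) // illustrative endpoint</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.lock.zookeeper.port&quot;, &quot;2181&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.lock.zookeeper.lock_key&quot;, &quot;my_hudi_table&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.lock.zookeeper.base_path&quot;, &quot;/hudi/locks&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>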
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-single-writer-inserts-have-duplicates">Can single-writer inserts have duplicates?<a class="hash-link" href="#can-single-writer-inserts-have-duplicates" title="Direct link to heading"></a></h3><p>By default, Hudi turns off key-based de-duplication for INSERT/BULK_INSERT operations, and thus the table could contain duplicates. If users believe they have duplicates in inserts, they can either issue UPSERT or consider specifying the option to de-duplicate input, either in the datasource using <a href="https://hudi.apache.org/docs/configurations#hoodiedatasourcewriteinsertdropduplicates" target="_blank" rel="noopener noreferrer"><code>hoodie.datasource.write.insert.drop.duplicates</code></a> &amp; <a href="https://hudi.apache.org/docs/configurations/#hoodiecombinebeforeinsert" target="_blank" rel="noopener noreferrer"><code>hoodie.combine.before.insert</code></a> or in deltastreamer using <a href="https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L229" target="_blank" rel="noopener noreferrer"><code>--filter-dupes</code></a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-concurrent-inserts-cause-duplicates">Can concurrent inserts cause duplicates?<a class="hash-link" href="#can-concurrent-inserts-cause-duplicates" title="Direct link to heading"></a></h3><p>Yes. As mentioned before, the default conflict detection strategy only checks for conflicting updates to the same file group IDs. In the case of concurrent inserts, inserted records end up creating new file groups and thus can go undetected. Most common workload patterns use the multi-writer capability to run ingestion of new data while concurrently backfilling/deleting older data, with NO overlap in the primary keys of the records. However, key-level detection can be implemented (or better yet, contributed) via a new “<a href="https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/ConflictResolutionStrategy.java" target="_blank" rel="noopener noreferrer">ConflictResolutionStrategy</a>” that reads out the keys of new conflicting operations, checks the uncommitted data against other concurrent writes, and then decides whether to commit or abort. This is a deliberate tradeoff, avoiding the additional cost of reading keys on the most common workloads. Historically, users have preferred to keep this under their own control to save costs; e.g., we turned off de-duplication for inserts due to the same feedback. Hudi already supports a pre-commit validator mechanism where such checks can be authored as well.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="querying-tables">Querying Tables<a class="hash-link" href="#querying-tables" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="does-deleted-records-appear-in-hudis-incremental-query-results">Do deleted records appear in Hudi&#x27;s incremental query results?<a class="hash-link" href="#does-deleted-records-appear-in-hudis-incremental-query-results" title="Direct link to heading"></a></h3><p>Soft deletes (unlike hard deletes) do appear in incremental pull query results. So, if you need a mechanism to propagate deletes to downstream tables, you can use soft deletes.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-pass-hudi-configurations-to-my-beeline-hive-queries">How do I pass hudi configurations to my beeline Hive queries?<a class="hash-link" href="#how-do-i-pass-hudi-configurations-to-my-beeline-hive-queries" title="Direct link to heading"></a></h3><p>If Hudi&#x27;s input format is not picked, the returned results may be incorrect. To ensure the correct input format is picked, please use <code>org.apache.hadoop.hive.ql.io.HiveInputFormat</code> or <code>org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat</code> for the <code>hive.input.format</code> config. This can be set as shown below:</p><div class="codeBlockContainer_J+bg language-plain theme-code-block"><div class="codeBlockContent_csEI plain"><pre tabindex="0" class="prism-code language-plain codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>or</p><div class="codeBlockContainer_J+bg language-plain theme-code-block"><div class="codeBlockContent_csEI plain"><pre tabindex="0" class="prism-code language-plain codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="does-hudi-guarantee-consistent-reads-how-to-think-about-read-optimized-queries">Does Hudi guarantee consistent reads? How to think about read optimized queries?<a class="hash-link" href="#does-hudi-guarantee-consistent-reads-how-to-think-about-read-optimized-queries" title="Direct link to heading"></a></h3><p>Hudi does offer consistent reads. To read the latest snapshot of a MOR table, a user should use a snapshot query. 
The <a href="https://hudi.apache.org/docs/table_types#query-types" target="_blank" rel="noopener noreferrer">read-optimized queries</a> (targeted at the MOR table ONLY) are an add-on benefit that provides users a practical tradeoff, decoupling writer performance from query performance and leveraging the fact that most queries only touch the most recent data in the table.</p><p>Hudi’s read-optimized query is targeted at the MOR table only, with guidance around how compaction should be run to achieve predictable results. In the MOR table, the compaction, which runs every few commits (or “deltacommit” to be exact for the MOR table) by default, merges the base (parquet) file and corresponding change log files to a new base file within each file group, so that the snapshot query serving the latest data immediately after compaction reads the base files only.  Similarly, the read-optimized query always reads the base files only as of the latest compaction commit, usually a few commits before the latest commit, which is still a valid table state.</p><p>Users must use snapshot queries to read the latest snapshot of a MOR table.  Popular engines including Spark, Presto, and Hive already support snapshot queries on MOR tables, and the snapshot query support in Trino is in progress (the <a href="https://github.com/trinodb/trino/pull/14786" target="_blank" rel="noopener noreferrer">PR</a> is under review).  Note that the read-optimized query does not apply to the COW table.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="table-services">Table Services<a class="hash-link" href="#table-services" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="what-does-the-hudi-cleaner-do">What does the Hudi cleaner do?<a class="hash-link" href="#what-does-the-hudi-cleaner-do" title="Direct link to heading"></a></h3><p>The Hudi cleaner process often runs right after a commit or deltacommit and goes about deleting old files that are no longer needed. If you are using the incremental pull feature, then ensure you configure the cleaner to <a href="https://hudi.apache.org/docs/configurations#hoodiecleanercommitsretained" target="_blank" rel="noopener noreferrer">retain a sufficient number of recent commits</a> to rewind to. Another consideration is to provide sufficient time for your long-running jobs to finish. Otherwise, the cleaner could delete a file that is being (or could be) read by the job, causing the job to fail. Typically, the default configuration of 10 allows an ingestion job running every 30 minutes to retain up to 5 hours&#x27; worth of data. If you run ingestion more frequently or if you want to give queries more running time, consider increasing the value of the config: <code>hoodie.cleaner.commits.retained</code></p>
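<p>For example (a sketch; 20 is an illustrative value, not a recommendation):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Retain the last 20 commits, widening the rewind window for incremental pulls and long-running queries</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.cleaner.commits.retained&quot;, &quot;20&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>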
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-run-compaction-for-a-mor-table">How do I run compaction for a MOR table?<a class="hash-link" href="#how-do-i-run-compaction-for-a-mor-table" title="Direct link to heading"></a></h3><p>The simplest way to run compaction on a MOR table is to run the <a href="https://hudi.apache.org/docs/configurations#hoodiecompactinline" target="_blank" rel="noopener noreferrer">compaction inline</a>, at the cost of spending more time ingesting; this can be particularly useful in common cases where you have a small amount of late-arriving data trickling into older partitions. In such a scenario, you may want to just aggressively compact the last N partitions while waiting for enough logs to accumulate for older partitions. The net effect is that you have converted most of the recent data, which is more likely to be queried, to the optimized columnar format.</p><p>That said, for the obvious reason of not blocking ingestion for compaction, you may want to run it asynchronously as well. This can be done via a separate <a href="https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java" target="_blank" rel="noopener noreferrer">compaction job</a> scheduled independently by your workflow scheduler/notebook. If you are using delta streamer, then you can run in <a href="https://github.com/apache/hudi/blob/d3edac4612bde2fa9deca9536801dbc48961fb95/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java#L241" target="_blank" rel="noopener noreferrer">continuous mode</a>, where the ingestion and compaction are both managed concurrently in a single spark runtime.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="what-options-do-i-have-for-asynchronousoffline-compactions-on-mor-table">What options do I have for asynchronous/offline compactions on MOR table?<a class="hash-link" href="#what-options-do-i-have-for-asynchronousoffline-compactions-on-mor-table" title="Direct link to heading"></a></h3><p>There are a couple of options depending on how you write to Hudi. But first, let us briefly understand what is involved. There are two parts to compaction:</p><ul><li>Scheduling: In this step, Hudi scans the partitions and selects file slices to be compacted. A compaction plan is finally written to the Hudi timeline. Scheduling needs tighter coordination with other writers (regular ingestion is considered one of the writers). If scheduling is done inline with the ingestion job, this coordination is automatically taken care of. Otherwise, when scheduling happens asynchronously, a lock provider needs to be configured for this coordination among multiple writers.</li><li>Execution: In this step, the compaction plan is read and file slices are compacted. Execution doesn&#x27;t need the same level of coordination with other writers as the scheduling step and can be decoupled from the ingestion job easily.</li></ul><p>Depending on how you write to Hudi, these are the possible options currently:</p>
<ul><li>DeltaStreamer:<ul><li>In continuous mode, asynchronous compaction is achieved by default. Here scheduling is done by the ingestion job inline, and compaction execution is achieved asynchronously by a separate parallel thread.</li><li>In non-continuous mode, only inline compaction is possible.</li><li>Please note that in either mode, passing --disable-compaction disables compaction completely.</li></ul></li><li>Spark datasource:<ul><li>Async scheduling and async execution can be achieved by periodically running an offline Hudi Compactor Utility or Hudi CLI. However, this needs a lock provider to be configured.</li><li>Alternately, from 0.11.0, to avoid the dependency on lock providers, scheduling alone can be done inline by the regular writer using the config <code>hoodie.compact.schedule.inline</code>, and compaction execution can be done offline by periodically triggering the Hudi Compactor Utility or Hudi CLI (see the sketch after this list).</li></ul></li><li>Spark structured streaming:<ul><li>Compactions are scheduled and executed asynchronously inside the streaming job. Async compactions are enabled by default for structured streaming jobs on Merge-On-Read tables.</li><li>Please note it is not possible to disable async compaction for a MOR table with spark structured streaming.</li></ul></li><li>Flink:<ul><li>Async compaction is enabled by default for Merge-On-Read tables.</li><li>Offline compaction can be achieved by setting <code>compaction.async.enabled</code> to <code>false</code> and periodically running the <a href="https://hudi.apache.org/docs/compaction/#flink-offline-compaction" target="_blank" rel="noopener noreferrer">Flink offline Compactor</a>. When running the offline compactor, one needs to ensure there are no active writes to the table.</li><li>A third option (highly recommended over the second one) is to schedule the compactions from the regular ingestion job and execute the compaction plans from an offline job. To achieve this, set <code>compaction.async.enabled</code> to <code>false</code> and <code>compaction.schedule.enabled</code> to <code>true</code>, and then run the <a href="https://hudi.apache.org/docs/compaction/#flink-offline-compaction" target="_blank" rel="noopener noreferrer">Flink offline Compactor</a> periodically to execute the plans.</li></ul></li></ul>
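<p>As referenced in the Spark datasource option above, here is a sketch of a writer that only schedules compaction inline, leaving execution to a separately triggered Hudi Compactor Utility or Hudi CLI run:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// The regular writer only writes compaction plans to the timeline; execution happens offline</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.compact.inline&quot;, &quot;false&quot;)          // do not execute compaction inline</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.compact.schedule.inline&quot;, &quot;true&quot;)  // but do schedule it inline</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>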
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-to-disable-all-table-services-in-case-of-multiple-writers">How to disable all table services in case of multiple writers?<a class="hash-link" href="#how-to-disable-all-table-services-in-case-of-multiple-writers" title="Direct link to heading"></a></h3><p><a href="https://hudi.apache.org/docs/configurations#hoodietableservicesenabled" target="_blank" rel="noopener noreferrer">hoodie.table.services.enabled</a> is an umbrella config that can be used to turn off all table services at once, without having to individually disable them. This is handy in use cases where multiple writers are doing ingestion. While one of the main pipelines can take care of the table services, other ingestion pipelines can disable them to avoid frequently triggering cleaning/clustering etc. This does not apply to single-writer scenarios.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="why-does-hudi-retain-at-least-one-previous-commit-even-after-setting-hoodiecleanercommitsretained-1-">Why does Hudi retain at least one previous commit even after setting hoodie.cleaner.commits.retained: 1?<a class="hash-link" href="#why-does-hudi-retain-at-least-one-previous-commit-even-after-setting-hoodiecleanercommitsretained-1-" title="Direct link to heading"></a></h3><p>Hudi runs the cleaner to remove old file versions as part of writing data, either inline or in asynchronous mode (0.6.0 onwards). The Hudi cleaner retains at least one previous commit when cleaning old file versions. This prevents concurrently running queries, which are reading the latest file versions, from suddenly seeing those files deleted by the cleaner because a new file version was added. In other words, retaining at least one previous commit is needed to ensure snapshot isolation for readers.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-i-get-notified-when-new-commits-happen-in-my-hudi-table">Can I get notified when new commits happen in my Hudi table?<a class="hash-link" href="#can-i-get-notified-when-new-commits-happen-in-my-hudi-table" title="Direct link to heading"></a></h3><p>Yes. Hudi provides the ability to post a callback notification about a write commit. You can use an HTTP hook, choose to be notified via a Kafka/Pulsar topic, or plug in your own implementation to get notified. Please refer <a href="https://hudi.apache.org/docs/writing_data/#commit-notifications" target="_blank" rel="noopener noreferrer">here</a> for details.</p>
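<p>For instance, the HTTP callback that ships with Hudi can be switched on as below (the endpoint URL is illustrative):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">// Post a notification to an HTTP endpoint after every successful commit</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write.format(&quot;org.apache.hudi&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.commit.callback.on&quot;, &quot;true&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(&quot;hoodie.write.commit.callback.http.url&quot;, &quot;https://example.com/hudi-commit-hook&quot;)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>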
<h2 class="anchor anchorWithStickyNavbar_y2LR" id="storage">Storage<a class="hash-link" href="#storage" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="does-hudi-support-cloud-storageobject-stores">Does Hudi support cloud storage/object stores?<a class="hash-link" href="#does-hudi-support-cloud-storageobject-stores" title="Direct link to heading"></a></h3><p>Yes. Generally speaking, Hudi is able to provide its functionality on any Hadoop FileSystem implementation and thus can read and write tables on <a href="https://hudi.apache.org/docs/cloud" target="_blank" rel="noopener noreferrer">Cloud stores</a> (Amazon S3, Microsoft Azure or Google Cloud Storage). Over time, Hudi has also incorporated specific design aspects that make building Hudi tables on the cloud easy, such as <a href="https://hudi.apache.org/docs/configurations#hoodieconsistencycheckenabled" target="_blank" rel="noopener noreferrer">consistency checks for S3</a> and zero moves/renames for data files.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="what-is-the-difference-between-copy-on-write-cow-vs-merge-on-read-mor-table-types">What is the difference between copy-on-write (COW) vs merge-on-read (MOR) table types?<a class="hash-link" href="#what-is-the-difference-between-copy-on-write-cow-vs-merge-on-read-mor-table-types" title="Direct link to heading"></a></h3><p><strong>Copy On Write</strong> - This storage type enables clients to ingest data on columnar file formats, currently parquet. Any new data that is written to a Hudi table using the COW storage type will write new parquet files. Updating an existing set of rows will result in a rewrite of the entire parquet files that collectively contain the affected rows. Hence, all writes to such tables are limited by parquet writing performance; the larger the parquet file, the higher the time taken to ingest the data.</p><p><strong>Merge On Read</strong> - This storage type enables clients to ingest data quickly onto a row-based data format such as avro. Any new data that is written to a Hudi table using the MOR table type will write new log/delta files that internally store the data as avro-encoded bytes. A compaction process (configured as inline or asynchronous) will convert the log file format to the columnar file format (parquet). Two different InputFormats expose two different views of this data: the read-optimized view exposes columnar parquet reading performance, while the realtime view exposes columnar and/or log reading performance. Updating an existing set of rows will result in either (a) a companion log/delta file for an existing base parquet file generated from a previous compaction or (b) an update written to a log/delta file, in case no compaction ever happened for it. Hence, all writes to such tables are limited by avro/log file writing performance, which is much faster than parquet. There is, however, a higher cost to pay to read log/delta files vs columnar (parquet) files.</p><p>More details can be found <a href="https://hudi.apache.org/docs/concepts/" target="_blank" rel="noopener noreferrer">here</a> and also in <a href="https://cwiki.apache.org/confluence/display/HUDI/Design+And+Architecture" target="_blank" rel="noopener noreferrer">Design And Architecture</a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-migrate-my-data-to-hudi">How do I migrate my data to Hudi?<a class="hash-link" href="#how-do-i-migrate-my-data-to-hudi" title="Direct link to heading"></a></h3><p>Hudi provides built-in support for rewriting your entire table into Hudi one-time using the HDFSParquetImporter tool available from the hudi-cli. You could also do this via a simple read and write of the dataset using the Spark datasource APIs. Once migrated, writes can be performed using the normal means discussed <a href="https://hudi.apache.org/docs/faq#what-are-some-ways-to-write-a-hudi-table" target="_blank" rel="noopener noreferrer">here</a>. This topic is discussed in detail <a href="https://hudi.apache.org/docs/migration_guide/" target="_blank" rel="noopener noreferrer">here</a>, including ways of doing partial migrations.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-to-convert-an-existing-cow-table-to-mor">How to convert an existing COW table to MOR?<a class="hash-link" href="#how-to-convert-an-existing-cow-table-to-mor" title="Direct link to heading"></a></h3><p>All you need to do is edit the table type property in hoodie.properties (located at hudi_table_path/.hoodie/hoodie.properties). But manually changing it will result in checksum errors, so we have to go via hudi-cli:</p><ol><li>Copy the existing hoodie.properties to a new location.</li><li>Edit the table type to MERGE_ON_READ.</li><li>Launch hudi-cli.</li><li>Run connect --path hudi_table_path</li><li>Run repair overwrite-hoodie-props --new-props-file new_hoodie.properties</li></ol><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-can-i-find-the-average-record-size-in-a-commit">How can I find the average record size in a commit?<a class="hash-link" href="#how-can-i-find-the-average-record-size-in-a-commit" title="Direct link to heading"></a></h3><p>The <code>commit showpartitions</code> command in <a href="https://hudi.apache.org/docs/cli" target="_blank" rel="noopener noreferrer">HUDI CLI</a> will show both &quot;bytes written&quot; and &quot;records inserted&quot;. Divide the bytes written by the records inserted to find the average size. Note that this assumes the metadata overhead is negligible; for a small table (such as 5 columns, 100 records) this will not be the case.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-does-the-hudi-indexing-work--what-are-its-benefits">How does the Hudi indexing work &amp; what are its benefits?<a class="hash-link" href="#how-does-the-hudi-indexing-work--what-are-its-benefits" title="Direct link to heading"></a></h3><p>The indexing component is a key part of Hudi writing, and it consistently maps a given recordKey to a fileGroup inside Hudi. 
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-can-i-find-the-average-record-size-in-a-commit">How can I find the average record size in a commit?<a class="hash-link" href="#how-can-i-find-the-average-record-size-in-a-commit" title="Direct link to heading"></a></h3><p>The <code>commit showpartitions</code> command in the <a href="https://hudi.apache.org/docs/cli" target="_blank" rel="noopener noreferrer">HUDI CLI</a> will show both &quot;bytes written&quot; and &quot;records inserted&quot;. Divide the bytes written by the records inserted to find the average size. Note that this assumes metadata overhead is negligible; for a small table (such as 5 columns, 100 records) this will not be the case.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-does-the-hudi-indexing-work--what-are-its-benefits">How does the Hudi indexing work &amp; what are its benefits?<a class="hash-link" href="#how-does-the-hudi-indexing-work--what-are-its-benefits" title="Direct link to heading"></a></h3><p>The indexing component is a key part of Hudi writing: it consistently maps a given recordKey to a fileGroup inside Hudi. This enables faster identification of the file groups that are affected/dirtied by a given write operation.</p><p>Hudi supports the following indexing options:</p><ul><li><em>HoodieBloomIndex</em>: Uses a bloom filter and range information placed in the footer of parquet/base files (and soon log files as well).</li><li><em>HoodieGlobalBloomIndex</em>: Non-global indexing only enforces uniqueness of a key inside a single partition, i.e., the user is expected to know the partition under which a given record key is stored. This helps the indexing scale very well even for <a href="https://eng.uber.com/uber-big-data-platform/" target="_blank" rel="noopener noreferrer">very large datasets</a>. However, in some cases it is necessary to de-duplicate/enforce uniqueness across all partitions instead, and the global bloom index does exactly that: incoming records are compared to files across the entire table, ensuring a recordKey is present in only one partition.</li><li><em>HBaseIndex</em>: Apache HBase is a key-value store, typically found in close proximity to HDFS. You can also store the index inside HBase, which could be handy if you are already operating HBase.</li><li><em>HoodieSimpleIndex (default)</em>: A simple index which reads the fields of interest (record key and partition path) from base files and joins them with incoming records to find the tagged location.</li><li><em>HoodieGlobalSimpleIndex</em>: Global version of the simple index, wherein uniqueness is enforced on the record key across the entire table.</li><li><em>HoodieBucketIndex</em>: Each partition has statically defined buckets to which records are tagged. Since locations are tagged via a hashing mechanism, index lookups are very efficient.</li><li><em>HoodieSparkConsistentBucketIndex</em>: Similar to the bucket index, except that data skew can be tackled by dynamically changing the number of buckets.</li></ul><p>You can implement your own index if you&#x27;d like, by subclassing the <code>HoodieIndex</code> class and configuring the index class name in configs. The built-in index types are likewise picked via configs, as sketched below.</p>
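<p>A minimal sketch of choosing an index type on a Spark datasource write via the <code>hoodie.index.type</code> config (the DataFrame <code>df</code> and <code>basePath</code> are placeholders):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">// Minimal sketch: select the index type used on the write path.
// Valid values include BLOOM, GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE, HBASE and BUCKET.
df.write.format(&quot;hudi&quot;)
  .option(&quot;hoodie.index.type&quot;, &quot;GLOBAL_BLOOM&quot;)
  // ... record key, partition path and other write options
  .mode(&quot;append&quot;)
  .save(basePath)
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>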
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-i-switch-from-one-index-type-to-another-without-having-to-rewrite-the-entire-table">Can I switch from one index type to another without having to rewrite the entire table?<a class="hash-link" href="#can-i-switch-from-one-index-type-to-another-without-having-to-rewrite-the-entire-table" title="Direct link to heading"></a></h3><p>It should be okay to switch between the Bloom index and the Simple index as long as they are not global. Moving from global to non-global and vice versa may not work. Also, switching between HBase (a global index) and regular Bloom might not work.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="i-have-an-existing-dataset-and-want-to-evaluate-hudi-using-portion-of-that-data-">I have an existing dataset and want to evaluate Hudi using a portion of that data?<a class="hash-link" href="#i-have-an-existing-dataset-and-want-to-evaluate-hudi-using-portion-of-that-data-" title="Direct link to heading"></a></h3><p>You can bulk import a portion of that data into a new Hudi table. For example, if you want to try it on a month of data:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">// Assumes: import org.apache.spark.sql.SaveMode
// and import org.apache.hudi.DataSourceWriteOptions._ (for the *_OPT_KEY constants)
spark.read.parquet(&quot;your_data_set/path/to/month&quot;)
  .write.format(&quot;org.apache.hudi&quot;)
  .option(&quot;hoodie.datasource.write.operation&quot;, &quot;bulk_insert&quot;)
  .option(&quot;hoodie.datasource.write.storage.type&quot;, &quot;storage_type&quot;) // COPY_ON_WRITE or MERGE_ON_READ
  .option(RECORDKEY_FIELD_OPT_KEY, &quot;&lt;your key&gt;&quot;)
  .option(PARTITIONPATH_FIELD_OPT_KEY, &quot;&lt;your_partition&gt;&quot;)
  ...
  .mode(SaveMode.Append)
  .save(basePath)
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Once you have the initial copy, you can simply run upsert operations on it by selecting some sample of data every round:</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">spark.read.parquet(&quot;your_data_set/path/to/month&quot;).limit(n) // limit to n records
  .write.format(&quot;org.apache.hudi&quot;)
  .option(&quot;hoodie.datasource.write.operation&quot;, &quot;upsert&quot;)
  .option(RECORDKEY_FIELD_OPT_KEY, &quot;&lt;your key&gt;&quot;)
  .option(PARTITIONPATH_FIELD_OPT_KEY, &quot;&lt;your_partition&gt;&quot;)
  ...
  .mode(SaveMode.Append)
  .save(basePath)
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>For a merge-on-read table, you may also want to try scheduling and running compaction jobs. You can run compaction directly using spark-submit on org.apache.hudi.utilities.HoodieCompactor or by using the <a href="https://hudi.apache.org/docs/cli" target="_blank" rel="noopener noreferrer">HUDI CLI</a>.</p>
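<p>For quick experiments, compaction can also be configured to run inline as part of each write; a minimal sketch using Hudi&#x27;s inline compaction configs (the DataFrame, path, and the 4-delta-commit trigger are illustrative):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">// Minimal sketch: inline compaction on a MOR table, handy for evaluation
// (production deployments usually run compaction asynchronously instead).
df.write.format(&quot;hudi&quot;)
  .option(&quot;hoodie.datasource.write.table.type&quot;, &quot;MERGE_ON_READ&quot;)
  .option(&quot;hoodie.compact.inline&quot;, &quot;true&quot;)
  .option(&quot;hoodie.compact.inline.max.delta.commits&quot;, &quot;4&quot;) // compact every 4 delta commits
  // ... record key, partition path and other write options
  .mode(&quot;append&quot;)
  .save(basePath)
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>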
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="why-does-maintain-record-level-commit-metadata-isnt-tracking-table-version-at-file-level-good-enough">Why does Hudi maintain record-level commit metadata? Isn&#x27;t tracking table version at file level good enough?<a class="hash-link" href="#why-does-maintain-record-level-commit-metadata-isnt-tracking-table-version-at-file-level-good-enough" title="Direct link to heading"></a></h3><p>By generating a commit time ahead of time, Hudi is able to stamp each record with, effectively, the transaction id of the commit it is part of, enabling record-level change tracking. This means that even if a file is compacted/clustered (<a href="https://hudi.apache.org/docs/clustering#how-is-compaction-different-from-clustering" target="_blank" rel="noopener noreferrer">they mean different things in Hudi</a>) many times in between incremental queries, we are able to <a href="https://hudi.apache.org/blog/2023/05/19/hudi-metafields-demystified" target="_blank" rel="noopener noreferrer">preserve the history of the records</a>. Furthermore, Hudi is able to leverage compaction to amortize the cost of &quot;catching up&quot; for incremental readers, by handing out the latest state of a record after a point in time, which is orders of magnitude more efficient than processing each intermediate change. Other similar systems lack this decoupling of change streams from the physical files the records were part of, and their core table management services are not aware of the history of records. Similar approaches of record-level metadata fields for efficient incremental processing have also been applied in other leading industry <a href="https://twitter.com/apachehudi/status/1676021143697002496?s=20" target="_blank" rel="noopener noreferrer">data warehouses</a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="why-partition-fields-are-also-stored-in-parquet-files-in-addition-to-the-partition-path-">Why are partition fields also stored in parquet files in addition to the partition path?<a class="hash-link" href="#why-partition-fields-are-also-stored-in-parquet-files-in-addition-to-the-partition-path-" title="Direct link to heading"></a></h3><p>Hudi supports customizable partition values, which could be a derived value of another field. Also, storing the partition value only in the partition path would result in losing type information when queried by various query engines.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-configure-bloom-filter-when-bloomglobal_bloom-index-is-used">How do I configure Bloom filter (when Bloom/Global_Bloom index is used)?<a class="hash-link" href="#how-do-i-configure-bloom-filter-when-bloomglobal_bloom-index-is-used" title="Direct link to heading"></a></h3><p>Bloom filters are used in bloom indexes to look up the location of record keys in the write path. They are used only when the index type is chosen as &quot;BLOOM&quot; or &quot;GLOBAL_BLOOM&quot;. Hudi has a few config knobs that users can use to tune their bloom filters.</p><p>At a high level, Hudi has two types of bloom filters: simple and dynamic.</p><p>Simple, as the name suggests, is simple: its size is statically allocated based on a few configs.</p>
<p><code>hoodie.bloom.index.filter.type</code>: SIMPLE</p><p><code>hoodie.index.bloom.num_entries</code> refers to the total number of entries per bloom filter, which corresponds to one file slice. The default value is 60000.</p><p><code>hoodie.index.bloom.fpp</code> refers to the false positive probability of the bloom filter. Default value: 1*10^-9.</p><p>The size of the bloom filter depends on these two values, and is statically allocated following the standard bloom filter sizing formula (approximately -n*ln(p)/(ln 2)^2 bits in memory for n entries and false positive probability p). As long as the total number of entries added stays within the configured <code>hoodie.index.bloom.num_entries</code> value, the fpp will be honored; with the default values of 60k and 1*10^-9, the serialized bloom filter size is about 430KB. But if more entries are added, the false positive probability will no longer be honored: more false positives may be returned if you add more entries than the configured value. So, users are expected to set the right values for both num_entries and fpp.</p><p>Hudi suggests files of roughly 100 to 120MB for better query performance. So, based on the record size, one can determine how many records fit into one data file.</p><p>Let&#x27;s say your data file max size is 128MB and your average record size is 1024 bytes. That translates to roughly 130k entries per data file, so for this config you should set num_entries to ~130k.</p><p>Dynamic bloom filter:</p><p><code>hoodie.bloom.index.filter.type</code>: DYNAMIC</p><p>This is an advanced version of the bloom filter which grows dynamically as the number of entries grows, so users are expected to set two values with respect to num_entries: <code>hoodie.index.bloom.num_entries</code> determines the starting size of the bloom filter, and <code>hoodie.bloom.index.filter.dynamic.max.entries</code> determines the max size to which it can grow. The fpp needs to be set just as for the simple bloom filter. The initial size is allotted based on <code>hoodie.index.bloom.num_entries</code>; once the number of entries reaches this value, the bloom filter dynamically doubles its size, and this continues until the size reaches the <code>hoodie.bloom.index.filter.dynamic.max.entries</code> value. Until the size reaches that max value, the fpp will be honored; if the entries added exceed the max value, the fpp may not be honored.</p>
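<p>A minimal sketch of wiring these knobs into a write, sized for the ~130k-entries-per-file example above (the DataFrame, path, and exact values are illustrative):</p><div class="codeBlockContainer_J+bg language-scala theme-code-block"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">// Minimal sketch: tune the bloom filter backing the BLOOM index.
df.write.format(&quot;hudi&quot;)
  .option(&quot;hoodie.index.type&quot;, &quot;BLOOM&quot;)
  .option(&quot;hoodie.bloom.index.filter.type&quot;, &quot;SIMPLE&quot;)
  .option(&quot;hoodie.index.bloom.num_entries&quot;, &quot;130000&quot;) // ~128MB files / 1KB records
  .option(&quot;hoodie.index.bloom.fpp&quot;, &quot;0.000000001&quot;)    // 1*10^-9
  // ... record key, partition path and other write options
  .mode(&quot;append&quot;)
  .save(basePath)
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>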
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-do-i-verify-datasource-schema-reconciliation-in-hudi">How do I verify datasource schema reconciliation in Hudi?<a class="hash-link" href="#how-do-i-verify-datasource-schema-reconciliation-in-hudi" title="Direct link to heading"></a></h3><p>With Hudi you can reconcile schemas, meaning you can apply the target table&#x27;s schema to your incoming data, so if there&#x27;s a missing field in your batch it will be injected with a null value. You can enable schema reconciliation using the <a href="https://hudi.apache.org/docs/configurations/#hoodiedatasourcewritereconcileschema" target="_blank" rel="noopener noreferrer">hoodie.datasource.write.reconcile.schema</a> config.</p><p>Here is an example of how schema reconciliation works with Spark (PySpark):</p><div class="codeBlockContainer_J+bg language-python theme-code-block"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"># Assumes an active SparkSession `spark` (with Hudi support) and its SparkContext `sc`
base_path = &quot;/tmp/test_recon1&quot;  # example table path

hudi_options = {
    &#x27;hoodie.table.name&#x27;: &quot;test_recon1&quot;,
    &#x27;hoodie.datasource.write.recordkey.field&#x27;: &#x27;uuid&#x27;,
    &#x27;hoodie.datasource.write.table.name&#x27;: &quot;test_recon1&quot;,
    &#x27;hoodie.datasource.write.precombine.field&#x27;: &#x27;ts&#x27;,
    &#x27;hoodie.upsert.shuffle.parallelism&#x27;: 2,
    &#x27;hoodie.insert.shuffle.parallelism&#x27;: 2,
    &quot;hoodie.datasource.write.hive_style_partitioning&quot;: &quot;true&quot;,
    &quot;hoodie.datasource.write.reconcile.schema&quot;: &quot;true&quot;,
    &quot;hoodie.datasource.hive_sync.jdbcurl&quot;: &quot;thrift://localhost:9083&quot;,
    &quot;hoodie.datasource.hive_sync.database&quot;: &quot;hudi&quot;,
    &quot;hoodie.datasource.hive_sync.table&quot;: &quot;test_recon1&quot;,
    &quot;hoodie.datasource.hive_sync.enable&quot;: &quot;true&quot;,
    &quot;hoodie.datasource.hive_sync.mode&quot;: &quot;hms&quot;
}

some_json = &#x27;{&quot;uuid&quot;:1,&quot;ts&quot;:1,&quot;Url&quot;:&quot;hudi.apache.com&quot;}&#x27;
df = spark.read.json(sc.parallelize([some_json]))

df.write.format(&quot;hudi&quot;).mode(&quot;append&quot;).options(**hudi_options).save(base_path)

spark.sql(&quot;select * from hudi.test_recon1&quot;).show()

# The second batch is missing the &quot;Url&quot; field; reconciliation injects it as null
missing_field_json = &#x27;{&quot;uuid&quot;:2,&quot;ts&quot;:1}&#x27;
df = spark.read.json(sc.parallelize([missing_field_json]))

df.write.format(&quot;hudi&quot;).mode(&quot;append&quot;).options(**hudi_options).save(base_path)

spark.sql(&quot;select * from hudi.test_recon1&quot;).show()
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>After the first write:</p><table><thead><tr><th>_hoodie_commit_time</th><th>_hoodie_commit_seqno</th><th>_hoodie_record_key</th><th>_hoodie_partition_path</th><th>_hoodie_file_name</th><th>Url</th><th>ts</th><th>uuid</th></tr></thead><tbody><tr><td>20220622204044318</td><td>20220622204044318...</td><td>1</td><td></td><td>890aafc0-d897-44d...</td><td><a href="http://hudi.apache.com" target="_blank" rel="noopener noreferrer">hudi.apache.com</a></td><td>1</td><td>1</td></tr></tbody></table><p>After the second write:</p><table><thead><tr><th>_hoodie_commit_time</th><th>_hoodie_commit_seqno</th><th>_hoodie_record_key</th><th>_hoodie_partition_path</th><th>_hoodie_file_name</th><th>Url</th><th>ts</th><th>uuid</th></tr></thead><tbody><tr><td>20220622204044318</td><td>20220622204044318...</td><td>1</td><td></td><td>890aafc0-d897-44d...</td><td><a href="http://hudi.apache.com" target="_blank" rel="noopener noreferrer">hudi.apache.com</a></td><td>1</td><td>1</td></tr><tr><td>20220622204208997</td><td>20220622204208997...</td><td>2</td><td></td><td>890aafc0-d897-44d...</td><td>null</td><td>1</td><td>2</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_y2LR" id="can-i-change-keygenerator-for-an-existing-table">Can I change keygenerator for an existing table?<a class="hash-link" href="#can-i-change-keygenerator-for-an-existing-table" title="Direct link to heading"></a></h3><p>No. There is a small set of properties that cannot change once chosen; KeyGenerator is one of them.
<a href="https://github.com/apache/hudi/blob/3f37d4fb08169c95930f9cc32389abf4e5cd5551/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala#L128" target="_blank" rel="noopener noreferrer">Here</a> is a code referecne where we</p><p>validate the properties.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="is-hudi-jvm-dependent-does-hudi-leverage-java-specific-serialization">Is Hudi JVM dependent? Does Hudi leverage Java specific serialization?<a class="hash-link" href="#is-hudi-jvm-dependent-does-hudi-leverage-java-specific-serialization" title="Direct link to heading"></a></h3><p>Hudi was not originally designed as a database layer that would fit under the various big data query engines, that were painfully hard to integrate with (Spark did not have DataSet/DataSource APIs, Trino was still Presto, Presto SPI was still budding, Hive storage handlers were just out). Popular engines including Spark, Flink, Presto, Trino, and Athena do not have issues integrating with Hudi as they are all based on JVM, and access access to Timeline, Metadata table are well-abstracted by Hudi APIs. Even non-jvm engines like Redshift have successfully integrated with Hudi.</p><p>Since it was not thought of as a &quot;format&quot;, the focus on the APIs for such lower level integrations and documenting the serialized bytes has been historically inadequate. However, with some understanding of the serialization, looking beyond the APIs used and focus on what the serialized bytes are, its possible to integrate Hudi from outside the JVM. For e.g Bloom filters are serialized as hex strings, from byte arrays/primitive types, and should be <strong>readable cross language</strong>. The Hudi Log Format bytes and layout are clearly defined as well, the header/footers are also binary serialized only with primitive types/byte arrays. So with the right endianity information and documentation of these bytes, <strong>cross jvm clients can read this</strong>. The Hudi metadata table uses <a href="https://hbase.apache.org/book.html#_hfile_format_2" target="_blank" rel="noopener noreferrer">HFile format</a> as the base file format, which while being a well-documented open file format with clear protobuf specifications, does not have native readers. Community has taken efforts towards improving the docs on <a href="https://hudi.apache.org/tech-specs" target="_blank" rel="noopener noreferrer">tech specs</a>. Going forward, Hudi community plans on improving the <a href="https://github.com/apache/hudi/pull/7080" target="_blank" rel="noopener noreferrer">table APIs</a> to facilitate faster engine integrations, including native language support, as a big part of the <a href="https://github.com/apache/hudi/blob/master/rfc/rfc-69/rfc-69.md" target="_blank" rel="noopener noreferrer">Hudi 1.0</a> format changes to generalize Hudi more.</p><p><strong><em>Note</em></strong>: <em>In a recent release the delete block keys were unintentionally serialized as kryo, and is being fixed in the 0.14 release. 
<h2 class="anchor anchorWithStickyNavbar_y2LR" id="integrations">Integrations<a class="hash-link" href="#integrations" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="does-aws-glue-support-hudi-">Does AWS Glue support Hudi?<a class="hash-link" href="#does-aws-glue-support-hudi-" title="Direct link to heading"></a></h3><p>AWS Glue jobs can write, read, and update the Glue Data Catalog for Hudi tables. In order to successfully integrate with the Glue Data Catalog, you need to subscribe to one of the AWS-provided Glue connectors named &quot;AWS Glue Connector for Apache Hudi&quot;. The Glue job needs to have the &quot;Use Glue data catalog as the Hive metastore&quot; option ticked. Detailed steps with sample scripts are available in this article provided by AWS - <a href="https://aws.amazon.com/blogs/big-data/writing-to-apache-hudi-tables-using-aws-glue-connector/" target="_blank" rel="noopener noreferrer">https://aws.amazon.com/blogs/big-data/writing-to-apache-hudi-tables-using-aws-glue-connector/</a>.</p><p>If you are using either notebooks or Zeppelin through Glue dev-endpoints, your script might not be able to integrate with the Glue Data Catalog when writing to Hudi tables.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="how-to-override-hudi-jars-in-emr">How to override Hudi jars in EMR?<a class="hash-link" href="#how-to-override-hudi-jars-in-emr" title="Direct link to heading"></a></h3><p>If you are looking to override Hudi jars in your EMR clusters, one way to achieve this is by providing the Hudi jars through a bootstrap script.</p><p>Here are the example steps for overriding Hudi version 0.7.0 in EMR 6.2.0.</p><p><strong>Build Hudi jars:</strong></p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"># Git clone
git clone https://github.com/apache/hudi.git &amp;&amp; cd hudi

# Get version 0.7.0
git checkout --track origin/release-0.7.0

# Build jars with spark 3.0.0 and scala 2.12 (since EMR 6.2.0 uses spark 3, which requires scala 2.12):
mvn clean package -DskipTests -Dspark3 -Dscala-2.12 -T 30
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><strong>Copy jars to s3:</strong></p><p>These are the jars we are interested in after the build completes. Copy them to a temp location first.</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">mkdir -p ~/Downloads/hudi-jars
cp packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-0.7.0.jar ~/Downloads/hudi-jars/
cp packaging/hudi-hive-sync-bundle/target/hudi-hive-sync-bundle-0.7.0.jar ~/Downloads/hudi-jars/
cp packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.12-0.7.0.jar ~/Downloads/hudi-jars/
cp packaging/hudi-timeline-server-bundle/target/hudi-timeline-server-bundle-0.7.0.jar ~/Downloads/hudi-jars/
cp packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.7.0.jar ~/Downloads/hudi-jars/
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Upload all jars from ~/Downloads/hudi-jars/ to the s3 location s3://xxx/yyy/hudi-jars.</p><p><strong>Include Hudi jars as part of the EMR bootstrap script:</strong></p><p>The below script downloads the Hudi jars from the above s3 location.
Use this script as part of <code>bootstrap-actions</code> when launching the EMR cluster to install the jars on each node.</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">#!/bin/bash
sudo mkdir -p /mnt1/hudi-jars

sudo aws s3 cp s3://xxx/yyy/hudi-jars /mnt1/hudi-jars --recursive

# create symlinks
cd /mnt1/hudi-jars
sudo ln -sf hudi-hadoop-mr-bundle-0.7.0.jar hudi-hadoop-mr-bundle.jar
sudo ln -sf hudi-hive-sync-bundle-0.7.0.jar hudi-hive-sync-bundle.jar
sudo ln -sf hudi-spark-bundle_2.12-0.7.0.jar hudi-spark-bundle.jar
sudo ln -sf hudi-timeline-server-bundle-0.7.0.jar hudi-timeline-server-bundle.jar
sudo ln -sf hudi-utilities-bundle_2.12-0.7.0.jar hudi-utilities-bundle.jar
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><strong>Using the overridden jars in DeltaStreamer:</strong></p><p>When invoking DeltaStreamer, specify the above jar location as part of the spark-submit command.</p>
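<p>A minimal sketch of such a spark-submit invocation, using the symlinked bundles installed by the bootstrap script above (the table path, table name, and remaining DeltaStreamer arguments are placeholders):</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"># Point spark-submit at the overridden utilities bundle installed on each node
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
  --jars /mnt1/hudi-jars/hudi-spark-bundle.jar \
  /mnt1/hudi-jars/hudi-utilities-bundle.jar \
  --table-type COPY_ON_WRITE \
  --target-base-path s3://xxx/yyy/hudi_table \
  --target-table hudi_table
  # ... plus your source, schema provider and props arguments
</code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>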
function" style="color:rgb(80, 250, 123)">ln</span><span class="token plain"> -sf hudi-utilities-bundle_2.12-0.7.0.jar hudi-utilities-bundle.jar</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><strong>Using the overriden jar in Deltastreamer:</strong></p><p>When invoking DeltaStreamer specify the above jar location as part of spark-submit command.</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/apache/hudi/tree/asf-site/website/versioned_docs/version-0.14.0/faq.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_mt2f"></div></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/docs/0.14.0/use_cases"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Use Cases</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/docs/0.14.0/privacy"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Privacy Policy</div></a></div></nav></div></div><div class="col col--3"><div class="tableOfContents_vrFS thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#general" class="table-of-contents__link toc-highlight">General</a><ul><li><a href="#when-is-hudi-useful-for-me-or-my-organization" class="table-of-contents__link toc-highlight">When is Hudi useful for me or my organization?</a></li><li><a href="#what-are-some-non-goals-for-hudi" class="table-of-contents__link toc-highlight">What are some non-goals for Hudi?</a></li><li><a href="#what-is-incremental-processing-why-does-hudi-docstalks-keep-talking-about-it" class="table-of-contents__link toc-highlight">What is incremental processing? 
Why does Hudi docs/talks keep talking about it?</a></li><li><a href="#how-is-hudi-optimized-for-cdc-and-streaming-use-cases" class="table-of-contents__link toc-highlight">How is Hudi optimized for CDC and streaming use cases?</a></li><li><a href="#how-do-i-choose-a-storage-type-for-my-workload" class="table-of-contents__link toc-highlight">How do I choose a storage type for my workload?</a></li><li><a href="#is-hudi-an-analytical-database" class="table-of-contents__link toc-highlight">Is Hudi an analytical database?</a></li><li><a href="#how-do-i-model-the-data-stored-in-hudi" class="table-of-contents__link toc-highlight">How do I model the data stored in Hudi?</a></li><li><a href="#why-does-hudi-require-a-key-field-to-be-configured" class="table-of-contents__link toc-highlight">Why does Hudi require a key field to be configured?</a></li><li><a href="#how-does-hudi-actually-store-data-inside-a-table" class="table-of-contents__link toc-highlight">How does Hudi actually store data inside a table?</a></li><li><a href="#how-hudi-handles-partition-evolution-requirements-" class="table-of-contents__link toc-highlight">How Hudi handles partition evolution requirements ?</a></li></ul></li><li><a href="#concepts" class="table-of-contents__link toc-highlight">Concepts</a><ul><li><a href="#how-does-hudi-ensure-atomicity" class="table-of-contents__link toc-highlight">How does Hudi ensure atomicity?</a></li><li><a href="#does-hudi-extend-the-hive-table-layout" class="table-of-contents__link toc-highlight">Does Hudi extend the Hive table layout?</a></li><li><a href="#what-concurrency-control-approaches-does-hudi-adopt" class="table-of-contents__link toc-highlight">What concurrency control approaches does Hudi adopt?</a></li><li><a href="#hudis-commits-are-based-on-transaction-start-time-instead-of-completed-time-does-this-cause-data-loss-or-inconsistency-in-case-of-incremental-and-time-travel-queries" class="table-of-contents__link toc-highlight">Hudi’s commits are based on transaction start time instead of completed time. 
Does this cause data loss or inconsistency in case of incremental and time travel queries?</a></li><li><a href="#how-does-hudi-plan-to-address-the-liveness-issue-above-for-incremental-queries" class="table-of-contents__link toc-highlight">How does Hudi plan to address the liveness issue above for incremental queries?</a></li><li><a href="#does-hudis-use-of-wall-clock-timestamp-for-instants-pose-any-clock-skew-issues" class="table-of-contents__link toc-highlight">Does Hudi’s use of wall clock timestamp for instants pose any clock skew issues?</a></li></ul></li><li><a href="#writing-tables" class="table-of-contents__link toc-highlight">Writing Tables</a><ul><li><a href="#what-are-some-ways-to-write-a-hudi-table" class="table-of-contents__link toc-highlight">What are some ways to write a Hudi table?</a></li><li><a href="#how-is-a-hudi-writer-job-deployed" class="table-of-contents__link toc-highlight">How is a Hudi writer job deployed?</a></li><li><a href="#can-i-implement-my-own-logic-for-how-input-records-are-merged-with-record-on-storage" class="table-of-contents__link toc-highlight">Can I implement my own logic for how input records are merged with record on storage?</a></li><li><a href="#how-do-i-delete-records-in-the-dataset-using-hudi" class="table-of-contents__link toc-highlight">How do I delete records in the dataset using Hudi?</a></li><li><a href="#should-i-need-to-worry-about-deleting-all-copies-of-the-records-in-case-of-duplicates" class="table-of-contents__link toc-highlight">Should I need to worry about deleting all copies of the records in case of duplicates?</a></li><li><a href="#how-does-hudi-handle-duplicate-record-keys-in-an-input" class="table-of-contents__link toc-highlight">How does Hudi handle duplicate record keys in an input?</a></li><li><a href="#how-can-i-pass-hudi-configurations-to-my-spark-writer-job" class="table-of-contents__link toc-highlight">How can I pass hudi configurations to my spark writer job?</a></li><li><a href="#how-to-create-hive-style-partition-folder-structure" class="table-of-contents__link toc-highlight">How to create Hive style partition folder structure?</a></li><li><a href="#can-i-register-my-hudi-table-with-apache-hive-metastore" class="table-of-contents__link toc-highlight">Can I register my Hudi table with Apache Hive metastore?</a></li><li><a href="#whats-hudis-schema-evolution-story" class="table-of-contents__link toc-highlight">What&#39;s Hudi&#39;s schema evolution story?</a></li><li><a href="#what-performanceingest-latency-can-i-expect-for-hudi-writing" class="table-of-contents__link toc-highlight">What performance/ingest latency can I expect for Hudi writing?</a></li><li><a href="#what-performance-can-i-expect-for-hudi-readingqueries" class="table-of-contents__link toc-highlight">What performance can I expect for Hudi reading/queries?</a></li><li><a href="#how-do-i-to-avoid-creating-tons-of-small-files" class="table-of-contents__link toc-highlight">How do I to avoid creating tons of small files?</a></li><li><a href="#how-do-i-use-deltastreamer-or-spark-datasource-api-to-write-to-a-non-partitioned-hudi-table-" class="table-of-contents__link toc-highlight">How do I use DeltaStreamer or Spark DataSource API to write to a Non-partitioned Hudi table ?</a></li><li><a href="#how-can-i-reduce-table-versions-created-by-hudi-in-aws-glue-data-catalog-metastore" class="table-of-contents__link toc-highlight">How can I reduce table versions created by Hudi in AWS Glue Data Catalog/ metastore?</a></li><li><a 
href="#if-there-are-failed-writes-in-my-timeline-do-i-see-duplicates" class="table-of-contents__link toc-highlight">If there are failed writes in my timeline, do I see duplicates?</a></li><li><a href="#how-are-conflicts-detected-in-hudi-between-multiple-writers" class="table-of-contents__link toc-highlight">How are conflicts detected in Hudi between multiple writers?</a></li><li><a href="#can-single-writer-inserts-have-duplicates" class="table-of-contents__link toc-highlight">Can single-writer inserts have duplicates?</a></li><li><a href="#can-concurrent-inserts-cause-duplicates" class="table-of-contents__link toc-highlight">Can concurrent inserts cause duplicates?</a></li></ul></li><li><a href="#querying-tables" class="table-of-contents__link toc-highlight">Querying Tables</a><ul><li><a href="#does-deleted-records-appear-in-hudis-incremental-query-results" class="table-of-contents__link toc-highlight">Does deleted records appear in Hudi&#39;s incremental query results?</a></li><li><a href="#how-do-i-pass-hudi-configurations-to-my-beeline-hive-queries" class="table-of-contents__link toc-highlight">How do I pass hudi configurations to my beeline Hive queries?</a></li><li><a href="#does-hudi-guarantee-consistent-reads-how-to-think-about-read-optimized-queries" class="table-of-contents__link toc-highlight">Does Hudi guarantee consistent reads? How to think about read optimized queries?</a></li></ul></li><li><a href="#table-services" class="table-of-contents__link toc-highlight">Table Services</a><ul><li><a href="#what-does-the-hudi-cleaner-do" class="table-of-contents__link toc-highlight">What does the Hudi cleaner do?</a></li><li><a href="#how-do-i-run-compaction-for-a-mor-table" class="table-of-contents__link toc-highlight">How do I run compaction for a MOR table?</a></li><li><a href="#what-options-do-i-have-for-asynchronousoffline-compactions-on-mor-table" class="table-of-contents__link toc-highlight">What options do I have for asynchronous/offline compactions on MOR table?</a></li><li><a href="#how-to-disable-all-table-services-in-case-of-multiple-writers" class="table-of-contents__link toc-highlight">How to disable all table services in case of multiple writers?</a></li><li><a href="#why-does-hudi-retain-at-least-one-previous-commit-even-after-setting-hoodiecleanercommitsretained-1-" class="table-of-contents__link toc-highlight">Why does Hudi retain at-least one previous commit even after setting hoodie.cleaner.commits.retained&#39;: 1 ?</a></li><li><a href="#can-i-get-notified-when-new-commits-happen-in-my-hudi-table" class="table-of-contents__link toc-highlight">Can I get notified when new commits happen in my Hudi table?</a></li></ul></li><li><a href="#storage" class="table-of-contents__link toc-highlight">Storage</a><ul><li><a href="#does-hudi-support-cloud-storageobject-stores" class="table-of-contents__link toc-highlight">Does Hudi support cloud storage/object stores?</a></li><li><a href="#what-is-the-difference-between-copy-on-write-cow-vs-merge-on-read-mor-table-types" class="table-of-contents__link toc-highlight">What is the difference between copy-on-write (COW) vs merge-on-read (MOR) table types?</a></li><li><a href="#how-do-i-migrate-my-data-to-hudi" class="table-of-contents__link toc-highlight">How do I migrate my data to Hudi?</a></li><li><a href="#how-to-convert-an-existing-cow-table-to-mor" class="table-of-contents__link toc-highlight">How to convert an existing COW table to MOR?</a></li><li><a href="#how-can-i-find-the-average-record-size-in-a-commit" 
class="table-of-contents__link toc-highlight">How can I find the average record size in a commit?</a></li><li><a href="#how-does-the-hudi-indexing-work--what-are-its-benefits" class="table-of-contents__link toc-highlight">How does the Hudi indexing work &amp; what are its benefits?</a></li><li><a href="#can-i-switch-from-one-index-type-to-another-without-having-to-rewrite-the-entire-table" class="table-of-contents__link toc-highlight">Can I switch from one index type to another without having to rewrite the entire table?</a></li><li><a href="#i-have-an-existing-dataset-and-want-to-evaluate-hudi-using-portion-of-that-data-" class="table-of-contents__link toc-highlight">I have an existing dataset and want to evaluate Hudi using portion of that data ?</a></li><li><a href="#why-does-maintain-record-level-commit-metadata-isnt-tracking-table-version-at-file-level-good-enough" class="table-of-contents__link toc-highlight">Why does maintain record level commit metadata? Isn&#39;t tracking table version at file level good enough? </a></li><li><a href="#why-partition-fields-are-also-stored-in-parquet-files-in-addition-to-the-partition-path-" class="table-of-contents__link toc-highlight">Why partition fields are also stored in parquet files in addition to the partition path ?</a></li><li><a href="#how-do-i-configure-bloom-filter-when-bloomglobal_bloom-index-is-used" class="table-of-contents__link toc-highlight">How do I configure Bloom filter (when Bloom/Global_Bloom index is used)?</a></li><li><a href="#how-do-i-verify-datasource-schema-reconciliation-in-hudi" class="table-of-contents__link toc-highlight">How do I verify datasource schema reconciliation in Hudi?</a></li><li><a href="#can-i-change-keygenerator-for-an-existing-table" class="table-of-contents__link toc-highlight">Can I change keygenerator for an existing table?</a></li><li><a href="#is-hudi-jvm-dependent-does-hudi-leverage-java-specific-serialization" class="table-of-contents__link toc-highlight">Is Hudi JVM dependent? 
Does Hudi leverage Java specific serialization?</a></li></ul></li><li><a href="#integrations" class="table-of-contents__link toc-highlight">Integrations</a><ul><li><a href="#does-aws-glue-support-hudi-" class="table-of-contents__link toc-highlight">Does AWS Glue support Hudi?</a></li><li><a href="#how-to-override-hudi-jars-in-emr" class="table-of-contents__link toc-highlight">How to override Hudi jars in EMR?</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/blog/2021/07/21/streaming-data-lake-platform">Our Vision</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/concepts">Concepts</a></li><li class="footer__item"><a class="footer__link-item" href="/community/team">Team</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/release-0.14.1">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/powered-by">Who&#x27;s Using</a></li></ul></div><div class="col footer__col"><div class="footer__title">Learn</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/quick-start-guide">Quick Start</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/docker_demo">Docker Demo</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/talks">Talks</a></li><li class="footer__item"><a class="footer__link-item" href="/videos">Video Guides</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Technical Wiki<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li></ul></div><div class="col footer__col"><div class="footer__title">Hudi On Cloud</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/s3_hoodie">AWS</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/gcs_hoodie">Google Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/oss_hoodie">Alibaba Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/azure_hoodie">Microsoft Azure</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/cos_hoodie">Tencent Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/ibm_cos_hoodie">IBM Cloud</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/community/get-involved">Get Involved</a></li><li class="footer__item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24"
class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Linkedin<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><div class="footer__title">Apache</div><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://hudi.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_SRtH"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" 
class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footer__copyright">Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. <br>Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation.</div></div></div></footer></div>
<script src="/assets/js/runtime~main.2cab5691.js"></script>
<script src="/assets/js/main.bd020950.js"></script>
</body>
</html>