| <!doctype html> |
| <html class="docs-version-0.5.1" lang="en" dir="ltr"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Docusaurus v2.0.0-beta.14"> |
| <link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Hudi: User-Facing Analytics RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Hudi: User-Facing Analytics Atom Feed"> |
| <link rel="alternate" type="application/json" href="/blog/feed.json" title="Apache Hudi: User-Facing Analytics JSON Feed"> |
| <link rel="search" type="application/opensearchdescription+xml" title="Apache Hudi" href="/opensearch.xml"> |
| <link rel="alternate" type="application/rss+xml" href="/videos/rss.xml" title="Apache Hudi RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/videos/atom.xml" title="Apache Hudi Atom Feed"> |
| <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Comfortaa|Ubuntu|Roboto|Source+Code+Pro"> |
| <link rel="stylesheet" href="https://at-ui.github.io/feather-font/css/iconfont.css"><title data-react-helmet="true">Writing Hudi Tables | Apache Hudi</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" property="og:url" content="https://hudi.apache.org/docs/0.5.1/writing_data"><meta data-react-helmet="true" name="docsearch:language" content="en"><meta data-react-helmet="true" name="docsearch:version" content="0.5.1"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="docs-default-0.5.1"><meta data-react-helmet="true" property="og:title" content="Writing Hudi Tables | Apache Hudi"><meta data-react-helmet="true" name="description" content="In this section, we will cover ways to ingest new changes from external sources or even other Hudi tables using the DeltaStreamer tool, as well as"><meta data-react-helmet="true" property="og:description" content="In this section, we will cover ways to ingest new changes from external sources or even other Hudi tables using the DeltaStreamer tool, as well as"><meta data-react-helmet="true" name="keywords" content="hudi,incremental,batch,stream,processing,Hive,ETL,Spark SQL"><link data-react-helmet="true" rel="icon" href="/assets/images/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://hudi.apache.org/docs/0.5.1/writing_data"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.5.1/writing_data" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/cn/docs/0.5.1/writing_data" hreflang="cn"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.5.1/writing_data" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://BH4D9OD16A-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/assets/css/styles.ea681a30.css"> |
| <link rel="preload" href="/assets/js/runtime~main.2cab5691.js" as="script"> |
| <link rel="preload" href="/assets/js/main.bd020950.js" as="script"> |
| </head> |
| <body> |
| <script>/* Inline startup script: copies the saved "theme" from localStorage onto the root element as data-theme (falling back to "light"), then flags whether the announcement bar was already dismissed. Any localStorage error is swallowed so the defaults apply. */(function(){var rootEl=document.documentElement,savedTheme=null;try{savedTheme=localStorage.getItem("theme")}catch(err){}rootEl.setAttribute("data-theme",savedTheme===null?"light":savedTheme);var barDismissed=false;try{barDismissed=localStorage.getItem("docusaurus.announcement.dismiss")==="true"}catch(err){}rootEl.setAttribute("data-announcement-bar-initially-dismissed",barDismissed)})()</script><div id="__docusaurus"> |
| <div><a href="#" class="skipToContent_OuoZ">Skip to main content</a></div><div class="announcementBar_axC9" role="banner"><div class="announcementBarPlaceholder_xYHE"></div><div class="announcementBarContent_6uhP">⭐️ If you like Apache Hudi, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/hudi">GitHub</a>! ⭐</div><button type="button" class="clean-btn close announcementBarClose_A3A1" aria-label="Close"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav class="navbar navbar--fixed-top navbarWrapper_UIa0"><div class="navbar__inner"><img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8f594acf-9b77-44fb-9475-3e82ead1910c" width="0" height="0" alt=""><img referrerpolicy="no-referrer-when-downgrade" src="https://analytics.apache.org/matomo.php?idsite=47&rec=1" width="0" height="0" alt=""><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo navbarLogo_Bz6n"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><a class="navbar__item navbar__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Learn<svg width="10" height="6" viewBox="0 0 10 6" fill="none" 
xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/talks"><div class="labelWrapperDropdown_Mqbj">Talks</div></a></li><li><a class="dropdown__link" href="/videos"><div class="labelWrapperDropdown_Mqbj">Video Guides</div></a></li><li><a class="dropdown__link" href="/docs/faq"><div class="labelWrapperDropdown_Mqbj">FAQ</div></a></li><li><a class="dropdown__link" href="/tech-specs"><div class="labelWrapperDropdown_Mqbj">Tech Specs</div></a></li><li><a class="dropdown__link" href="/tech-specs-1point0"><div class="labelWrapperDropdown_Mqbj">Tech Specs 1.0</div></a></li><li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Technical Wiki<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Contribute<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/contribute/how-to-contribute"><div class="labelWrapperDropdown_Mqbj">How to Contribute</div></a></li><li><a class="dropdown__link" href="/contribute/developer-setup"><div class="labelWrapperDropdown_Mqbj">Developer 
Setup</div></a></li><li><a class="dropdown__link" href="/contribute/rfc-process"><div class="labelWrapperDropdown_Mqbj">RFC Process</div></a></li><li><a class="dropdown__link" href="/contribute/report-security-issues"><div class="labelWrapperDropdown_Mqbj">Report Security Issues</div></a></li><li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Report Issues<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Community<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/get-involved"><div class="labelWrapperDropdown_Mqbj">Get Involved</div></a></li><li><a class="dropdown__link" href="/community/syncs"><div class="labelWrapperDropdown_Mqbj">Community Syncs</div></a></li><li><a class="dropdown__link" href="/community/office_hours"><div class="labelWrapperDropdown_Mqbj">Office Hours</div></a></li><li><a class="dropdown__link" href="/community/team"><div class="labelWrapperDropdown_Mqbj">Team</div></a></li></ul></div><a class="navbar__item navbar__link" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a><a class="navbar__item navbar__link" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who's Using</div></a><a 
class="navbar__item navbar__link" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a><a class="navbar__item navbar__link" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link downloadLinkDropdownHide_aDP3" href="/docs/0.5.1/quick-start-guide"><div class="labelWrapperDropdown_Mqbj">0.5.1<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/writing_data"><div class="labelWrapperDropdown_Mqbj">Current</div></a></li><li><a class="dropdown__link" href="/docs/writing_data"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li><a class="dropdown__link" href="/docs/0.14.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li><a class="dropdown__link" href="/docs/0.13.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li><a class="dropdown__link" href="/docs/0.13.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li><a class="dropdown__link" href="/docs/0.12.3/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li><a class="dropdown__link" href="/docs/0.12.2/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li><a class="dropdown__link" href="/docs/0.12.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li><a class="dropdown__link" href="/docs/0.12.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li><a class="dropdown__link" href="/docs/0.11.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li><a class="dropdown__link" 
href="/docs/0.11.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li><a class="dropdown__link" href="/docs/0.10.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li><a class="dropdown__link" href="/docs/0.10.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li><a class="dropdown__link" href="/docs/0.9.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li><a class="dropdown__link" href="/docs/0.8.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li><a class="dropdown__link" href="/docs/0.7.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li><a class="dropdown__link" href="/docs/0.6.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li><a class="dropdown__link" href="/docs/0.5.3/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li><a class="dropdown__link" href="/docs/0.5.2/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/0.5.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li><a class="dropdown__link" href="/docs/0.5.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 
2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>English</span></span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><g clip-path="url(#a)"><path d="M14 6.457a6.842 6.842 0 0 0-7-6.02 6.843 6.843 0 0 0-7 6.02v1.085a6.843 6.843 0 0 0 7 6.02 6.843 6.843 0 0 0 7-6.02V6.457Zm-1.094 0h-2.625a9.92 9.92 0 0 0-.376-2.222 6.65 6.65 0 0 0 1.531-.875 5.25 5.25 0 0 1 1.444 3.097h.026Zm-8.032 0a8.479 8.479 0 0 1 .324-1.872 7.376 7.376 0 0 0 3.63 0c.175.61.284 1.239.325 1.872h-4.28Zm4.305 1.085a8.391 8.391 0 0 1-.324 1.873 7.464 7.464 0 0 0-3.658 0 8.479 8.479 0 0 1-.323-1.873h4.305Zm.35-4.375A10.342 10.342 0 0 0 8.75 1.75c.627.194 1.218.49 1.75.875a5.748 5.748 0 0 1-.998.577l.027-.035ZM7.254 1.54A8.75 8.75 0 0 1 8.46 3.552c-.48.11-.97.165-1.461.167-.492-.001-.982-.057-1.461-.167.308-.722.715-1.4 1.207-2.012h.508ZM4.498 3.202a5.748 5.748 0 0 1-.998-.577 6.029 6.029 0 0 1 1.75-.875c-.294.46-.546.947-.753 1.452Zm-1.873.15c.47.358.984.652 1.531.874A9.625 9.625 0 0 0 3.78 6.45H1.155a5.25 5.25 0 0 1 1.47-3.098ZM1.12 7.541h2.625c.038.753.164 1.5.376 2.223a6.649 
6.649 0 0 0-1.531.875 5.25 5.25 0 0 1-1.47-3.098Zm3.377 3.255c.207.506.459.992.753 1.453a6.03 6.03 0 0 1-1.75-.875c.312-.226.646-.419.997-.578Zm2.25 1.663a8.594 8.594 0 0 1-1.208-2.013 6.501 6.501 0 0 1 2.922 0 8.54 8.54 0 0 1-1.207 2.013h-.508Zm2.755-1.663c.367.156.716.35 1.042.578a6.338 6.338 0 0 1-1.75.875c.275-.464.512-.95.708-1.453Zm1.873-.148a6.647 6.647 0 0 0-1.531-.875 9.45 9.45 0 0 0 .376-2.223h2.625a5.25 5.25 0 0 1-1.47 3.098Z" fill="#1C1E21"></path></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h14v14H0z"></path></clipPath></defs></svg></div></a><ul class="dropdown__menu"><li><a href="/docs/0.5.1/writing_data" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active"><div class="labelWrapperDropdown_Mqbj">English</div></a></li><li><a href="/cn/docs/0.5.1/writing_data" target="_self" rel="noopener noreferrer" class="dropdown__link"><div class="labelWrapperDropdown_Mqbj">Chinese</div></a></li></ul></div><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" 
target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a><div class="searchBox_fBfG"><div role="button" class="searchButton_g9-U" aria-label="Search"><span class="searchText_RI6l">Search</span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><circle cx="6.864" cy="6.864" r="5.243" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></circle><path d="m10.51 10.783 2.056 2.05" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><button type="button" class="clean-btn navbar-sidebar__close"><svg viewBox="0 0 15 15" width="21" height="21"><g stroke="var(--ifm-color-emphasis-600)" stroke-width="1.2"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><div class="navbar-sidebar__items"><div class="navbar-sidebar__item menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Learn</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Contribute</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" 
class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Community</div></a></li><li class="menu__list-item"><a class="menu__link" href="/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a></li><li class="menu__list-item"><a class="menu__link" href="/powered-by"><div class="labelWrapperDropdown_Mqbj">Who's Using</div></a></li><li class="menu__list-item"><a class="menu__link" href="/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a></li><li class="menu__list-item"><a class="menu__link" href="/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></li><li class="menu__list-item"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Versions</div></a><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/next/writing_data"><div class="labelWrapperDropdown_Mqbj">Current</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/writing_data"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.14.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.13.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.13.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.3/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.2/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.12.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li 
class="menu__list-item"><a class="menu__link" href="/docs/0.12.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.11.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.11.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.10.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.10.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.9.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.8.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.7.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.6.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.3/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.2/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active" href="/docs/0.5.1/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li class="menu__list-item"><a class="menu__link" href="/docs/0.5.0/writing_data"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link 
menu__link--sublist"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>Languages</span></span></div></a></li><li class="menu__list-item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="menu__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="menu__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a 
href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="menu__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="menu__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="menu__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a></li></ul></div><div class="navbar-sidebar__item menu"><button type="button" class="clean-btn navbar-sidebar__back">← Back to main menu</button></div></div></div></nav><div class="main-wrapper docs-wrapper docs-doc-page"><div class="docPage_GMj9"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_i9tI" type="button"></button><aside class="docSidebarContainer_k0Pq"><div class="sidebar_a3j0"><nav class="menu thin-scrollbar menu_cyFh menuWithAnnouncementBar_+O1J"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.5.1/quick-start-guide">Quick-Start Guide</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.5.1/use_cases">Use Cases</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" href="/docs/0.5.1/writing_data">Writing Hudi Tables</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a 
class="menu__link" href="/docs/0.5.1/querying_data">Querying Hudi Tables</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.5.1/configurations">Configurations</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.5.1/performance">Performance</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/0.5.1/deployment">Deployment Guide</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.5.1/s3_hoodie">Storage Configurations</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/docs/0.5.1/docker_demo">Resources</a></div></li></ul></nav></div></aside><main class="docMainContainer_Q970"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_zHA2"><div class="theme-doc-version-banner alert alert--warning margin-bottom--md" role="alert"><div>This is documentation for <!-- -->Apache Hudi<!-- --> <b>0.5.1</b>, which is no longer actively maintained.</div><div class="margin-top--md">For up-to-date documentation, see the <b><a href="/docs/writing_data">latest version</a></b> (<!-- -->0.14.1<!-- -->).</div></div><div class="docItemContainer_oiyr"><article><span class="theme-doc-version-badge badge badge--secondary">Version: <!-- -->0.5.1</span><div class="tocCollapsible_aw-L theme-doc-toc-mobile tocMobile_Tx6Y"><button type="button" class="clean-btn tocCollapsibleButton_zr6a">On this page</button></div><div 
class="theme-doc-markdown markdown"><header><h1>Writing Hudi Tables</h1></header><p>In this section, we will cover ways to ingest new changes from external sources or even other Hudi tables using the <a href="#deltastreamer">DeltaStreamer</a> tool, as well as |
| speeding up large Spark jobs via upserts using the <a href="#datasource-writer">Hudi datasource</a>. Such tables can then be <a href="/docs/querying_data">queried</a> using various query engines.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="write-operations">Write Operations<a class="hash-link" href="#write-operations" title="Direct link to heading"></a></h2><p>Before that, it may be helpful to understand the 3 different write operations provided by Hudi datasource or the delta streamer tool and how best to leverage them. These operations |
| can be chosen/changed across each commit/deltacommit issued against the table.</p><ul><li><strong>UPSERT</strong> : This is the default operation where the input records are first tagged as inserts or updates by looking up the index and |
| the records are ultimately written after heuristics are run to determine how best to pack them on storage to optimize for things like file sizing. |
| This operation is recommended for use-cases like database change capture where the input almost certainly contains updates.</li><li><strong>INSERT</strong> : This operation is very similar to upsert in terms of heuristics/file sizing but completely skips the index lookup step. Thus, it can be a lot faster than upserts |
| for use-cases like log de-duplication (in conjunction with options to filter duplicates mentioned below). This is also suitable for use-cases where the table can tolerate duplicates, but just |
| needs the transactional writes/incremental pull/storage management capabilities of Hudi.</li><li><strong>BULK_INSERT</strong> : Both upsert and insert operations keep input records in memory to speed up storage heuristics computations (among other things) and thus can be cumbersome for |
| the initial loading/bootstrapping of a Hudi table. Bulk insert provides the same semantics as insert, while implementing a sort-based data writing algorithm, which can scale very well for several hundred TBs |
| of initial load. However, this just does a best-effort job at sizing files vs guaranteeing file sizes like inserts/upserts do. </li></ul><h2 class="anchor anchorWithStickyNavbar_y2LR" id="deltastreamer">DeltaStreamer<a class="hash-link" href="#deltastreamer" title="Direct link to heading"></a></h2><p>The <code>HoodieDeltaStreamer</code> utility (part of hudi-utilities-bundle) provides a way to ingest from different sources such as DFS or Kafka, with the following capabilities.</p><ul><li>Exactly once ingestion of new events from Kafka, <a href="https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports" target="_blank" rel="noopener noreferrer">incremental imports</a> from Sqoop or output of <code>HiveIncrementalPuller</code> or files under a DFS folder</li><li>Support json, avro or custom record types for the incoming data</li><li>Manage checkpoints, rollback & recovery </li><li>Leverage Avro schemas from DFS or Confluent <a href="https://github.com/confluentinc/schema-registry" target="_blank" rel="noopener noreferrer">schema registry</a>.</li><li>Support for plugging in transformations</li></ul><p>Command line options describe capabilities in more detail</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` --help</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Usage: &lt;main class&gt; [options]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Options:</span><br></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain"> --checkpoint</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Resume Delta Streamer from this checkpoint.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --commit-on-errors</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Commit even when some records failed to be written</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --compact-scheduling-minshare</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Minshare for compaction as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --compact-scheduling-weight</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Scheduling weight for compaction as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --continuous</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Delta Streamer runs in continuous mode running source-fetch -> Transform</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -> Hudi Write in loop</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --delta-sync-scheduling-minshare</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Minshare for delta sync as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --delta-sync-scheduling-weight</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Scheduling weight for delta sync as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --disable-compaction</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Compaction is enabled for MoR table by default. 
This flag disables it</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --enable-hive-sync</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Enable syncing to hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --filter-dupes</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Should duplicate records from source be dropped/filtered out before</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> insert/bulk-insert</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --help, -h</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hoodie-conf</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Any configuration that can be set in the properties file (using the CLI</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> parameter "--propsFilePath") can also be passed command line using this</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> parameter</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: []</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --max-pending-compactions</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"> Maximum number of outstanding inflight/requested compactions. Delta Sync</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> will not happen unlessoutstanding compactions is less than this number</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 5</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --min-sync-interval-seconds</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> the min sync interval of each sync in continuous mode</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --op</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Takes one of these values : UPSERT (default), INSERT (use when input is</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> purely new data/inserts to gain speed)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: UPSERT</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Possible Values: [UPSERT, INSERT, BULK_INSERT]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --payload-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of HoodieRecordPayload, that works off a GenericRecord.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Implement your own, if you want to do something other than overwriting</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> existing value</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 
org.apache.hudi.common.model.OverwriteWithLatestAvroPayload</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> path to properties file on localfs or dfs, with configurations for</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hoodie client, schema provider, key generator and data source. For</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hoodie client props, sane defaults are used, but recommend use to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> provide basic things like metrics endpoints, hive configs etc. For</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> sources, referto individual classes, for supported properties.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: file:///Users/vinoth/bin/hoodie/src/test/resources/delta-streamer-config/dfs-source.properties</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of org.apache.hudi.utilities.schema.SchemaProvider to attach</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> schemas to input & target table data, built in options:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.schema.FilebasedSchemaProvider.Source (See</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.sources.Source) implementation can implement</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> their own SchemaProvider. 
For Sources that return Dataset<Row>, the</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> schema is obtained implicitly. However, this CLI option allows</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> overriding the schemaprovider returned by Source.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Subclass of org.apache.hudi.utilities.sources to read data. Built-in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> options: org.apache.hudi.utilities.sources.{JsonDFSSource (default),</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AvroDFSSource, JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: org.apache.hudi.utilities.sources.JsonDFSSource</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-limit</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Maximum amount of data to read from source. 
Default: No limit For e.g:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DFS-Source => max bytes to read, Kafka-Source => max events to read</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 9223372036854775807</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Field within source record to decide how to break ties between records</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> with same key in input data. Default: 'ts' holding unix timestamp of</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> record</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: ts</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --spark-master</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark master to use.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: local[2]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --table-type</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Type of table. COPY_ON_WRITE (or) MERGE_ON_READ</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --target-base-path</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> base path for the target hoodie table. (Will be created if did not exist</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> first time around. 
If exists, expected to be a hoodie table)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --target-table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> name of the target table in Hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --transformer-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of org.apache.hudi.utilities.transform.Transformer. Allows</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> transforming raw source Dataset to a target Dataset (conforming to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> target schema) before writing. Default : Not set. E:g -</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> allows a SQL query templated to be passed as a transformation function)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The tool takes a hierarchically composed property file and has pluggable interfaces for extracting data, key generation and providing schema. Sample configs for ingesting from kafka and dfs are |
| provided under <code>hudi-utilities/src/test/resources/delta-streamer-config</code>.</p><p>For e.g: once you have Confluent Kafka, Schema registry up & running, produce some test data using (<a href="https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data" target="_blank" rel="noopener noreferrer">impressions.avro</a> provided by schema-registry repo)</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>and then ingest it as follows.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
--source-class org.apache.hudi.utilities.sources.AvroKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field impresssiontime \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path file:///tmp/hudi-deltastreamer-op \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table uber.impressions \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --op BULK_INSERT</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>In some cases, you may want to migrate your existing table into Hudi beforehand. Please refer to the <a href="/docs/migration_guide">migration guide</a>. </p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="datasource-writer">Datasource Writer<a class="hash-link" href="#datasource-writer" title="Direct link to heading"></a></h2><p>The <code>hudi-spark</code> module offers the DataSource API to write (and also read) any data frame into a Hudi table. |
| Following is how we can upsert a dataframe, while specifying the field names that need to be used |
| for <code>recordKey => _row_key</code>, <code>partitionPath => partition</code> and <code>precombineKey => timestamp</code></p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">inputDF.write()</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .format("org.apache.hudi")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .options(clientOpts) // any of the Hudi client opts can be passed in as well</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(HoodieWriteConfig.TABLE_NAME, tableName)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .mode(SaveMode.Append)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .save(basePath);</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2 class="anchor anchorWithStickyNavbar_y2LR" id="syncing-to-hive">Syncing to Hive<a class="hash-link" href="#syncing-to-hive" title="Direct link to heading"></a></h2><p>Both tools above support syncing of the table's latest 
schema to Hive metastore, such that queries can pick up new columns and partitions. |
| In case it's preferable to run this from the command line or in an independent JVM, Hudi provides a <code>HiveSyncTool</code>, which can be invoked as below, |
| once you have built the hudi-hive module. Following is how we sync the above Datasource Writer written table to Hive metastore.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd hudi-hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by partition --base-path <basePath> --database default --table <tableName></span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Starting with Hudi 0.5.1, the read optimized version of merge-on-read tables is suffixed with '_ro' by default. For backwards compatibility with older Hudi versions, |
| an optional HiveSyncConfig - <code>--skip-ro-suffix</code>, has been provided to turn off '_ro' suffixing if desired. Explore other hive sync options using the following command:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd hudi-hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./run_sync_tool.sh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> [hudi-hive]$ ./run_sync_tool.sh --help</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2 class="anchor anchorWithStickyNavbar_y2LR" id="deletes">Deletes<a class="hash-link" href="#deletes" title="Direct link to heading"></a></h2><p>Hudi supports implementing two types of deletes on data stored in Hudi tables, by enabling the user to specify a different record payload implementation. |
| For more info, refer to <a href="https://cwiki.apache.org/confluence/x/6IqvC" target="_blank" rel="noopener noreferrer">Delete support in Hudi</a>.</p><ul><li><strong>Soft Deletes</strong> : With soft deletes, the user wants to retain the key but null out the values of all other fields. |
| This can be simply achieved by ensuring the appropriate fields are nullable in the table schema and simply upserting the table after setting these fields to null.</li><li><strong>Hard Deletes</strong> : A stronger form of delete is to physically remove any trace of the record from the table. This can be achieved by issuing an upsert with a custom payload implementation |
| via either DataSource or DeltaStreamer which always returns Optional.Empty as the combined value. Hudi ships with a built-in <code>org.apache.hudi.EmptyHoodieRecordPayload</code> class that does exactly this.</li></ul><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain"> deleteDF // dataframe containing just records to be deleted</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .write().format("org.apache.hudi")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(...) // Add HUDI options like record-key, partition-path and others as needed for your setup</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> // specify record_key, partition_key, precombine_fieldkey & usual params</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.EmptyHoodieRecordPayload")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2 class="anchor anchorWithStickyNavbar_y2LR" id="optimized-dfs-access">Optimized DFS Access<a class="hash-link" href="#optimized-dfs-access" title="Direct link to heading"></a></h2><p>Hudi also performs several key storage management functions on the data stored in a Hudi table. A key aspect of storing data on DFS is managing file sizes and counts |
| and reclaiming storage space. For example, HDFS is infamous for its handling of small files, which exerts memory/RPC pressure on the NameNode and can potentially destabilize |
| the entire cluster. In general, query engines provide much better performance on adequately sized columnar files, since they can effectively amortize the cost of obtaining |
| column statistics etc. Even on some cloud data stores, there is often a cost to listing directories with a large number of small files.</p><p>Here are some ways to efficiently manage the storage of your Hudi tables.</p><ul><li>The <a href="/docs/configurations#compactionSmallFileSize">small file handling feature</a> in Hudi profiles the incoming workload |
| and distributes inserts to existing file groups instead of creating new file groups, which can lead to small files. </li><li>Cleaner can be <a href="/docs/configurations#retainCommits">configured</a> to clean up older file slices, more or less aggressively depending on maximum time for queries to run & lookback needed for incremental pull</li><li>User can also tune the size of the <a href="/docs/configurations#limitFileSize">base/parquet file</a>, <a href="/docs/configurations#logFileMaxSize">log files</a> & expected <a href="/docs/configurations#parquetCompressionRatio">compression ratio</a>, |
| such that a sufficient number of inserts are grouped into the same file group, resulting in well sized base files ultimately.</li><li>Intelligently tuning the <a href="/docs/configurations#withBulkInsertParallelism">bulk insert parallelism</a> can again result in nicely sized initial file groups. It is in fact critical to get this right, since the file groups |
| once created cannot be deleted, but simply expanded as explained before.</li><li>For workloads with heavy updates, the <a href="/docs/concepts#merge-on-read-table">merge-on-read table</a> provides a nice mechanism for ingesting quickly into smaller files and then later merging them into larger base files via compaction.</li></ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/apache/hudi/tree/asf-site/website/versioned_docs/version-0.5.1/writing_data.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_mt2f"></div></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/docs/0.5.1/use_cases"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Use Cases</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/docs/0.5.1/querying_data"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Querying Hudi Tables</div></a></div></nav></div></div><div class="col col--3"><div class="tableOfContents_vrFS thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#write-operations" class="table-of-contents__link toc-highlight">Write Operations</a></li><li><a href="#deltastreamer" class="table-of-contents__link toc-highlight">DeltaStreamer</a></li><li><a href="#datasource-writer" class="table-of-contents__link toc-highlight">Datasource 
Writer</a></li><li><a href="#syncing-to-hive" class="table-of-contents__link toc-highlight">Syncing to Hive</a></li><li><a href="#deletes" class="table-of-contents__link toc-highlight">Deletes</a></li><li><a href="#optimized-dfs-access" class="table-of-contents__link toc-highlight">Optimized DFS Access</a></li></ul></div></div></div></div></main></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/blog/2021/07/21/streaming-data-lake-platform">Our Vision</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/concepts">Concepts</a></li><li class="footer__item"><a class="footer__link-item" href="/community/team">Team</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/release-0.14.1">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/releases/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/powered-by">Who's Using</a></li></ul></div><div class="col footer__col"><div class="footer__title">Learn</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/quick-start-guide">Quick Start</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/docker_demo">Docker Demo</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/talks">Talks</a></li><li class="footer__item"><a class="footer__link-item" href="/videos">Video Guides</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Technical Wiki<svg width="13.5" height="13.5" 
aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li></ul></div><div class="col footer__col"><div class="footer__title">Hudi On Cloud</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/s3_hoodie">AWS</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/gcs_hoodie">Google Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/oss_hoodie">Alibaba Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/azure_hoodie">Microsoft Azure</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/cos_hoodie">Tencent Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/ibm_cos_hoodie">IBM Cloud</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/community/get-involved">Get Involved</a></li><li class="footer__item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 
4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>LinkedIn<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><div class="footer__title">Apache</div><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li 
class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://hudi.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_SRtH"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footer__copyright">Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. <br>Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation.</div></div></div></footer></div> |
| <!-- Site JavaScript bundles. Neither tag carries async/defer, so they are fetched and executed in document order: runtime~main first, then main. NOTE(review): presumably runtime~main is the bundler's chunk loader that main depends on — confirm before reordering or adding async. --> |
| <script src="/assets/js/runtime~main.2cab5691.js"></script> |
| <script src="/assets/js/main.bd020950.js"></script> |
| </body> |
| </html> |