markdown"><header><h1>Streaming Ingestion</h1></header><h2 class="anchor anchorWithStickyNavbar_y2LR" id="deltastreamer">DeltaStreamer<a class="hash-link" href="#deltastreamer" title="Direct link to heading"></a></h2><p>The <code>HoodieDeltaStreamer</code> utility (part of hudi-utilities-bundle) provides the way to ingest from different sources such as DFS or Kafka, with the following capabilities.</p><ul><li>Exactly once ingestion of new events from Kafka, <a href="https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports" target="_blank" rel="noopener noreferrer">incremental imports</a> from Sqoop or output of <code>HiveIncrementalPuller</code> or files under a DFS folder</li><li>Support json, avro or a custom record types for the incoming data</li><li>Manage checkpoints, rollback &amp; recovery</li><li>Leverage Avro schemas from DFS or Confluent <a href="https://github.com/confluentinc/schema-registry" target="_blank" rel="noopener noreferrer">schema registry</a>.</li><li>Support for plugging in transformations</li></ul><p>Command line options describe capabilities in more detail</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` --help</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Usage: &lt;main class&gt; [options]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Options:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --checkpoint</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Resume Delta Streamer from this checkpoint.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --commit-on-errors</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Commit even when some records failed to be written</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --compact-scheduling-minshare</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Minshare for compaction as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --compact-scheduling-weight</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Scheduling weight for compaction as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
--continuous</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Delta Streamer runs in continuous mode running source-fetch -&gt; Transform</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -&gt; Hudi Write in loop</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --delta-sync-scheduling-minshare</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Minshare for delta sync as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --delta-sync-scheduling-weight</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Scheduling weight for delta sync as defined in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> https://spark.apache.org/docs/latest/job-scheduling</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --disable-compaction</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Compaction is enabled for MoR table by default. This flag disables it</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --enable-hive-sync</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Enable syncing to hive (Deprecated in favor of --enable-sync)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --enable-sync</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Enable syncing meta</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --filter-dupes</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Should duplicate records from source be dropped/filtered out before</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> insert/bulk-insert</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --help, -h</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hoodie-conf</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Any configuration that can be set in the properties file (using the CLI</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> parameter 
&quot;--propsFilePath&quot;) can also be passed command line using this</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> parameter</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: []</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --max-pending-compactions</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Maximum number of outstanding inflight/requested compactions. Delta Sync</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> will not happen unlessoutstanding compactions is less than this number</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 5</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --min-sync-interval-seconds</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> the min sync interval of each sync in continuous mode</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 0</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --op</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Takes one of these values : UPSERT (default), INSERT (use when input is</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> purely new data/inserts to gain speed)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: UPSERT</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Possible Values: [UPSERT, INSERT, BULK_INSERT]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --payload-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of HoodieRecordPayload, that works off a GenericRecord.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Implement your own, if you want to do something other than overwriting</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> existing value</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: org.apache.hudi.common.model.OverwriteWithLatestAvroPayload</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> path to properties file on localfs or dfs, with configurations for</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hoodie client, schema provider, key generator and data source. For</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hoodie client props, sane defaults are used, but recommend use to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> provide basic things like metrics endpoints, hive configs etc. 
For</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> sources, referto individual classes, for supported properties.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: file:///Users/vinoth/bin/hoodie/src/test/resources/delta-streamer-config/dfs-source.properties</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of org.apache.hudi.utilities.schema.SchemaProvider to attach</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> schemas to input &amp; target table data, built in options:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.schema.FilebasedSchemaProvider.Source (See</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.sources.Source) implementation can implement</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> their own SchemaProvider. For Sources that return Dataset&lt;Row&gt;, the</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> schema is obtained implicitly. However, this CLI option allows</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> overriding the schemaprovider returned by Source.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Subclass of org.apache.hudi.utilities.sources to read data. Built-in</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> options: org.apache.hudi.utilities.sources.{JsonDFSSource (default), </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AvroDFSSource, AvroKafkaSource, CsvDFSSource, HiveIncrPullSource, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> JdbcSource, JsonKafkaSource, ORCDFSSource, ParquetDFSSource, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> S3EventsHoodieIncrSource, S3EventsSource, SqlSource}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: org.apache.hudi.utilities.sources.JsonDFSSource</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-limit</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Maximum amount of data to read from source. Default: No limit For e.g:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> DFS-Source =&gt; max bytes to read, Kafka-Source =&gt; max events to read</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: 9223372036854775807</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Field within source record to decide how to break ties between records</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> with same key in input data. 
Default: &#x27;ts&#x27; holding unix timestamp of</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> record</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: ts</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --spark-master</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark master to use.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Default: local[2]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --table-type</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Type of table. COPY_ON_WRITE (or) MERGE_ON_READ</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --target-base-path</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> base path for the target hoodie table. (Will be created if did not exist</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> first time around. If exists, expected to be a hoodie table)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> * --target-table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> name of the target table in Hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --transformer-class</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> subclass of org.apache.hudi.utilities.transform.Transformer. Allows</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> transforming raw source Dataset to a target Dataset (conforming to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> target schema) before writing. Default : Not set. E:g -</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> allows a SQL query templated to be passed as a transformation function)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The tool takes a hierarchically composed property file and has pluggable interfaces for extracting data, key generation and providing schema. Sample configs for ingesting from kafka and dfs are
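For reference, a Kafka source properties file along the lines of the bundled sample typically carries the record key, partition path, topic and schema registry settings. The sketch below is illustrative only: the topic, field names, broker and registry URLs are placeholders, not values shipped with Hudi.

```properties
# Illustrative sketch of a kafka-source.properties file; keys follow the bundled sample,
# but the values (topic, fields, hosts) are placeholders to adapt to your environment.
hoodie.datasource.write.recordkey.field=impressionid
hoodie.datasource.write.partitionpath.field=userid
# Schema registry subject used by SchemaRegistryProvider
hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/impressions-value/versions/latest
# Kafka topic and consumer settings
hoodie.deltastreamer.source.kafka.topic=impressions
bootstrap.servers=localhost:9092
auto.offset.reset=earliest
schema.registry.url=http://localhost:8081
```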
For example, once you have Confluent Kafka and the schema registry up and running, produce some test data using ([impressions.avro](https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data) provided by the schema-registry repo):

```sh
[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid
```

and then ingest it as follows:

```sh
[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
  --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
  --source-ordering-field impresssiontime \
  --target-base-path file:///tmp/hudi-deltastreamer-op \
  --target-table uber.impressions \
  --op BULK_INSERT
```

In some cases, you may want to migrate your existing table into Hudi beforehand. Please refer to the [migration guide](/docs/migration_guide).

### MultiTableDeltaStreamer

`HoodieMultiTableDeltaStreamer`, a wrapper on top of `HoodieDeltaStreamer`, enables ingesting multiple tables at a single go into Hudi datasets. Currently, it only supports sequential processing of the tables to be ingested and the COPY_ON_WRITE storage type.
The command line options for `HoodieMultiTableDeltaStreamer` are largely the same as those of `HoodieDeltaStreamer`, with the only exception that you are required to provide table-wise configs in separate files in a dedicated config folder. The following command line options are introduced:

```
* --config-folder
    the path to the folder which contains all the table wise config files
  --base-path-prefix
    this is added to enable users to create all the hudi datasets for related tables under one path in FS.
    The datasets are then created under the path <base_path_prefix>/<database>/<table_to_be_ingested>.
    However, you can override the path for any table by setting the property hoodie.deltastreamer.ingestion.targetBasePath
```

The following properties need to be set properly to ingest data using `HoodieMultiTableDeltaStreamer`:

```
hoodie.deltastreamer.ingestion.tablesToBeIngested
  comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2
hoodie.deltastreamer.ingestion.targetBasePath
  if you wish to ingest a particular table in a separate path, you can mention that path here
hoodie.deltastreamer.ingestion.<database>.<table>.configFile
  path to the config file in the dedicated config folder which contains the overridden properties for that particular table
```

Sample config files for table-wise overridden properties can be found under `hudi-utilities/src/test/resources/delta-streamer-config`.
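As an illustration, the common properties file for a two-table ingestion might look like the sketch below; the database, table and file names are assumptions made up for this example.

```properties
# Illustrative sketch: common config for HoodieMultiTableDeltaStreamer (db/table/file names are made up)
hoodie.deltastreamer.ingestion.tablesToBeIngested=db1.table1,db1.table2
# Table-level override files, placed inside the folder passed via --config-folder
hoodie.deltastreamer.ingestion.db1.table1.configFile=file:///tmp/hudi-ingestion-config/table1_config.properties
hoodie.deltastreamer.ingestion.db1.table2.configFile=file:///tmp/hudi-ingestion-config/table2_config.properties
```

Each referenced config file can in turn set `hoodie.deltastreamer.ingestion.targetBasePath` to route that table somewhere other than `<base_path_prefix>/<database>/<table>`.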
The command to run <code>HoodieMultiTableDeltaStreamer</code> is also similar to how you run <code>HoodieDeltaStreamer</code>.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --config-folder file://tmp/hudi-ingestion-config \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field impresssiontime \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --base-path-prefix file:\/\/\/tmp/hudi-deltastreamer-op \ </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table uber.impressions \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --op BULK_INSERT</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>For detailed information on how to configure and use <code>HoodieMultiTableDeltaStreamer</code>, please refer <a href="/blog/2020/08/22/ingest-multiple-tables-using-hudi">blog section</a>.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="concurrency-control">Concurrency Control<a class="hash-link" href="#concurrency-control" title="Direct link to heading"></a></h3><p>The <code>HoodieDeltaStreamer</code> utility (part of hudi-utilities-bundle) provides ways to ingest from different sources such as DFS or Kafka, with the following capabilities.</p><p>Using optimistic_concurrency_control via delta streamer requires adding the above configs to the properties file that can be passed to the
job. For example, adding configs like the ones sketched below to the kafka-source.properties file and passing that file to deltastreamer will enable optimistic concurrency.</p>
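<p>As a rough sketch (all values are placeholders; see the <a href="/docs/concurrency_control">concurrency control concepts</a> page for the authoritative list of lock configs), the added properties might look like:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">hoodie.write.concurrency.mode=optimistic_concurrency_control
hoodie.cleaner.policy.failed.writes=LAZY
hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
hoodie.write.lock.zookeeper.url=zookeeper
hoodie.write.lock.zookeeper.port=2181
hoodie.write.lock.zookeeper.lock_key=test_table
hoodie.write.lock.zookeeper.base_path=/test</code></pre></div></div>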
<p>A deltastreamer job can then be triggered as follows:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field impressiontime \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path file:///tmp/hudi-deltastreamer-op \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table uber.impressions \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --op BULK_INSERT</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Read more in depth about concurrency control in the <a href="/docs/concurrency_control">concurrency control concepts</a> section.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="checkpointing">Checkpointing<a class="hash-link" href="#checkpointing" title="Direct link to heading"></a></h2><p>HoodieDeltaStreamer uses checkpoints to keep track of what data has been read already so it can resume without needing to reprocess all data.
When using a Kafka source, the checkpoint is the <a href="https://cwiki.apache.org/confluence/display/KAFKA/Offset+Management" target="_blank" rel="noopener noreferrer">Kafka Offset</a>.
When using a DFS source, the checkpoint is the &#x27;last modified&#x27; timestamp of the latest file read.
Checkpoints are saved in the .hoodie commit file as <code>deltastreamer.checkpoint.key</code>.</p><p>If you need to change the checkpoints for reprocessing or replaying data, you can use the following options:</p><ul><li><code>--checkpoint</code> will set <code>deltastreamer.checkpoint.reset_key</code> in the commit file to overwrite the current checkpoint (see the example after this list).</li><li><code>--source-limit</code> will set a maximum amount of data to read from the source. For DFS sources, this is the max number of bytes read; for Kafka, this is the max number of events to read.</li></ul>
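<p>For example, the sketch below (a hypothetical re-run; for a Kafka source the checkpoint string is the topic name followed by comma-separated partition:offset pairs) replays the <code>impressions</code> topic from earlier offsets while capping how much is read per run:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
 --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
 --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
 --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
 --source-ordering-field impressiontime \
 --target-base-path file:///tmp/hudi-deltastreamer-op \
 --target-table uber.impressions \
 --checkpoint impressions,0:1000,1:1000 \
 --source-limit 5000000 \
 --op UPSERT</code></pre></div></div>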
<h2 class="anchor anchorWithStickyNavbar_y2LR" id="schema-providers">Schema Providers<a class="hash-link" href="#schema-providers" title="Direct link to heading"></a></h2><p>By default, Spark will infer the schema of the source and use that inferred schema when writing to a table. If you need
to explicitly define the schema, you can use one of the Schema Providers below.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="schema-registry-provider">Schema Registry Provider<a class="hash-link" href="#schema-registry-provider" title="Direct link to heading"></a></h3><p>You can obtain the latest schema from an online registry. You pass a URL to the registry and, if needed, you can also
pass userinfo and credentials in the url, like <code>https://foo:bar@schemaregistry.org</code>. The credentials are then extracted
and are set on the request as an Authorization header.</p><p>When fetching schemas from a registry, you can specify the source schema and the target schema separately.</p><table><thead><tr><th>Config</th><th>Description</th><th>Example</th></tr></thead><tbody><tr><td>hoodie.deltastreamer.schemaprovider.registry.url</td><td>Registry URL for the schema of the source you are reading from</td><td><code>https://foo:bar@schemaregistry.org</code></td></tr><tr><td>hoodie.deltastreamer.schemaprovider.registry.targetUrl</td><td>Registry URL for the schema of the target you are writing to</td><td><code>https://foo:bar@schemaregistry.org</code></td></tr></tbody></table><p>The above configs are passed to the DeltaStreamer spark-submit command like:
<code>--hoodie-conf hoodie.deltastreamer.schemaprovider.registry.url=https://foo:bar@schemaregistry.org</code></p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="jdbc-schema-provider">JDBC Schema Provider<a class="hash-link" href="#jdbc-schema-provider" title="Direct link to heading"></a></h3><p>You can obtain the latest schema through a JDBC connection.</p><table><thead><tr><th>Config</th><th>Description</th><th>Example</th></tr></thead><tbody><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.connection.url</td><td>The JDBC URL to connect to. You can specify source-specific connection properties in the URL</td><td>jdbc:postgresql://localhost/test?user=fred&amp;password=secret</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.driver.type</td><td>The class name of the JDBC driver to use to connect to this URL</td><td>org.h2.Driver</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.username</td><td>Username for the connection</td><td>fred</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.password</td><td>Password for the connection</td><td>secret</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.dbtable</td><td>The table with the schema to reference</td><td>test_database.test1_table or test1_table</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.timeout</td><td>The number of seconds the driver will wait for a Statement object to execute. Zero means there is no limit. In the write path, this option depends on how JDBC drivers implement the setQueryTimeout API, e.g., the h2 JDBC driver checks the timeout of each query instead of an entire JDBC batch. It defaults to 0.</td><td>0</td></tr><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.jdbc.nullable</td><td>If true, all columns are nullable</td><td>true</td></tr></tbody></table><p>The above configs are passed to the DeltaStreamer spark-submit command like:
<code>--hoodie-conf hoodie.deltastreamer.jdbcbasedschemaprovider.connection.url=jdbc:postgresql://localhost/test?user=fred&amp;password=secret</code></p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="file-based-schema-provider">File Based Schema Provider<a class="hash-link" href="#file-based-schema-provider" title="Direct link to heading"></a></h3><p>You can use a .avsc file to define your schema. You can then point to this file on DFS as a schema provider.</p><table><thead><tr><th>Config</th><th>Description</th><th>Example</th></tr></thead><tbody><tr><td>hoodie.deltastreamer.schemaprovider.source.schema.file</td><td>The schema of the source you are reading from</td><td><a href="https://github.com/apache/hudi/blob/a8fb69656f522648233f0310ca3756188d954281/docker/demo/config/test-suite/source.avsc" target="_blank" rel="noopener noreferrer">example schema file</a></td></tr><tr><td>hoodie.deltastreamer.schemaprovider.target.schema.file</td><td>The schema of the target you are writing to</td><td><a href="https://github.com/apache/hudi/blob/a8fb69656f522648233f0310ca3756188d954281/docker/demo/config/test-suite/target.avsc" target="_blank" rel="noopener noreferrer">example schema file</a></td></tr></tbody></table>
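<p>For reference, a minimal source schema file (hypothetical fields and location) follows the standard Avro .avsc format; the target schema file looks the same:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">{
  "type": "record",
  "name": "Impression",
  "namespace": "com.example",
  "fields": [
    {"name": "impressionid", "type": "string"},
    {"name": "impressiontime", "type": "long"},
    {"name": "userid", "type": ["null", "string"], "default": null}
  ]
}</code></pre></div></div><p>Assuming the files are uploaded to DFS, they can be wired in with options like the following sketch (the FilebasedSchemaProvider class is the one shipped in hudi-utilities; the paths are placeholders):</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--hoodie-conf hoodie.deltastreamer.schemaprovider.source.schema.file=file:///configs/source.avsc \
--hoodie-conf hoodie.deltastreamer.schemaprovider.target.schema.file=file:///configs/target.avsc</code></pre></div></div>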
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="schema-provider-with-post-processor">Schema Provider with Post Processor<a class="hash-link" href="#schema-provider-with-post-processor" title="Direct link to heading"></a></h3><p>The SchemaProviderWithPostProcessor will extract the schema from one of the previously mentioned Schema Providers and then apply a post processor to change the schema before it is used. You can write your own post processor by extending
this class: <a href="https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaPostProcessor.java" target="_blank" rel="noopener noreferrer">https://github.com/apache/hudi/blob/master/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaPostProcessor.java</a></p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="sources">Sources<a class="hash-link" href="#sources" title="Direct link to heading"></a></h2><p>Hoodie DeltaStreamer can read data from a wide variety of sources. The following is a list of supported sources:</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="distributed-file-system-dfs">Distributed File System (DFS)<a class="hash-link" href="#distributed-file-system-dfs" title="Direct link to heading"></a></h3><p>Refer to the storage configurations page for examples of DFS systems Hudi can read from. The following are the
file formats Hudi supports for reading from and writing to DFS sources. (Note: you can still use Spark/Flink readers to read from
other formats and then write the data out in Hudi format.)</p><ul><li>CSV</li><li>AVRO</li><li>JSON</li><li>PARQUET</li><li>ORC</li><li>HUDI</li></ul><p>For DFS sources, the following behaviors are expected:</p><ul><li>For the JSON DFS source, you always need to set a schema. If the target Hudi table follows the same schema as the source file, you just need to set the source schema. If not, you need to set schemas for both source and target.</li><li><code>HoodieDeltaStreamer</code> reads the files under the source base path (<code>hoodie.deltastreamer.source.dfs.root</code>) directly, and it won&#x27;t use the partition paths under this base path as fields of the dataset. Detailed examples can be found <a href="https://github.com/apache/hudi/issues/5485" target="_blank" rel="noopener noreferrer">here</a>.</li></ul><h3 class="anchor anchorWithStickyNavbar_y2LR" id="kafka">Kafka<a class="hash-link" href="#kafka" title="Direct link to heading"></a></h3><p>Hudi can read directly from Kafka clusters. See more details on HoodieDeltaStreamer to learn how to set up streaming
ingestion with exactly-once semantics, checkpointing, and pluggable transformations. The following formats are supported when reading data from Kafka:</p><ul><li>AVRO</li><li>JSON</li></ul>
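<p>For illustration (the topic, endpoints and record key/partition fields below are placeholders), a kafka-source.properties file for an Avro topic typically combines the Hudi write configs, the source topic and the Kafka consumer settings:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">hoodie.datasource.write.recordkey.field=impressionid
hoodie.datasource.write.partitionpath.field=userid
hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/impressions-value/versions/latest
hoodie.deltastreamer.source.kafka.topic=impressions
bootstrap.servers=localhost:9092
auto.offset.reset=earliest
group.id=hudi-deltastreamer</code></pre></div></div>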
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="s3-events">S3 Events<a class="hash-link" href="#s3-events" title="Direct link to heading"></a></h3><p>AWS S3 storage provides an event notification service which will post notifications when certain events happen in your S3 bucket:
<a href="https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html" target="_blank" rel="noopener noreferrer">https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html</a>
AWS will put these events in a Simple Queue Service (SQS). Apache Hudi provides an S3EventsSource that can read from SQS
to trigger processing of new or changed data as soon as it is available on S3.</p><h4 class="anchor anchorWithStickyNavbar_y2LR" id="setup">Setup<a class="hash-link" href="#setup" title="Direct link to heading"></a></h4><ol><li>Enable S3 Event Notifications <a href="https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html" target="_blank" rel="noopener noreferrer">https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html</a></li><li>Download the aws-java-sdk-sqs jar.</li><li>Find the queue URL and Region to set these configurations:<ol><li>hoodie.deltastreamer.s3.source.queue.url=<a href="https://sqs.us-west-2.amazonaws.com/queue/url" target="_blank" rel="noopener noreferrer">https://sqs.us-west-2.amazonaws.com/queue/url</a></li><li>hoodie.deltastreamer.s3.source.queue.region=us-west-2</li></ol></li><li>Start the S3EventsSource and S3EventsHoodieIncrSource using the HoodieDeltaStreamer utility as shown in the sample commands below.</li></ol><p>Full configuration and sample commands can be found in this blog post: <a href="https://hudi.apache.org/blog/2021/08/23/s3-events-source/#configuration-and-setup" target="_blank" rel="noopener noreferrer">https://hudi.apache.org/blog/2021/08/23/s3-events-source/#configuration-and-setup</a></p>
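<p>The command below is only a rough sketch (the bucket, queue URL and table names are placeholders; the blog post above has the authoritative configuration). It uses <code>S3EventsSource</code> to pull event metadata from SQS into a Hudi metadata table, which a second <code>S3EventsHoodieIncrSource</code> job can then consume to ingest the actual changed objects:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
 --source-class org.apache.hudi.utilities.sources.S3EventsSource \
 --target-base-path s3://my-bucket/s3_meta_table \
 --target-table s3_meta_table \
 --hoodie-conf hoodie.deltastreamer.s3.source.queue.url=https://sqs.us-west-2.amazonaws.com/queue/url \
 --hoodie-conf hoodie.deltastreamer.s3.source.queue.region=us-west-2 \
 --op UPSERT \
 --continuous</code></pre></div></div>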
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="jdbc-source">JDBC Source<a class="hash-link" href="#jdbc-source" title="Direct link to heading"></a></h3><p>Hudi can read from a JDBC source with a full fetch of a table, or Hudi can even read incrementally with checkpointing from a JDBC source.</p><table><thead><tr><th>Config</th><th>Description</th><th>Example</th></tr></thead><tbody><tr><td>hoodie.deltastreamer.jdbc.url</td><td>URL of the JDBC connection</td><td>jdbc:postgresql://localhost/test</td></tr><tr><td>hoodie.deltastreamer.jdbc.user</td><td>User to use for authentication of the JDBC connection</td><td>fred</td></tr><tr><td>hoodie.deltastreamer.jdbc.password</td><td>Password to use for authentication of the JDBC connection</td><td>secret</td></tr><tr><td>hoodie.deltastreamer.jdbc.password.file</td><td>If you prefer to use a password file for the connection</td><td></td></tr><tr><td>hoodie.deltastreamer.jdbc.driver.class</td><td>Driver class to use for the JDBC connection</td><td></td></tr><tr><td>hoodie.deltastreamer.jdbc.table.name</td><td>Name of the table to read from</td><td>my_table</td></tr><tr><td>hoodie.deltastreamer.jdbc.table.incr.column.name</td><td>If run in incremental mode, this field will be used to pull new data incrementally</td><td></td></tr><tr><td>hoodie.deltastreamer.jdbc.incr.pull</td><td>Will the JDBC connection perform an incremental pull?</td><td></td></tr><tr><td>hoodie.deltastreamer.jdbc.extra.options.</td><td>How you pass extra configurations that would normally be specified as spark.read.option()</td><td>hoodie.deltastreamer.jdbc.extra.options.fetchSize=100 hoodie.deltastreamer.jdbc.extra.options.upperBound=1 hoodie.deltastreamer.jdbc.extra.options.lowerBound=100</td></tr><tr><td>hoodie.deltastreamer.jdbc.storage.level</td><td>Used to control the persistence level</td><td>Default = MEMORY_AND_DISK_SER</td></tr><tr><td>hoodie.deltastreamer.jdbc.incr.fallback.to.full.fetch</td><td>Boolean which, if set to true, makes an incremental fetch fall back to a full fetch if there is any error in the incremental read</td><td>FALSE</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_y2LR" id="sql-source">SQL Source<a class="hash-link" href="#sql-source" title="Direct link to heading"></a></h3><p>The SQL source reads from any table and is mainly used for backfill jobs which will process specific partition dates.
This won&#x27;t update the deltastreamer.checkpoint.key to the processed commit; instead, it will fetch the latest successful
checkpoint key and set that value as this backfill commit&#x27;s checkpoint so that it won&#x27;t interrupt the regular incremental
processing. To fetch and use the latest incremental checkpoint, you also need to set this hoodie conf for deltastreamer
jobs: <code>hoodie.write.meta.key.prefixes = &#x27;deltastreamer.checkpoint.key&#x27;</code></p><p>The Spark SQL query to run is configured using this hoodie config: <code>hoodie.deltastreamer.source.sql.sql.query = &#x27;select * from source_table&#x27;</code></p>
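<p>A sketch of a backfill run using this source (the table, partition and ordering fields are placeholders, and the SqlSource class referenced below is the one shipped under <code>org.apache.hudi.utilities.sources</code>) could look like:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
 --source-class org.apache.hudi.utilities.sources.SqlSource \
 --target-base-path file:///tmp/hudi-deltastreamer-op \
 --target-table uber.impressions \
 --source-ordering-field impressiontime \
 --hoodie-conf "hoodie.deltastreamer.source.sql.sql.query=select * from source_table where ds = '2021-12-01'" \
 --hoodie-conf hoodie.write.meta.key.prefixes=deltastreamer.checkpoint.key \
 --op INSERT</code></pre></div></div>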
<h2 class="anchor anchorWithStickyNavbar_y2LR" id="flink-ingestion">Flink Ingestion<a class="hash-link" href="#flink-ingestion" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="cdc-ingestion">CDC Ingestion<a class="hash-link" href="#cdc-ingestion" title="Direct link to heading"></a></h3><p>CDC (change data capture) keeps track of the data changes evolving in a source system so that a downstream process or system can act on those changes.
We recommend two ways of syncing CDC data into Hudi:</p><p><img alt="slide1 title" src="/assets/images/cdc-2-hudi-d151389758f4ce3fd873c1258b0a8ce5.png"></p><ol><li>Use the Ververica <a href="https://github.com/ververica/flink-cdc-connectors" target="_blank" rel="noopener noreferrer">flink-cdc-connectors</a> to connect directly to the database server and sync the binlog data into Hudi.
The advantage is that it does not rely on message queues, but the disadvantage is that it puts pressure on the database server;</li><li>Consume data from a message queue (e.g., Kafka) using the Flink CDC format (a Flink SQL sketch of this approach follows the note below). The advantage is that it is highly scalable,
but the disadvantage is that it relies on message queues.</li></ol><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><ul><li>If the upstream data cannot guarantee the order, you need to specify option <code>write.precombine.field</code> explicitly;</li><li>The MOR table cannot handle DELETEs in event-time sequence yet, which can cause data loss. It is better to switch on the changelog mode through option <code>changelog.enabled</code>.</li></ul></div></div>
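<p>A minimal Flink SQL sketch of the second approach (all table names, topics, paths and endpoints below are placeholders):</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">-- Kafka topic carrying Debezium-format change records
CREATE TABLE users_cdc (
  id BIGINT,
  name STRING,
  ts TIMESTAMP(3)
) WITH (
  'connector' = 'kafka',
  'topic' = 'users',
  'properties.bootstrap.servers' = 'localhost:9092',
  'scan.startup.mode' = 'earliest-offset',
  'format' = 'debezium-json'
);

-- Hudi MOR table with changelog mode enabled
CREATE TABLE users_hudi (
  id BIGINT,
  name STRING,
  ts TIMESTAMP(3),
  PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'hudi',
  'path' = 'file:///tmp/users_hudi',
  'table.type' = 'MERGE_ON_READ',
  'changelog.enabled' = 'true',
  'write.precombine.field' = 'ts'
);

INSERT INTO users_hudi SELECT id, name, ts FROM users_cdc;</code></pre></div></div>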
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="bulk-insert">Bulk Insert<a class="hash-link" href="#bulk-insert" title="Direct link to heading"></a></h3><p>The <code>bulk_insert</code> operation addresses the need to import snapshot data. If the snapshot data comes from other data sources, use the <code>bulk_insert</code> mode to quickly
import the snapshot data into Hudi.</p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p><code>bulk_insert</code> eliminates the serialization and data merging. The data deduplication is skipped, so the user need to guarantee the uniqueness of the data.</p></div></div><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p><code>bulk_insert</code> is more efficient in the <code>batch execution mode</code>. By default, the <code>batch execution mode</code> sorts the input records
by the partition path and writes these records to Hudi, which can avoid write performance degradation caused by
frequent <code>file handle</code> switching. </p></div></div><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>The parallelism of <code>bulk_insert</code> is specified by <code>write.tasks</code>. The parallelism will affect the number of small files.
In theory, the parallelism of <code>bulk_insert</code> is the number of <code>bucket</code>s (in particular, when each bucket reaches the maximum file size, it
will rollover to the new file handle. Finally, <code>the number of files</code> &gt;= <a href="#parallelism"><code>write.bucket_assign.tasks</code></a>).</p></div></div><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options">Options<a class="hash-link" href="#options" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>write.operation</code></td><td><code>true</code></td><td><code>upsert</code></td><td>Setting as <code>bulk_insert</code> to open this function</td></tr><tr><td><code>write.tasks</code></td><td><code>false</code></td><td><code>4</code></td><td>The parallelism of <code>bulk_insert</code>, <code>the number of files</code> &gt;= <a href="#parallelism"><code>write.bucket_assign.tasks</code></a></td></tr><tr><td><code>write.bulk_insert.shuffle_by_partition</code></td><td><code>false</code></td><td><code>true</code></td><td>Whether to shuffle data according to the partition field before writing. Enabling this option will reduce the number of small files, but there may be a risk of data skew</td></tr><tr><td><code>write.bulk_insert.sort_by_partition</code></td><td><code>false</code></td><td><code>true</code></td><td>Whether to sort data according to the partition field before writing. Enabling this option will reduce the number of small files when a write task writes multiple partitions</td></tr><tr><td><code>write.sort.memory</code></td><td><code>false</code></td><td><code>128</code></td><td>Available managed memory of sort operator. default <code>128</code> MB</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_y2LR" id="index-bootstrap">Index Bootstrap<a class="hash-link" href="#index-bootstrap" title="Direct link to heading"></a></h3><p>For the demand of <code>snapshot data</code> + <code>incremental data</code> import. If the <code>snapshot data</code> already insert into Hudi by <a href="#bulk-insert">bulk insert</a>.
User can insert <code>incremental data</code> in real time and ensure the data is not repeated by using the index bootstrap function.</p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>If you think this process is very time-consuming, you can add resources to write in streaming mode while writing <code>snapshot data</code>,
and then reduce the resources to write <code>incremental data</code> (or open the rate limit function).</p></div></div><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-1">Options<a class="hash-link" href="#options-1" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>index.bootstrap.enabled</code></td><td><code>true</code></td><td><code>false</code></td><td>When index bootstrap is enabled, the remain records in Hudi table will be loaded into the Flink state at one time</td></tr><tr><td><code>index.partition.regex</code></td><td><code>false</code></td><td><code>*</code></td><td>Optimize option. Setting regular expressions to filter partitions. By default, all partitions are loaded into flink state</td></tr></tbody></table><h4 class="anchor anchorWithStickyNavbar_y2LR" id="how-to-use">How To Use<a class="hash-link" href="#how-to-use" title="Direct link to heading"></a></h4><ol><li><code>CREATE TABLE</code> creates a statement corresponding to the Hudi table. Note that the <code>table.type</code> must be correct.</li><li>Setting <code>index.bootstrap.enabled</code> = <code>true</code> to enable the index bootstrap function.</li><li>Setting Flink checkpoint failure tolerance in <code>flink-conf.yaml</code> : <code>execution.checkpointing.tolerable-failed-checkpoints = n</code> (depending on Flink checkpoint scheduling times).</li><li>Waiting until the first checkpoint succeeds, indicating that the index bootstrap completed.</li><li>After the index bootstrap completed, user can exit and save the savepoint (or directly use the externalized checkpoint).</li><li>Restart the job, setting <code>index.bootstrap.enable</code> as <code>false</code>.</li></ol><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><ol><li>Index bootstrap is blocking, so checkpoint cannot be completed during index bootstrap.</li><li>Index bootstrap triggers by the input data. User need to ensure that there is at least one record in each partition.</li><li>Index bootstrap executes concurrently. User can search in log by <code>finish loading the index under partition</code> and <code>Load record form file</code> to observe the progress of index bootstrap.</li><li>The first successful checkpoint indicates that the index bootstrap completed. There is no need to load the index again when recovering from the checkpoint.</li></ol></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="changelog-mode">Changelog Mode<a class="hash-link" href="#changelog-mode" title="Direct link to heading"></a></h3><p>Hudi can keep all the intermediate changes (I / -U / U / D) of messages, then consumes through stateful computing of flink to have a near-real-time
data warehouse ETL pipeline (incremental computing). The Hudi MOR table stores messages in the form of rows, which supports the retention of all change logs (integration at the format level).
All changelog records can be consumed with Flink streaming reader.</p><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-2">Options<a class="hash-link" href="#options-2" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>changelog.enabled</code></td><td><code>false</code></td><td><code>false</code></td><td>It is turned off by default, to have the <code>upsert</code> semantics, only the merged messages are ensured to be kept, intermediate changes may be merged. Setting to true to support consumption of all changes</td></tr></tbody></table><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>Batch (Snapshot) read still merge all the intermediate changes, regardless of whether the format has stored the intermediate changelog messages.</p></div></div><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>After setting <code>changelog.enable</code> as <code>true</code>, the retention of changelog records are only best effort: the asynchronous compaction task will merge the changelog records into one record, so if the
stream source does not consume the data in time, only the merged record for each key can be read after compaction. The solution is to reserve some buffer time for the reader by adjusting the compaction strategy, such as
the compaction options: <a href="#compaction"><code>compaction.delta_commits</code></a> and <a href="#compaction"><code>compaction.delta_seconds</code></a>.</p></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="append-mode">Append Mode<a class="hash-link" href="#append-mode" title="Direct link to heading"></a></h3><p>If the INSERT operation is used for ingestion, there is no merging of small files for the COW table by default; for the MOR table, the small file strategy is always applied: MOR appends delta records to log files.</p><p>The small file strategy can degrade write performance. If you want to apply the file-merging behavior for the COW table, turn on option <code>write.insert.cluster</code>; note that record keys are not combined (deduplicated) in this mode.</p><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-3">Options<a class="hash-link" href="#options-3" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>write.insert.cluster</code></td><td><code>false</code></td><td><code>false</code></td><td>Whether to merge small files while ingesting. For the COW table, enable this option to turn on the small file merging strategy (no deduplication of keys, and throughput will be affected)</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_y2LR" id="rate-limit">Rate Limit<a class="hash-link" href="#rate-limit" title="Direct link to heading"></a></h3><p>In many use cases, users put the full history data set onto the message queue together with the real-time incremental data, and then consume the data from the queue into Hudi from the earliest offset using Flink. Consuming the history data set has these characteristics:
1) the instantaneous throughput is huge, and 2) the data is seriously out of order (writing to random partitions). This leads to degraded write performance and throughput glitches. For this case, the rate limit parameter can be turned on to ensure smooth writes.</p><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-4">Options<a class="hash-link" href="#options-4" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>write.rate.limit</code></td><td><code>false</code></td><td><code>0</code></td><td>The rate limit is disabled by default (0 means no limit)</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_y2LR" id="streaming-query">Streaming Query<a class="hash-link" href="#streaming-query" title="Direct link to heading"></a></h3><p>By default, the hoodie table is read as a batch, that is, it reads the latest snapshot data set and returns. Turn on the streaming read
mode by setting option <code>read.streaming.enabled</code> as <code>true</code>. Set option <code>read.start-commit</code> to specify the read start offset; specify the
value as <code>earliest</code> if you want to consume all the history data set.</p><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-5">Options<a class="hash-link" href="#options-5" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>read.streaming.enabled</code></td><td>false</td><td><code>false</code></td><td>Specify <code>true</code> to read as streaming</td></tr><tr><td><code>read.start-commit</code></td><td>false</td><td>the latest commit</td><td>Start commit time in format &#x27;yyyyMMddHHmmss&#x27;, use <code>earliest</code> to consume from the start commit</td></tr><tr><td><code>read.streaming.skip_compaction</code></td><td>false</td><td><code>false</code></td><td>Whether to skip compaction commits while reading, generally for two purposes: 1) Avoid consuming duplications from the compaction instants 2) When change log mode is enabled, to only consume change logs for right semantics.</td></tr><tr><td><code>clean.retain_commits</code></td><td>false</td><td><code>10</code></td><td>The max number of commits to retain before cleaning, when change log mode is enabled, tweaks this option to adjust the change log live time. For example, the default strategy keeps 50 minutes of change logs if the checkpoint interval is set up as 5 minutes.</td></tr></tbody></table><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>When option <code>read.streaming.skip_compaction</code> turns on and the streaming reader lags behind by commits of number
<code>clean.retain_commits</code>, data loss may occur.</p></div></div>
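<p>A minimal Flink SQL sketch of a streaming read (the path and columns are placeholders; the options are the ones listed above):</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">CREATE TABLE hudi_table (
  id BIGINT,
  name STRING,
  ts TIMESTAMP(3)
) WITH (
  'connector' = 'hudi',
  'path' = 'file:///tmp/hudi_table',
  'table.type' = 'MERGE_ON_READ',
  'read.streaming.enabled' = 'true',
  'read.start-commit' = 'earliest'
);

-- continuously emits new commits as they arrive
SELECT * FROM hudi_table;</code></pre></div></div>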
<h3 class="anchor anchorWithStickyNavbar_y2LR" id="incremental-query">Incremental Query<a class="hash-link" href="#incremental-query" title="Direct link to heading"></a></h3><p>There are 3 use cases for incremental query:</p><ol><li>Streaming query: specify the start commit with option <code>read.start-commit</code>;</li><li>Batch query: specify the start commit with option <code>read.start-commit</code> and the end commit with option <code>read.end-commit</code>; the interval is closed, i.e. both the start commit and the end commit are inclusive (see the sketch after the options table);</li><li>Time travel: consume as a batch job at a given instant time; specifying <code>read.end-commit</code> is enough because the start commit defaults to the latest commit.</li></ol><h4 class="anchor anchorWithStickyNavbar_y2LR" id="options-6">Options<a class="hash-link" href="#options-6" title="Direct link to heading"></a></h4><table><thead><tr><th>Option Name</th><th>Required</th><th>Default</th><th>Remarks</th></tr></thead><tbody><tr><td><code>read.start-commit</code></td><td><code>false</code></td><td>the latest commit</td><td>Specify <code>earliest</code> to consume from the start commit</td></tr><tr><td><code>read.end-commit</code></td><td><code>false</code></td><td>the latest commit</td><td>--</td></tr></tbody></table>
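<p>A sketch of a batch incremental read over a closed commit interval (the path, columns and commit times are placeholders):</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ">CREATE TABLE hudi_table (
  id BIGINT,
  name STRING,
  ts TIMESTAMP(3)
) WITH (
  'connector' = 'hudi',
  'path' = 'file:///tmp/hudi_table',
  'table.type' = 'MERGE_ON_READ',
  'read.start-commit' = '20220101000000',
  'read.end-commit' = '20220102000000'
);

-- returns only the records committed within the interval
SELECT * FROM hudi_table;</code></pre></div></div>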
<h2 class="anchor anchorWithStickyNavbar_y2LR" id="kafka-connect-sink">Kafka Connect Sink<a class="hash-link" href="#kafka-connect-sink" title="Direct link to heading"></a></h2><p>If you want to perform streaming ingestion into Hudi format similar to HoodieDeltaStreamer, but you don&#x27;t want to depend on Spark, try out the new experimental release of Hudi Kafka Connect Sink. Read the <a href="https://github.com/apache/hudi/tree/master/hudi-kafka-connect" target="_blank" rel="noopener noreferrer">ReadMe</a>
for full documentation.</p></div>
</body>
</html>