| <!doctype html> |
| <html class="docs-version-0.13.1" lang="cn" dir="ltr"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Docusaurus v2.0.0-beta.14"> |
| <link rel="alternate" type="application/rss+xml" href="/cn/blog/rss.xml" title="Apache Hudi: User-Facing Analytics RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/cn/blog/atom.xml" title="Apache Hudi: User-Facing Analytics Atom Feed"> |
| <link rel="alternate" type="application/json" href="/cn/blog/feed.json" title="Apache Hudi: User-Facing Analytics JSON Feed"> |
| <link rel="search" type="application/opensearchdescription+xml" title="Apache Hudi" href="/cn/opensearch.xml"> |
| <link rel="alternate" type="application/rss+xml" href="/cn/videos/rss.xml" title="Apache Hudi RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/cn/videos/atom.xml" title="Apache Hudi Atom Feed"> |
| <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Comfortaa|Ubuntu|Roboto|Source+Code+Pro"> |
| <link rel="stylesheet" href="https://at-ui.github.io/feather-font/css/iconfont.css"><title data-react-helmet="true">Docker Demo | Apache Hudi</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" property="og:url" content="https://hudi.apache.org/cn/docs/0.13.1/docker_demo"><meta data-react-helmet="true" name="docsearch:language" content="cn"><meta data-react-helmet="true" name="docsearch:version" content="0.13.1"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="docs-default-0.13.1"><meta data-react-helmet="true" property="og:title" content="Docker Demo | Apache Hudi"><meta data-react-helmet="true" name="description" content="A Demo using Docker containers"><meta data-react-helmet="true" property="og:description" content="A Demo using Docker containers"><meta data-react-helmet="true" name="keywords" content="hudi,docker,demo"><link data-react-helmet="true" rel="icon" href="/cn/assets/images/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://hudi.apache.org/cn/docs/0.13.1/docker_demo"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.13.1/docker_demo" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/cn/docs/0.13.1/docker_demo" hreflang="cn"><link data-react-helmet="true" rel="alternate" href="https://hudi.apache.org/docs/0.13.1/docker_demo" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://BH4D9OD16A-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/cn/assets/css/styles.ea681a30.css"> |
| <link rel="preload" href="/cn/assets/js/runtime~main.0acdb754.js" as="script"> |
| <link rel="preload" href="/cn/assets/js/main.6d6aa24f.js" as="script"> |
| </head> |
| <body> |
| <script>/* Placed ahead of the #__docusaurus root: copies the saved theme and the announcement-bar dismissal flag from localStorage onto the root element. */ (function () { var root = document.documentElement; var savedTheme = null; try { savedTheme = localStorage.getItem("theme"); } catch (err) { /* storage not readable: keep null so the "light" default applies */ } root.setAttribute("data-theme", savedTheme !== null ? savedTheme : "light"); var dismissed = false; try { dismissed = localStorage.getItem("docusaurus.announcement.dismiss") === "true"; } catch (err) { /* storage not readable: report the bar as not dismissed */ } root.setAttribute("data-announcement-bar-initially-dismissed", dismissed); })();</script><div id="__docusaurus"> |
| <div><a href="#" class="skipToContent_OuoZ">Skip to main content</a></div><div class="announcementBar_axC9" role="banner"><div class="announcementBarPlaceholder_xYHE"></div><div class="announcementBarContent_6uhP">⭐️ If you like Apache Hudi, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/hudi">GitHub</a>! ⭐</div><button type="button" class="clean-btn close announcementBarClose_A3A1" aria-label="Close"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav class="navbar navbar--fixed-top navbarWrapper_UIa0"><div class="navbar__inner"><img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8f594acf-9b77-44fb-9475-3e82ead1910c" width="0" height="0" alt=""><img referrerpolicy="no-referrer-when-downgrade" src="https://analytics.apache.org/matomo.php?idsite=47&rec=1" width="0" height="0" alt=""><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/cn/"><div class="navbar__logo navbarLogo_Bz6n"><img src="/cn/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/cn/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><a class="navbar__item navbar__link" href="/cn/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Learn<svg width="10" height="6" viewBox="0 0 10 6" fill="none" 
xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/cn/talks"><div class="labelWrapperDropdown_Mqbj">Talks</div></a></li><li><a class="dropdown__link" href="/cn/videos"><div class="labelWrapperDropdown_Mqbj">Video Guides</div></a></li><li><a class="dropdown__link" href="/cn/docs/faq"><div class="labelWrapperDropdown_Mqbj">FAQ</div></a></li><li><a class="dropdown__link" href="/cn/tech-specs"><div class="labelWrapperDropdown_Mqbj">Tech Specs</div></a></li><li><a class="dropdown__link" href="/cn/tech-specs-1point0"><div class="labelWrapperDropdown_Mqbj">Tech Specs 1.0</div></a></li><li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Technical Wiki<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Contribute<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/cn/contribute/how-to-contribute"><div class="labelWrapperDropdown_Mqbj">How to Contribute</div></a></li><li><a class="dropdown__link" href="/cn/contribute/developer-setup"><div 
class="labelWrapperDropdown_Mqbj">Developer Setup</div></a></li><li><a class="dropdown__link" href="/cn/contribute/rfc-process"><div class="labelWrapperDropdown_Mqbj">RFC Process</div></a></li><li><a class="dropdown__link" href="/cn/contribute/report-security-issues"><div class="labelWrapperDropdown_Mqbj">Report Security Issues</div></a></li><li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank" rel="noopener noreferrer" class="dropdown__link"><span class="externalLink_AE3f">Report Issues<svg width="20" height="20" viewBox="0 0 26 26" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.965 8.745 9.01 16.7M10.561 8.758l6.403-.013-.013 6.403" stroke="#0DB1F9" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path><rect x="4.5" y="4.5" width="17" height="17" rx="2.5" stroke="#0DB1F9"></rect></svg></span></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj">Community<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/cn/community/get-involved"><div class="labelWrapperDropdown_Mqbj">Get Involved</div></a></li><li><a class="dropdown__link" href="/cn/community/syncs"><div class="labelWrapperDropdown_Mqbj">Community Syncs</div></a></li><li><a class="dropdown__link" href="/cn/community/office_hours"><div class="labelWrapperDropdown_Mqbj">Office Hours</div></a></li><li><a class="dropdown__link" href="/cn/community/team"><div class="labelWrapperDropdown_Mqbj">Team</div></a></li></ul></div><a class="navbar__item navbar__link" href="/cn/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a><a class="navbar__item navbar__link" href="/cn/powered-by"><div 
class="labelWrapperDropdown_Mqbj">Who's Using</div></a><a class="navbar__item navbar__link" href="/cn/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a><a class="navbar__item navbar__link" href="/cn/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link downloadLinkDropdownHide_aDP3" href="/cn/docs/0.13.1/overview"><div class="labelWrapperDropdown_Mqbj">0.13.1<svg width="10" height="6" viewBox="0 0 10 6" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M8.5 1.25 5 4.75l-3.5-3.5" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/cn/docs/next/docker_demo"><div class="labelWrapperDropdown_Mqbj">Next</div></a></li><li><a class="dropdown__link" href="/cn/docs/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.14.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.14.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.14.0</div></a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/cn/docs/0.13.1/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.13.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.13.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.13.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.12.3/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.12.3</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.12.2/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.12.2</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.12.1/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.12.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.12.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.12.0</div></a></li><li><a class="dropdown__link" 
href="/cn/docs/0.11.1/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.11.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.11.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.11.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.10.1/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.10.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.10.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.10.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.9.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.9.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.8.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.8.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.7.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.7.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.6.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.6.0</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.5.3/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.5.3</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.5.2/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.5.2</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.5.1/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.5.1</div></a></li><li><a class="dropdown__link" href="/cn/docs/0.5.0/docker_demo"><div class="labelWrapperDropdown_Mqbj">0.5.0</div></a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" class="navbar__link downloadLinkDropdownHide_aDP3"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 
3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>Chinese</span></span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><g clip-path="url(#a)"><path d="M14 6.457a6.842 6.842 0 0 0-7-6.02 6.843 6.843 0 0 0-7 6.02v1.085a6.843 6.843 0 0 0 7 6.02 6.843 6.843 0 0 0 7-6.02V6.457Zm-1.094 0h-2.625a9.92 9.92 0 0 0-.376-2.222 6.65 6.65 0 0 0 1.531-.875 5.25 5.25 0 0 1 1.444 3.097h.026Zm-8.032 0a8.479 8.479 0 0 1 .324-1.872 7.376 7.376 0 0 0 3.63 0c.175.61.284 1.239.325 1.872h-4.28Zm4.305 1.085a8.391 8.391 0 0 1-.324 1.873 7.464 7.464 0 0 0-3.658 0 8.479 8.479 0 0 1-.323-1.873h4.305Zm.35-4.375A10.342 10.342 0 0 0 8.75 1.75c.627.194 1.218.49 1.75.875a5.748 5.748 0 0 1-.998.577l.027-.035ZM7.254 1.54A8.75 8.75 0 0 1 8.46 3.552c-.48.11-.97.165-1.461.167-.492-.001-.982-.057-1.461-.167.308-.722.715-1.4 1.207-2.012h.508ZM4.498 3.202a5.748 5.748 0 0 
1-.998-.577 6.029 6.029 0 0 1 1.75-.875c-.294.46-.546.947-.753 1.452Zm-1.873.15c.47.358.984.652 1.531.874A9.625 9.625 0 0 0 3.78 6.45H1.155a5.25 5.25 0 0 1 1.47-3.098ZM1.12 7.541h2.625c.038.753.164 1.5.376 2.223a6.649 6.649 0 0 0-1.531.875 5.25 5.25 0 0 1-1.47-3.098Zm3.377 3.255c.207.506.459.992.753 1.453a6.03 6.03 0 0 1-1.75-.875c.312-.226.646-.419.997-.578Zm2.25 1.663a8.594 8.594 0 0 1-1.208-2.013 6.501 6.501 0 0 1 2.922 0 8.54 8.54 0 0 1-1.207 2.013h-.508Zm2.755-1.663c.367.156.716.35 1.042.578a6.338 6.338 0 0 1-1.75.875c.275-.464.512-.95.708-1.453Zm1.873-.148a6.647 6.647 0 0 0-1.531-.875 9.45 9.45 0 0 0 .376-2.223h2.625a5.25 5.25 0 0 1-1.47 3.098Z" fill="#1C1E21"></path></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h14v14H0z"></path></clipPath></defs></svg></div></a><ul class="dropdown__menu"><li><a href="/docs/0.13.1/docker_demo" target="_self" rel="noopener noreferrer" class="dropdown__link"><div class="labelWrapperDropdown_Mqbj">English</div></a></li><li><a href="/cn/docs/0.13.1/docker_demo" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active"><div class="labelWrapperDropdown_Mqbj">Chinese</div></a></li></ul></div><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" 
class="navbar__item navbar__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a><div class="searchBox_fBfG"><div role="button" class="searchButton_g9-U" aria-label="Search"><span class="searchText_RI6l">Search</span><svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg"><circle cx="6.864" cy="6.864" r="5.243" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></circle><path d="m10.51 10.783 2.056 2.05" stroke="#1C1E21" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"></path></svg></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/cn/"><div class="navbar__logo"><img src="/cn/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--light_4Vu1"><img src="/cn/assets/images/hudi.png" alt="Apache Hudi" class="themedImage_TMUO themedImage--dark_uzRr"></div></a><button type="button" class="clean-btn navbar-sidebar__close"><svg viewBox="0 0 15 15" width="21" height="21"><g stroke="var(--ifm-color-emphasis-600)" stroke-width="1.2"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><div class="navbar-sidebar__items"><div class="navbar-sidebar__item menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/cn/docs/overview"><div class="labelWrapperDropdown_Mqbj">Docs</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Learn</div></a></li><li class="menu__list-item 
menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Contribute</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Community</div></a></li><li class="menu__list-item"><a class="menu__link" href="/cn/blog"><div class="labelWrapperDropdown_Mqbj">Blog</div></a></li><li class="menu__list-item"><a class="menu__link" href="/cn/powered-by"><div class="labelWrapperDropdown_Mqbj">Who's Using</div></a></li><li class="menu__list-item"><a class="menu__link" href="/cn/roadmap"><div class="labelWrapperDropdown_Mqbj">Roadmap</div></a></li><li class="menu__list-item"><a class="menu__link" href="/cn/releases/download"><div class="labelWrapperDropdown_Mqbj">Download</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj">Versions</div></a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist"><div class="labelWrapperDropdown_Mqbj"><span><svg viewBox="0 0 20 20" width="20" height="20" aria-hidden="true" class="iconLanguage_zID8"><path fill="currentColor" d="M19.753 10.909c-.624-1.707-2.366-2.726-4.661-2.726-.09 0-.176.002-.262.006l-.016-2.063 3.525-.607c.115-.019.133-.119.109-.231-.023-.111-.167-.883-.188-.976-.027-.131-.102-.127-.207-.109-.104.018-3.25.461-3.25.461l-.013-2.078c-.001-.125-.069-.158-.194-.156l-1.025.016c-.105.002-.164.049-.162.148l.033 2.307s-3.061.527-3.144.543c-.084.014-.17.053-.151.143.019.09.19 1.094.208 1.172.018.08.072.129.188.107l2.924-.504.035 2.018c-1.077.281-1.801.824-2.256 1.303-.768.807-1.207 1.887-1.207 2.963 0 1.586.971 2.529 2.328 2.695 3.162.387 5.119-3.06 5.769-4.715 1.097 1.506.256 4.354-2.094 5.98-.043.029-.098.129-.033.207l.619.756c.08.096.206.059.256.023 2.51-1.73 3.661-4.515 2.869-6.683zm-7.386 
3.188c-.966-.121-.944-.914-.944-1.453 0-.773.327-1.58.876-2.156a3.21 3.21 0 011.229-.799l.082 4.277a2.773 2.773 0 01-1.243.131zm2.427-.553l.046-4.109c.084-.004.166-.01.252-.01.773 0 1.494.145 1.885.361.391.217-1.023 2.713-2.183 3.758zm-8.95-7.668a.196.196 0 00-.196-.145h-1.95a.194.194 0 00-.194.144L.008 16.916c-.017.051-.011.076.062.076h1.733c.075 0 .099-.023.114-.072l1.008-3.318h3.496l1.008 3.318c.016.049.039.072.113.072h1.734c.072 0 .078-.025.062-.076-.014-.05-3.083-9.741-3.494-11.04zm-2.618 6.318l1.447-5.25 1.447 5.25H3.226z"></path></svg><span>Languages</span></span></div></a></li><li class="menu__list-item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="menu__link header-github-link" aria-label="GitHub repository"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="menu__link header-twitter-link" aria-label="Hudi Twitter Handle"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="menu__link header-slack-link" aria-label="Hudi Slack Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="menu__link header-youtube-link" aria-label="Hudi YouTube Channel"><div class="labelWrapperDropdown_Mqbj"></div></a></li><li class="menu__list-item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="menu__link header-linkedin-link" aria-label="Hudi Linkedin Page"><div class="labelWrapperDropdown_Mqbj"></div></a></li></ul></div><div class="navbar-sidebar__item menu"><button type="button" class="clean-btn navbar-sidebar__back">← 
Back to main menu</button></div></div></div></nav><div class="main-wrapper docs-wrapper docs-doc-page"><div class="docPage_GMj9"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_i9tI" type="button"></button><aside class="docSidebarContainer_k0Pq"><div class="sidebar_a3j0"><nav class="menu thin-scrollbar menu_cyFh menuWithAnnouncementBar_+O1J"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/cn/docs/0.13.1/overview">Overview</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--active hasHref_TwRn" href="/cn/docs/0.13.1/quick-start-guide">Quick Start</a></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/cn/docs/0.13.1/quick-start-guide">Spark Guide</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/cn/docs/0.13.1/flink-quick-start-guide">Flink Guide</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/cn/docs/0.13.1/docker_demo">Docker Demo</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/cn/docs/0.13.1/timeline">Concepts</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div 
class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/cn/docs/0.13.1/table_management">How To</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/cn/docs/0.13.1/migration_guide">Services</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/cn/docs/0.13.1/basic_configurations">Configurations</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist hasHref_TwRn" href="/cn/docs/0.13.1/performance">Guides</a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/cn/docs/0.13.1/use_cases">Use Cases</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/cn/docs/0.13.1/faq">FAQs</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/cn/docs/0.13.1/privacy">Privacy Policy</a></li></ul></nav></div></aside><main class="docMainContainer_Q970"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_zHA2"><div class="theme-doc-version-banner alert alert--warning margin-bottom--md" role="alert"><div>This is documentation for <!-- -->Apache Hudi<!-- --> <b>0.13.1</b>, which is no longer actively maintained.</div><div class="margin-top--md">For up-to-date documentation, see the <b><a href="/cn/docs/docker_demo">latest version</a></b> 
(<!-- -->0.14.1<!-- -->).</div></div><div class="docItemContainer_oiyr"><article><span class="theme-doc-version-badge badge badge--secondary">Version: <!-- -->0.13.1</span><div class="tocCollapsible_aw-L theme-doc-toc-mobile tocMobile_Tx6Y"><button type="button" class="clean-btn tocCollapsibleButton_zr6a">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Docker Demo</h1></header><h2 class="anchor anchorWithStickyNavbar_y2LR" id="a-demo-using-docker-containers">A Demo using Docker containers<a class="hash-link" href="#a-demo-using-docker-containers" title="Direct link to heading"></a></h2><p>Let's use a real world example to see how Hudi works end to end. For this purpose, a self contained |
| data infrastructure is brought up in a local Docker cluster on your computer. It requires the |
| Hudi repo to have been cloned locally. </p><p>The steps have been tested on a Mac laptop.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="prerequisites">Prerequisites<a class="hash-link" href="#prerequisites" title="Direct link to heading"></a></h3><ul><li><p>Clone the <a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer">Hudi repository</a> to your local machine.</p></li><li><p>Docker Setup : For Mac, please follow the steps as defined in <a href="https://docs.docker.com/desktop/install/mac-install/" target="_blank" rel="noopener noreferrer">Install Docker Desktop on Mac</a>. For running Spark-SQL queries, please ensure at least 6 GB of memory and 4 CPUs are allocated to Docker (see Docker -> Preferences -> Advanced). Otherwise, Spark-SQL queries could be killed because of memory issues.</p></li><li><p>kcat : A command-line utility to publish/consume from Kafka topics. Use <code>brew install kcat</code> to install kcat.</p></li><li><p>/etc/hosts : The demo references many services running in containers by their hostname. 
Add the following settings to /etc/hosts</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 adhoc-1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 adhoc-2</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 namenode</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 datanode1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 hiveserver</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 hivemetastore</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 kafkabroker</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 sparkmaster</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">127.0.0.1 zookeeper</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div></li><li><p>Java : Java SE Development Kit 8.</p></li><li><p>Maven : A build automation tool for Java projects.</p></li><li><p>jq : A lightweight and flexible command-line JSON processor. 
Use <code>brew install jq</code> to install jq.</p></li></ul><p>Also, this has not been tested on some environments like Docker on Windows.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="setting-up-docker-cluster">Setting up Docker Cluster<a class="hash-link" href="#setting-up-docker-cluster" title="Direct link to heading"></a></h2><h3 class="anchor anchorWithStickyNavbar_y2LR" id="build-hudi">Build Hudi<a class="hash-link" href="#build-hudi" title="Direct link to heading"></a></h3><p>The first step is to build Hudi. <strong>Note:</strong> This step builds Hudi on the default supported Scala version, 2.11.</p><p>NOTE: Make sure you've cloned the <a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer">Hudi repository</a> first. </p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd &lt;HUDI_WORKSPACE&gt;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">mvn clean package -Pintegration-tests -DskipTests</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="bringing-up-demo-cluster">Bringing up Demo Cluster<a class="hash-link" href="#bringing-up-demo-cluster" title="Direct link to heading"></a></h3><p>The next step is to run the Docker compose script and set up configs for bringing up the cluster. These files are in the <a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer">Hudi repository</a> which you should already have locally on your machine from the previous steps. 
</p><p>This should pull the Docker images from Docker hub and setup the Docker cluster.</p><div class="tabs-container"><ul role="tablist" aria-orientation="horizontal" class="tabs"><li role="tab" tabindex="0" aria-selected="true" class="tabs__item tabItem_vU9c tabs__item--active">Default</li><li role="tab" tabindex="-1" aria-selected="false" class="tabs__item tabItem_vU9c">Mac AArch64</li></ul><div class="margin-vert--md"><div role="tabpanel"><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd docker</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./setup_demo.sh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[+] Running 10/13</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container zookeeper Removed 8.6s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container datanode1 Removed 18.3s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container trino-worker-1 Removed 50.7s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container spark-worker-1 Removed 16.7s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-2 Removed 16.9s</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">⠿ Container graphite Removed 16.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container kafkabroker Removed 14.1s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-1 Removed 14.1s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container presto-worker-1 Removed 11.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container presto-coordinator-1 Removed 34.6s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">.......</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">......</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[+] Running 17/17</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ adhoc-1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ graphite Pulled 2.8s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ spark-worker-1 Pulled 3.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ kafka Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ datanode1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hivemetastore Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hiveserver Pulled 3.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hive-metastore-postgresql Pulled 2.8s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ presto-coordinator-1 Pulled 2.9s</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ namenode Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ trino-worker-1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ sparkmaster Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ presto-worker-1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ zookeeper Pulled 2.8s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ adhoc-2 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ historyserver Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ trino-coordinator-1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[+] Running 17/17</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container zookeeper Started 41.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container kafkabroker Started 41.7s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container graphite Started 41.5s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hive-metastore-postgresql Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container namenode Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hivemetastore Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container trino-coordinator-1 Runni... 
0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container presto-coordinator-1 Star... 42.1s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container historyserver Started 41.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container datanode1 Started 49.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hiveserver Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container trino-worker-1 Started 42.1s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container sparkmaster Started 41.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container spark-worker-1 Started 50.2s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-2 Started 38.5s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-1 Started 38.5s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container presto-worker-1 Started 38.4s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Copying spark default config and setting up configs</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Copying spark default config and setting up configs</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ docker ps</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div></div><div role="tabpanel" hidden=""><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg 
xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>Please note the following for Mac AArch64 users</h5></div><div class="admonition-content"><ul><li> The demo must be built and run using the master branch. We currently plan to include support starting with the 0.13.0 release. </li><li> Presto and Trino are not currently supported in the demo. </li></ul></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd docker</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./setup_demo.sh --mac-aarch64</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">.......</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">......</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[+] Running 12/12</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ adhoc-1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ spark-worker-1 Pulled 3.0s</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ kafka Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ datanode1 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hivemetastore Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hiveserver Pulled 3.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ hive-metastore-postgresql Pulled 2.8s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ namenode Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ sparkmaster Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ zookeeper Pulled 2.8s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ adhoc-2 Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ historyserver Pulled 2.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[+] Running 12/12</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container zookeeper Started 41.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container kafkabroker Started 41.7s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hive-metastore-postgresql Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container namenode Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hivemetastore Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container 
historyserver Started 41.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container datanode1 Started 49.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container hiveserver Running 0.0s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container sparkmaster Started 41.9s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container spark-worker-1 Started 50.2s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-2 Started 38.5s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">⠿ Container adhoc-1 Started 38.5s</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Copying spark default config and setting up configs</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Copying spark default config and setting up configs</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ docker ps</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div></div></div></div><p>At this point, the Docker cluster will be up and running. 
The demo cluster brings up the following services</p><ul><li>HDFS Services (NameNode, DataNode)</li><li>Spark Master and Worker</li><li>Hive Services (Metastore, HiveServer2 along with PostgresDB)</li><li>Kafka Broker and a Zookeeper Node (Kafka will be used as upstream source for the demo)</li><li>Containers for Presto setup (Presto coordinator and worker)</li><li>Containers for Trino setup (Trino coordinator and worker)</li><li>Adhoc containers to run Hudi/Hive CLI commands</li></ul><h2 class="anchor anchorWithStickyNavbar_y2LR" id="demo">Demo<a class="hash-link" href="#demo" title="Direct link to heading"></a></h2><p>Stock Tracker data will be used to showcase different Hudi query types and the effects of Compaction.</p><p>Take a look at the directory <code>docker/demo/data</code>. There are 2 batches of stock data - each at 1 minute granularity. |
| The first batch contains stock tracker data for some stock symbols during the first hour of the trading window |
| (9:30 a.m. to 10:30 a.m.). The second batch contains tracker data for the next 30 minutes (10:30 a.m. to 11:00 a.m.). Hudi will |
| be used to ingest these batches into a table which will contain the latest stock tracker data at hour-level granularity. |
| The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch.</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-1--publish-the-first-batch-to-kafka">Step 1 : Publish the first batch to Kafka<a class="hash-link" href="#step-1--publish-the-first-batch-to-kafka" title="Direct link to heading"></a></h3><p>Upload the first batch to the Kafka topic 'stock_ticks'.</p><p><code>cat docker/demo/data/batch_1.json | kcat -b kafkabroker -t stock_ticks -P</code></p><p>To check if the new topic shows up, use</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">kcat -b kafkabroker -L -J | jq .</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">{</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "originating_broker": {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "id": 1001,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "name": "kafkabroker:9092/1001"</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> },</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "query": {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "topic": "*"</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> },</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "brokers": [</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> {</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> "id": 1001,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "name": "kafkabroker:9092"</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ],</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "topics": [</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "topic": "stock_ticks",</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "partitions": [</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "partition": 0,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "leader": 1001,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "replicas": [</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "id": 1001</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ],</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "isrs": [</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> "id": 1001</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-2-incrementally-ingest-data-from-kafka-topic">Step 2: Incrementally ingest data from Kafka topic<a class="hash-link" href="#step-2-incrementally-ingest-data-from-kafka-topic" title="Direct link to heading"></a></h3><p>Hudi comes with a tool named DeltaStreamer. This tool can connect to a variety of data sources (including Kafka) to |
| pull changes and apply them to a Hudi table using upsert/insert primitives. Here, we will use the tool to download |
| JSON data from the Kafka topic and ingest it into both the COW and MOR tables we initialized in the previous step. This tool |
| automatically initializes the tables in the file-system if they do not exist yet.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table-type COPY_ON_WRITE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field ts \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path /user/hive/warehouse/stock_ticks_cow \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table-type MERGE_ON_READ \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field ts \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path /user/hive/warehouse/stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props /var/demo/config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --disable-compaction</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># As part of the setup (Look at setup_demo.sh), the configs needed for DeltaStreamer is uploaded to 
HDFS. The configs</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># contain mostly Kafka connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You can use the HDFS web browser to look at the tables |
| <code>http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow</code>.</p><p>You can explore the new partition folder created in the table along with a "commit" / "deltacommit" |
| file under .hoodie which signals a successful commit.</p><p>There will be a similar setup when you browse the MOR table |
| <code>http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor</code></p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-3-sync-with-hive">Step 3: Sync with Hive<a class="hash-link" href="#step-3-sync-with-hive" title="Direct link to heading"></a></h3><p>At this step, the tables are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions |
| in order to run Hive queries against those tables.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># This command takes in HiveServer URL and COW Hudi table location in HDFS and syncs the HDFS state to Hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jdbc-url jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --user hive \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --pass hive \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --partitioned-by dt \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --base-path /user/hive/warehouse/stock_ticks_cow \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --database default \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table stock_ticks_cow \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">.....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2020-01-25 19:51:28,953 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">.....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jdbc-url jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --user hive \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --pass hive \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --partitioned-by dt \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --base-path /user/hive/warehouse/stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --database default \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">...</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2020-01-25 19:51:51,066 INFO 
[main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">...</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2020-01-25 19:51:51,569 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>After executing the above command, you will notice</p><ol><li>A hive table named <code>stock_ticks_cow</code> created which supports Snapshot and Incremental queries on Copy On Write table.</li><li>Two new tables <code>stock_ticks_mor_rt</code> and <code>stock_ticks_mor_ro</code> created for the Merge On Read table. The former |
| supports Snapshot and Incremental queries (providing near-real-time data) while the latter supports ReadOptimized queries.</li></ol><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-4-a-run-hive-queries">Step 4 (a): Run Hive Queries<a class="hash-link" href="#step-4-a-run-hive-queries" title="Direct link to heading"></a></h3><p>Run a Hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both snapshot |
| (for both the COW and MOR _rt tables) and read-optimized queries (for the MOR _ro table) give the same value "10:29 a.m" as Hudi creates a |
| parquet file for the first batch of data.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">beeline -u jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.stats.autogather=false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># List Tables</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> show tables;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| tab_name |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| stock_ticks_cow |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| stock_ticks_mor_ro |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| stock_ticks_mor_rt |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+---------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">3 rows selected (1.199 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Look at partitions that were added</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| partition |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| dt=2018-08-31 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (0.24 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># COPY-ON-WRITE Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">=========================</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:29:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Now, run a projection query:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924221953 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge-On-Read Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">==========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Lets run similar queries against M-O-R table. Lets look at both </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">ReadOptimized and Snapshot(realtime data) queries supported by M-O-R table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run ReadOptimized Query. 
Notice that the latest timestamp is 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:29:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (6.326 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run Snapshot Query. 
Notice that the latest timestamp is again 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:29:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.606 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run Read Optimized and Snapshot project queries</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: 
jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-4-b-run-spark-sql-queries">Step 4 (b): Run Spark-SQL Queries<a class="hash-link" href="#step-4-b-run-spark-sql-queries" title="Direct link to heading"></a></h3><p>Hudi supports Spark as a query processor, just like Hive. Here are the same Hive queries |
| running in spark-sql</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-1 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$SPARK_INSTALL/bin/spark-shell \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jars $HUDI_SPARK_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --master local[2] \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-class-path $HADOOP_CONF_DIR \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --conf spark.sql.hive.convertMetastoreParquet=false \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --deploy-mode client \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-memory 1G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --executor-memory 3G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --num-executors 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">...</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Welcome to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ____ __</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> / __/__ ___ 
_____/ /__</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _\ \/ _ \/ _ `/ __/ '_/</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> /___/ .__/\_,_/_/ /_/\_\ version 2.4.4</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> /_/</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Type in expressions to have them evaluated.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Type :help for more information.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("show tables").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+--------+------------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|database|tableName |isTemporary|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+--------+------------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|default |stock_ticks_cow |false |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|default |stock_ticks_mor_ro|false |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|default |stock_ticks_mor_rt|false |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+--------+------------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Copy-On-Write Table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">## Run max timestamp query against COW table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[Stage 0:> (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SLF4J: Defaulting to no-operation (NOP) logger implementation</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|symbol|max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|GOOG |2018-08-31 10:29:00|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">## Projection Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|_hoodie_commit_time|symbol|ts |volume|open |close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924221953 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924221953 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge-On-Read Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">==========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">Lets run similar queries against M-O-R table. Lets look at both</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">ReadOptimized and Snapshot queries supported by M-O-R table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run ReadOptimized Query. Notice that the latest timestamp is 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|symbol|max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|GOOG |2018-08-31 10:29:00|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run Snapshot Query. 
Notice that the latest timestamp is again 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|symbol|max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|GOOG |2018-08-31 10:29:00|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run Read Optimized and Snapshot project queries</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|_hoodie_commit_time|symbol|ts |volume|open |close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|_hoodie_commit_time|symbol|ts |volume|open |close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+-------------------+------+-------------------+------+---------+--------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB 
clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-4-c-run-presto-queries">Step 4 (c): Run Presto Queries<a class="hash-link" href="#step-4-c-run-presto-queries" title="Direct link to heading"></a></h3><p>Here are the Presto queries for similar Hive and Spark queries. </p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><ul><li> Currently, Presto does not support snapshot or incremental queries on Hudi tables. </li><li> This section of the demo is not supported for Mac AArch64 users at this time. 
</li></ul></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto> show catalogs;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Catalog</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">-----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hive</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> jmx</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> localfile</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> system</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(4 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190817_134851_00000_j8rcz, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 19 total, 19 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:04 [0 rows, 0B] [0 rows/s, 0B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto> use 
hive.default;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">USE</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> show tables;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Table</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_cow</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_mor_ro</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_mor_rt</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(3 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181000_00001_segyw, FINISHED, 2 nodes</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 19 total, 19 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:05 [3 rows, 99B] [0 rows/s, 18B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># COPY-ON-WRITE Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">=========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181011_00002_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:12 [197 rows, 613B] [16 rows/s, 50B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180221 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181141_00003_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 [197 rows, 613B] [109 rows/s, 341B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge-On-Read Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">==========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Lets run similar queries against M-O-R table. </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run ReadOptimized Query. 
Notice that the latest timestamp is 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181158_00004_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 [197 rows, 613B] [110 rows/s, 343B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181256_00006_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 [197 rows, 613B] [92 rows/s, 286B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-4-d-run-trino-queries">Step 4 (d): Run Trino Queries<a class="hash-link" href="#step-4-d-run-trino-queries" title="Direct link to heading"></a></h3><p>Here are the similar queries with Trino.</p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 
.52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><ul><li> Currently, Trino does not support snapshot or incremental queries on Hudi tables. </li><li> This section of the demo is not supported for Mac AArch64 users at this time. </li></ul></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 trino --server trino-coordinator-1:8091</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino> show catalogs;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Catalog </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hive </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> system </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055038_00000_sac73, FINISHED, 1 
node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 19 total, 19 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">3.74 [0 rows, 0B] [0 rows/s, 0B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino> use hive.default;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">USE</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> show tables;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Table </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_cow </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_mor_ro </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> stock_ticks_mor_rt </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(3 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055050_00003_sac73, FINISHED, 2 nodes</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 19 total, 19 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1.84 [3 rows, 102B] [1 rows/s, 55B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" 
style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># COPY-ON-WRITE Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">=========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055101_00005_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">4.08 [197 rows, 442KB] [48 rows/s, 108KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close 
</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054822108 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054822108 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055113_00006_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.40 [197 rows, 450KB] [487 rows/s, 1.09MB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge-On-Read Queries:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">==========================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Lets run similar queries against MOR table.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run ReadOptimized Query. 
Notice that the latest timestamp is 10:29</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055125_00007_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.50 [197 rows, 442KB] [395 rows/s, 888KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> 20220112054844841 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054844841 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055136_00008_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.49 [197 rows, 450KB] [404 rows/s, 924KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-5-upload-second-batch-to-kafka-and-run-deltastreamer-to-ingest">Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest<a class="hash-link" href="#step-5-upload-second-batch-to-kafka-and-run-deltastreamer-to-ingest" title="Direct link to heading"></a></h3><p>Upload the second batch of data and ingest this batch using delta-streamer. As this batch does not bring in any new |
| partitions, there is no need to run hive-sync</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cat docker/demo/data/batch_2.json | kcat -b kafkabroker -t stock_ticks -P</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Within Docker container, run the ingestion command</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table-type COPY_ON_WRITE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field ts \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path 
/user/hive/warehouse/stock_ticks_cow \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table stock_ticks_cow \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props /var/demo/config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">spark-submit \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --table-type MERGE_ON_READ \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --source-ordering-field ts \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-base-path /user/hive/warehouse/stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --target-table stock_ticks_mor \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --props /var/demo/config/kafka-source.properties \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --disable-compaction</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>With Copy-On-Write table, the second ingestion by DeltaStreamer resulted in a new version of Parquet file getting created. |
| See <code>http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow/2018/08/31</code></p><p>With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. |
| Take a look at the HDFS filesystem to get an idea: <code>http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor/2018/08/31</code></p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-6-a-run-hive-queries">Step 6 (a): Run Hive Queries<a class="hash-link" href="#step-6-a-run-hive-queries" title="Direct link to heading"></a></h3><p>With Copy-On-Write table, the Snapshot query immediately sees the changes as part of second batch once the batch |
| is committed, as each ingestion creates newer versions of Parquet files.</p><p>With the Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. |
| This is the point at which ReadOptimized and Snapshot queries will return different results. The ReadOptimized query will still |
| return "10:29 am" as it only reads from the Parquet file. The Snapshot query will do an on-the-fly merge and return the |
| latest committed data which is "10:59 a.m".</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">beeline -u jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.stats.autogather=false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Copy On Write Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. 
spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.932 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 
|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge On Read Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. 
spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:29:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.6 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 
|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Snapshot Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-6-b-run-spark-sql-queries">Step 6 (b): Run Spark SQL Queries<a class="hash-link" href="#step-6-b-run-spark-sql-queries" title="Direct link to heading"></a></h3><p>Running the same queries in Spark-SQL:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-1 /bin/bash</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">$SPARK_INSTALL/bin/spark-shell \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jars $HUDI_SPARK_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-class-path $HADOOP_CONF_DIR \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --conf spark.sql.hive.convertMetastoreParquet=false \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --deploy-mode client \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-memory 1G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --master local[2] \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --executor-memory 3G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --num-executors 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Copy On Write Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|symbol|max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">|GOOG |2018-08-31 10:59:00|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+------+-------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">As you can notice, the above queries now reflect the changes 
that came as part of ingesting second batch.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge On Read Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:29:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.6 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Snapshot Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-6-c-run-presto-queries">Step 6 (c): Run Presto Queries<a class="hash-link" href="#step-6-c-run-presto-queries" title="Direct link to 
heading"></a></h3><p>Running the same queries on Presto for ReadOptimized queries. </p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>This section of the demo is not supported for Mac AArch64 users at this time.</p></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto> use hive.default;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">USE</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Copy On Write Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" 
style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default>select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:59:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181530_00007_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 [197 rows, 613B] [125 rows/s, 389B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180221 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 
1230.02</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822181433 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181545_00008_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 [197 rows, 613B] [106 rows/s, 332B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge On Read Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | 
_col1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181602_00009_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:01 [197 rows, 613B] [139 rows/s, 435B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180250 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_181615_00010_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:01 [197 rows, 613B] [154 rows/s, 480B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-6-d-run-trino-queries">Step 6 (d): Run Trino Queries<a class="hash-link" href="#step-6-d-run-trino-queries" title="Direct link to heading"></a></h3><p>Running the same queries on Trino for Read-Optimized queries.</p><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>This section of the demo is not supported for 
Mac AArch64 users at this time.</p></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 trino --server trino-coordinator-1:8091</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino> use hive.default;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">USE</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Copy On Write Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:59:00 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055443_00012_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done 
(100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.63 [197 rows, 442KB] [310 rows/s, 697KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054822108 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112055352654 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055450_00013_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.65 [197 rows, 450KB] [303 rows/s, 692KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">As you can notice, the above 
queries now reflect the changes that came as part of ingesting second batch.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Merge On Read Table:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:29:00 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055500_00014_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.59 [197 rows, 442KB] [336 rows/s, 756KB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol 
= 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054844841 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20220112054844841 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20220112_055506_00015_sac73, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0.35 [197 rows, 450KB] [556 rows/s, 1.24MB/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">trino:default> exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-7-a-incremental-query-for-copy-on-write-table">Step 7 (a): Incremental Query for COPY-ON-WRITE Table<a class="hash-link" href="#step-7-a-incremental-query-for-copy-on-write-table" title="Direct link to heading"></a></h3><p>With 2 batches of data ingested, lets showcase the 
support for incremental queries in Hudi Copy-On-Write tables</p><p>Lets take the same projection query example</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">beeline -u jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.stats.autogather=false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924064621 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>As you notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order. |
| When you follow the steps, you will be getting different timestamps for commits. Substitute them |
| in place of the above timestamps.</p><p>To show the effects of incremental-query, let us assume that a reader has already seen the changes as part of |
| ingesting the first batch. Now, for the reader to see the effect of the second batch, they have to set the start timestamp to |
| the commit time of the first batch (20180924064621) and run an incremental query.</p><p>Hudi incremental mode provides efficient scanning for incremental queries by filtering out files that do not have any |
| candidate rows using hudi-managed metadata.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">beeline -u jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.stats.autogather=false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">No rows affected (0.009 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">No rows affected (0.009 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>With the above setting, file-ids that do not have any updates from the commit 
20180924065039 are filtered out without scanning. |
| Here is the incremental query :</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (0.83 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000></span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor 
anchorWithStickyNavbar_y2LR" id="step-7-b-incremental-query-with-spark-sql">Step 7 (b): Incremental Query with Spark SQL:<a class="hash-link" href="#step-7-b-incremental-query-with-spark-sql" title="Direct link to heading"></a></h3><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-1 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$SPARK_INSTALL/bin/spark-shell \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jars $HUDI_SPARK_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-class-path $HADOOP_CONF_DIR \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --conf spark.sql.hive.convertMetastoreParquet=false \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --deploy-mode client \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-memory 1G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --master local[2] \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --executor-memory 3G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --num-executors 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Welcome to</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
____ __</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> / __/__ ___ _____/ /__</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _\ \/ _ \/ _ `/ __/ '_/</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> /___/ .__/\_,_/_/ /_/\_\ version 2.4.4</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> /_/</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Type in expressions to have them evaluated.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Type :help for more information.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> import org.apache.hudi.DataSourceReadOptions</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">import org.apache.hudi.DataSourceReadOptions</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># In the below query, 20180924064621 is the first commit's timestamp</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> val hoodieIncViewDF = spark.read.format("org.apache.hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, 
"20180924064621").load("/user/hive/warehouse/stock_ticks_cow")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SLF4J: Defaulting to no-operation (NOP) logger implementation</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1")</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">warning: there was one deprecation warning; re-run with -deprecation for details</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr_tmp1 where symbol = 'GOOG'").show(100, false);</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-8-schedule-and-run-compaction-for-merge-on-read-table">Step 8: Schedule and Run Compaction for Merge-On-Read table<a class="hash-link" href="#step-8-schedule-and-run-compaction-for-merge-on-read-table" title="Direct link to heading"></a></h3><p>Lets schedule and run a compaction to create a new version of columnar file so that read-optimized readers will see fresher data. |
| Again, You can use Hudi CLI to manually schedule and run compaction</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-1 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">root@adhoc-1:/opt# /var/hoodie/ws/hudi-cli/hudi-cli.sh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">...</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Table command getting loaded</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">HoodieSplashScreen loaded</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">===================================================================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* ___ ___ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* /\__\ ___ /\ \ ___ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / / / /\__\ / \ \ /\ \ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / /__/ / / / / /\ \ \ \ \ \ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / \ \ ___ / / / / / \ \__\ / \__\ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / /\ \ /\__\ / /__/ ___ / /__/ \ |__| / /\/__/ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* \/ \ \/ / / \ \ \ /\__\ \ \ \ / / / /\/ / / *</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">* \ / / \ \ / / / \ \ / / / \ /__/ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / / / \ \/ / / \ \/ / / \ \__\ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* / / / \ / / \ / / \/__/ *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* \/__/ \/__/ \/__/ Apache Hudi CLI *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">* *</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">===================================================================</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Welcome to Apache Hudi CLI. Please type help if you are looking for help.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hudi->connect --path /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Metadata for table stock_ticks_mor loaded</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->compactions show all</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">20/02/10 03:41:32 INFO timeline.HoodieActiveTimeline: Loaded instants [[20200210015059__clean__COMPLETED], [20200210015059__deltacommit__COMPLETED], [20200210022758__clean__COMPLETED], [20200210022758__deltacommit__COMPLETED], [==>20200210023843__compaction__REQUESTED]]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">___________________________________________________________________</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">| Compaction Instant Time| State | Total FileIds to be Compacted|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|==================================================================|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Schedule a compaction. This will use Spark Launcher to schedule compaction</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Compaction successfully completed for 20180924070031</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Now refresh and check again. 
You will see that there is a new compaction requested</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->refresh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Metadata for table stock_ticks_mor loaded</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->compactions show all</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">___________________________________________________________________</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| Compaction Instant 
Time| State | Total FileIds to be Compacted|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|==================================================================|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | REQUESTED| 1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->compaction run --compactionInstant 20180924070031 --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Compaction successfully completed for 20180924070031</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">## Now check if compaction is completed</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->refresh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:03:00 INFO 
table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Metadata for table stock_ticks_mor loaded</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">hoodie:stock_ticks_mor->compactions show all</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">___________________________________________________________________</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| Compaction Instant Time| State | Total FileIds to be Compacted|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">|==================================================================|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | COMPLETED| 1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor 
anchorWithStickyNavbar_y2LR" id="step-9-run-hive-queries-including-incremental-queries">Step 9: Run Hive Queries including incremental queries<a class="hash-link" href="#step-9-run-hive-queries-including-incremental-queries" title="Direct link to heading"></a></h3><p>You will see that both ReadOptimized and Snapshot queries will show the latest committed data. |
| Let's also run the incremental query for the MOR table. |
| From looking at the below query output, it will be clear that the first commit time for the MOR table is 20180924064636 |
| and the second commit time is 20180924070031</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-2 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">beeline -u jdbc:hive2://hiveserver:10000 \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --hiveconf hive.stats.autogather=false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. 
spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.6 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 
|</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Snapshot Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | _c1 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Incremental Query:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">No rows affected (0.008 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Max-Commits covers both second batch and compaction commit</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3;</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">No rows affected (0.007 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">No rows affected (0.013 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Query:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+--+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">exit</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB 
clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-10-read-optimized-and-snapshot-queries-for-mor-with-spark-sql-after-compaction">Step 10: Read Optimized and Snapshot queries for MOR with Spark-SQL after compaction<a class="hash-link" href="#step-10-read-optimized-and-snapshot-queries-for-mor-with-spark-sql-after-compaction" title="Direct link to heading"></a></h3><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it adhoc-1 /bin/bash</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$SPARK_INSTALL/bin/spark-shell \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --jars $HUDI_SPARK_BUNDLE \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-class-path $HADOOP_CONF_DIR \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --conf spark.sql.hive.convertMetastoreParquet=false \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --deploy-mode client \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --driver-memory 1G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --master local[2] \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --executor-memory 3G \</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --num-executors 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" 
style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">1 row selected (1.6 seconds)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Snapshot Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| symbol | max(ts) |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| GOOG | 2018-08-31 10:59:00 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+---------+----------------------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| _hoodie_commit_time | symbol | ts | volume | open | close |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">+----------------------+---------+----------------------+---------+------------+-----------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3 class="anchor anchorWithStickyNavbar_y2LR" id="step-11--presto-read-optimized-queries-on-mor-table-after-compaction">Step 11: Presto Read Optimized queries on MOR table after compaction<a class="hash-link" href="#step-11--presto-read-optimized-queries-on-mor-table-after-compaction" title="Direct link to heading"></a></h3><div class="admonition admonition-note alert alert--secondary"><div class="admonition-heading"><h5><span class="admonition-icon"><svg xmlns="http://www.w3.org/2000/svg" width="14" height="16" viewBox="0 0 14 16"><path fill-rule="evenodd" d="M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 
.48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"></path></svg></span>note</h5></div><div class="admonition-content"><p>This section of the demo is not supported for Mac AArch64 users at this time.</p></div></div><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto> use hive.default;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">USE</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"># Read Optimized Query</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> symbol | _col1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--------+---------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GOOG | 2018-08-31 10:59:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(1 row)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" 
style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_182319_00011_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 49 total, 49 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:01 [197 rows, 613B] [133 rows/s, 414B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> _hoodie_commit_time | symbol | ts | volume | open | close</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">---------------------+--------+---------------------+--------+-----------+----------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822180250 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 20190822181944 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(2 rows)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Query 20190822_182333_00012_segyw, FINISHED, 1 node</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Splits: 17 total, 17 done (100.00%)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">0:02 
[197 rows, 613B] [98 rows/s, 307B/s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">presto:default></span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This brings the demo to an end.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="testing-hudi-in-local-docker-environment">Testing Hudi in Local Docker environment<a class="hash-link" href="#testing-hudi-in-local-docker-environment" title="Direct link to heading"></a></h2><p>You can bring up a Hadoop Docker environment containing Hadoop, Hive and Spark services with support for Hudi.</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ mvn pre-integration-test -DskipTests</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The above command builds Docker images for all the services with |
| current Hudi source installed at /var/hoodie/ws and also brings up the services using a compose file. We |
| currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.4.4) in Docker images.</p><p>To bring down the containers</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ cd hudi-integ-test</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ mvn docker-compose:down</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If you want to bring up the Docker containers, use</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ cd hudi-integ-test</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">$ mvn docker-compose:up -DdetachedMode=true</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Hudi is a library that is operated in a broader data analytics/ingestion environment |
| involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are |
| actively adding integration-tests under <strong>hudi-integ-test/src/test/java</strong> that make use of this |
| docker environment (see <strong>hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java</strong>)</p><h3 class="anchor anchorWithStickyNavbar_y2LR" id="building-local-docker-containers">Building Local Docker Containers:<a class="hash-link" href="#building-local-docker-containers" title="Direct link to heading"></a></h3><p>The Docker images required for the demo and for running integration tests are already on Docker Hub. The Docker images |
| and compose scripts are carefully implemented so that they serve a dual purpose:</p><ol><li>The Docker images have inbuilt Hudi jar files with environment variables pointing to those jars (HUDI_HADOOP_BUNDLE, ...)</li><li>For running integration-tests, we need the jars generated locally to be used for running services within docker. The |
| docker-compose scripts (see <code>docker/compose/docker-compose_hadoop284_hive233_spark244.yml</code>) ensure local jars override |
| inbuilt jars by mounting local Hudi workspace over the Docker location</li><li>As these Docker containers have mounted local Hudi workspace, any changes that happen in the workspace would automatically |
| be reflected in the containers. This is a convenient way of developing and verifying Hudi for |
| developers who do not own a distributed environment. Note that this is how integration tests are run.</li></ol><p>This helps avoid maintaining separate Docker images and avoids the costly step of building Hudi Docker images locally. |
| But if users want to test Hudi from locations with lower network bandwidth, they can still |
| run the script |
| <code>docker/build_local_docker_images.sh</code> to build local Docker images before running <code>docker/setup_demo.sh</code></p><p>Here are the commands:</p><div class="codeBlockContainer_J+bg language-java theme-code-block"><div class="codeBlockContent_csEI java"><pre tabindex="0" class="prism-code language-java codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">cd docker</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./build_local_docker_images.sh</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">.....</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] Reactor Summary:</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] Hudi ............................................... SUCCESS [ 2.507 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-common ........................................ SUCCESS [ 15.181 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-aws ........................................... SUCCESS [ 2.621 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-timeline-service .............................. SUCCESS [ 1.811 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-client ........................................ 
SUCCESS [ 0.065 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-client-common ................................. SUCCESS [ 8.308 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-mr ..................................... SUCCESS [ 3.733 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark-client .................................. SUCCESS [ 18.567 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-sync-common ................................... SUCCESS [ 0.794 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hive-sync ..................................... SUCCESS [ 3.691 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark-datasource .............................. SUCCESS [ 0.121 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark-common_2.11 ............................. SUCCESS [ 12.979 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark2_2.11 ................................... SUCCESS [ 12.516 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark_2.11 .................................... SUCCESS [ 35.649 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-utilities_2.11 ................................ SUCCESS [ 5.881 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-utilities-bundle_2.11 ......................... 
SUCCESS [ 12.661 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-cli ........................................... SUCCESS [ 19.858 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-java-client ................................... SUCCESS [ 3.221 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-flink-client .................................. SUCCESS [ 5.731 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark3_2.12 ................................... SUCCESS [ 8.627 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-dla-sync ...................................... SUCCESS [ 1.459 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-sync .......................................... SUCCESS [ 0.053 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-mr-bundle .............................. SUCCESS [ 5.652 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hive-sync-bundle .............................. SUCCESS [ 1.623 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-spark-bundle_2.11 ............................. SUCCESS [ 10.930 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-presto-bundle ................................. SUCCESS [ 3.652 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-timeline-server-bundle ........................ 
SUCCESS [ 4.804 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-trino-bundle .................................. SUCCESS [ 5.991 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-docker ................................. SUCCESS [ 2.061 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-base-docker ............................ SUCCESS [ 53.372 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-base-java11-docker ..................... SUCCESS [ 48.545 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-namenode-docker ........................ SUCCESS [ 6.098 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-datanode-docker ........................ SUCCESS [ 4.825 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-history-docker ......................... SUCCESS [ 3.829 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-hive-docker ............................ SUCCESS [ 52.660 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-sparkbase-docker ....................... SUCCESS [01:02 min]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-sparkmaster-docker ..................... SUCCESS [ 12.661 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-sparkworker-docker ..................... 
SUCCESS [ 4.350 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-sparkadhoc-docker ...................... SUCCESS [ 59.083 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-presto-docker .......................... SUCCESS [01:31 min]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-trinobase-docker ....................... SUCCESS [02:40 min]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-trinocoordinator-docker ................ SUCCESS [ 14.003 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-hadoop-trinoworker-docker ..................... SUCCESS [ 12.100 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-integ-test .................................... SUCCESS [ 13.581 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-integ-test-bundle ............................. SUCCESS [ 27.212 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-examples ...................................... SUCCESS [ 8.090 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-flink_2.11 .................................... SUCCESS [ 4.217 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-kafka-connect ................................. SUCCESS [ 2.966 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-flink-bundle_2.11 ............................. 
SUCCESS [ 11.155 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] hudi-kafka-connect-bundle .......................... SUCCESS [ 12.369 s]</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] ------------------------------------------------------------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] BUILD SUCCESS</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] ------------------------------------------------------------------------</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] Total time: 14:35 min</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] Finished at: 2022-01-12T18:41:27-08:00</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[INFO] ------------------------------------------------------------------------</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/apache/hudi/tree/asf-site/website/versioned_docs/version-0.13.1/docker_demo.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_mt2f"></div></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><div class="pagination-nav__item"><a 
class="pagination-nav__link" href="/cn/docs/0.13.1/flink-quick-start-guide"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Flink Guide</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/cn/docs/0.13.1/timeline"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Timeline</div></a></div></nav></div></div><div class="col col--3"><div class="tableOfContents_vrFS thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#a-demo-using-docker-containers" class="table-of-contents__link toc-highlight">A Demo using Docker containers</a><ul><li><a href="#prerequisites" class="table-of-contents__link toc-highlight">Prerequisites</a></li></ul></li><li><a href="#setting-up-docker-cluster" class="table-of-contents__link toc-highlight">Setting up Docker Cluster</a><ul><li><a href="#build-hudi" class="table-of-contents__link toc-highlight">Build Hudi</a></li><li><a href="#bringing-up-demo-cluster" class="table-of-contents__link toc-highlight">Bringing up Demo Cluster</a></li></ul></li><li><a href="#demo" class="table-of-contents__link toc-highlight">Demo</a><ul><li><a href="#step-1--publish-the-first-batch-to-kafka" class="table-of-contents__link toc-highlight">Step 1 : Publish the first batch to Kafka</a></li><li><a href="#step-2-incrementally-ingest-data-from-kafka-topic" class="table-of-contents__link toc-highlight">Step 2: Incrementally ingest data from Kafka topic</a></li><li><a href="#step-3-sync-with-hive" class="table-of-contents__link toc-highlight">Step 3: Sync with Hive</a></li><li><a href="#step-4-a-run-hive-queries" class="table-of-contents__link toc-highlight">Step 4 (a): Run Hive Queries</a></li><li><a href="#step-4-b-run-spark-sql-queries" class="table-of-contents__link toc-highlight">Step 4 (b): Run Spark-SQL Queries</a></li><li><a href="#step-4-c-run-presto-queries" 
class="table-of-contents__link toc-highlight">Step 4 (c): Run Presto Queries</a></li><li><a href="#step-4-d-run-trino-queries" class="table-of-contents__link toc-highlight">Step 4 (d): Run Trino Queries</a></li><li><a href="#step-5-upload-second-batch-to-kafka-and-run-deltastreamer-to-ingest" class="table-of-contents__link toc-highlight">Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest</a></li><li><a href="#step-6-a-run-hive-queries" class="table-of-contents__link toc-highlight">Step 6 (a): Run Hive Queries</a></li><li><a href="#step-6-b-run-spark-sql-queries" class="table-of-contents__link toc-highlight">Step 6 (b): Run Spark SQL Queries</a></li><li><a href="#step-6-c-run-presto-queries" class="table-of-contents__link toc-highlight">Step 6 (c): Run Presto Queries</a></li><li><a href="#step-6-d-run-trino-queries" class="table-of-contents__link toc-highlight">Step 6 (d): Run Trino Queries</a></li><li><a href="#step-7-a-incremental-query-for-copy-on-write-table" class="table-of-contents__link toc-highlight">Step 7 (a): Incremental Query for COPY-ON-WRITE Table</a></li><li><a href="#step-7-b-incremental-query-with-spark-sql" class="table-of-contents__link toc-highlight">Step 7 (b): Incremental Query with Spark SQL:</a></li><li><a href="#step-8-schedule-and-run-compaction-for-merge-on-read-table" class="table-of-contents__link toc-highlight">Step 8: Schedule and Run Compaction for Merge-On-Read table</a></li><li><a href="#step-9-run-hive-queries-including-incremental-queries" class="table-of-contents__link toc-highlight">Step 9: Run Hive Queries including incremental queries</a></li><li><a href="#step-10-read-optimized-and-snapshot-queries-for-mor-with-spark-sql-after-compaction" class="table-of-contents__link toc-highlight">Step 10: Read Optimized and Snapshot queries for MOR with Spark-SQL after compaction</a></li><li><a href="#step-11--presto-read-optimized-queries-on-mor-table-after-compaction" class="table-of-contents__link toc-highlight">Step 
11: Presto Read Optimized queries on MOR table after compaction</a></li></ul></li><li><a href="#testing-hudi-in-local-docker-environment" class="table-of-contents__link toc-highlight">Testing Hudi in Local Docker environment</a><ul><li><a href="#building-local-docker-containers" class="table-of-contents__link toc-highlight">Building Local Docker Containers:</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">About</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/cn/blog/2021/07/21/streaming-data-lake-platform">Our Vision</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/concepts">Concepts</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/community/team">Team</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/releases/release-0.14.1">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/releases/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/powered-by">Who's Using</a></li></ul></div><div class="col footer__col"><div class="footer__title">Learn</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/cn/docs/quick-start-guide">Quick Start</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/docker_demo">Docker Demo</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/talks">Talks</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/videos">Video Guides</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/faq">FAQ</a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank" 
rel="noopener noreferrer" class="footer__link-item"><span>Technical Wiki<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li></ul></div><div class="col footer__col"><div class="footer__title">Hudi On Cloud</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/cn/docs/s3_hoodie">AWS</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/gcs_hoodie">Google Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/oss_hoodie">Alibaba Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/azure_hoodie">Microsoft Azure</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/cos_hoodie">Tencent Cloud</a></li><li class="footer__item"><a class="footer__link-item" href="/cn/docs/ibm_cos_hoodie">IBM Cloud</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/cn/community/get-involved">Get Involved</a></li><li class="footer__item"><a href="https://join.slack.com/t/apache-hudi/shared_invite/zt-2ggm1fub8-_yt4Reu9djwqqVRFC7X49g" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://github.com/apache/hudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" 
class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://twitter.com/ApacheHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.youtube.com/channel/UCs7AhE0BWaEPZSChrBR-Muw" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>YouTube<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/apache-hudi/?viewAsMember=true" target="_blank" rel="noopener noreferrer" class="footer__link-item"><span>LinkedIn<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a></li><li class="footer__item"><a href="mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><div class="footer__title">Apache</div><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a 
href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://hudi.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_SRtH"><img src="/cn/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/cn/assets/images/logo-big.png" alt="Apache Hudi™" class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footer__copyright">Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. |
| Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation. <a href="/docs/privacy">Privacy Policy</a></div></div></div></footer></div> |
| <script src="/cn/assets/js/runtime~main.0acdb754.js"></script> |
| <script src="/cn/assets/js/main.6d6aa24f.js"></script> |
| </body> |
| </html> |