| <!doctype html> |
| <html lang="en" dir="ltr"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Docusaurus v2.0.0-beta.1"> |
| <link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Pinot™ Blog RSS Feed"> |
| <link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Pinot™ Blog Atom Feed"> |
| <link rel="preconnect" href="https://www.google-analytics.com"> |
| <link rel="preconnect" href="https://www.googletagmanager.com"> |
| <script async src="https://www.googletagmanager.com/gtag/js?id=G-ZXG79NJEBY"></script> |
| <script>
  // Google Analytics (gtag.js) bootstrap for the async loader requested above.
  // Make sure the global command queue exists before anything is pushed onto it.
  window.dataLayer = window.dataLayer || [];

  // Each gtag(...) call appends its raw `arguments` object to that queue.
  function gtag() {
    dataLayer.push(arguments);
  }

  // Queue the two start-up commands: a "js" timestamp, then the property config.
  gtag("js", new Date());
  gtag("config", "G-ZXG79NJEBY", {});
</script> |
| <link rel="search" type="application/opensearchdescription+xml" title="Apache Pinot™" href="/opensearch.xml"> |
| <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Ubuntu%7CRoboto%7CSource+Code+Pro"> |
| <link rel="stylesheet" href="https://at-ui.github.io/feather-font/css/iconfont.css"><title data-react-helmet="true">19 posts tagged with "data" | Apache Pinot™</title><meta data-react-helmet="true" property="og:title" content="19 posts tagged with &quot;data&quot; | Apache Pinot™"><meta data-react-helmet="true" property="og:url" content="https://pinot.apache.org/blog/tags/data"><meta data-react-helmet="true" name="docsearch:language" content="en"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="blog_tags_posts"><meta data-react-helmet="true" name="keywords" content="Pinot,Text analytics,Text index,User-Facing Analytics,Real-time data platform"><link data-react-helmet="true" rel="shortcut icon" href="/img/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://pinot.apache.org/blog/tags/data"><link data-react-helmet="true" rel="alternate" href="https://pinot.apache.org/blog/tags/data" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://pinot.apache.org/blog/tags/data" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://BH4D9OD16A-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/assets/css/styles.d071271f.css"> |
| <link rel="preload" href="/assets/js/runtime~main.fa80698b.js" as="script"> |
| <link rel="preload" href="/assets/js/main.271ac9bf.js" as="script"> |
| </head> |
| <body> |
| <script>
  // Inline script at the top of <body>: sets the data-theme attribute on <html>.
  (function () {
    function applyTheme(name) {
      document.documentElement.setAttribute("data-theme", name);
    }

    // A stored choice takes priority. Reading storage may throw; if it does,
    // the value stays null and the media queries below decide.
    var stored = null;
    try {
      stored = localStorage.getItem("theme");
    } catch (err) {}

    if (stored !== null) {
      applyTheme(stored);
    } else if (window.matchMedia("(prefers-color-scheme: dark)").matches) {
      applyTheme("dark");
    } else if (window.matchMedia("(prefers-color-scheme: light)").matches) {
      applyTheme("light");
    } else {
      // Neither media query matched: fall back to dark.
      applyTheme("dark");
    }
  })();
</script><div id="__docusaurus"> |
| <div><a href="#main" class="skipToContent_OuoZ shadow--md">Skip to main content</a></div><nav class="navbar navbar--fixed-top navbarHideable_RReh"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><img src="/img/pinot-navbar-logo-722f37.svg" alt="Pinot" class="themedImage_TMUO themedImage--light_4Vu1 navbar__logo"><img src="/img/pinot-navbar-logo-722f37.svg" alt="Pinot" class="themedImage_TMUO themedImage--dark_uzRr navbar__logo"></a></div><div class="navbar__items navbar__items--right"><a href="https://docs.pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">Docs</a><a class="navbar__item navbar__link" href="/download">Download</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog">Blog</a><a href="https://github.com/apache/pinot" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></span></a><div class="react-toggle displayOnlyInLargeViewport_cxYs react-toggle--checked react-toggle--disabled"><div class="react-toggle-track" role="button" tabindex="-1"><div class="react-toggle-track-check"><span class="toggle_iYfV" style="margin-left:2px">🌙</span></div><div class="react-toggle-track-x"><span class="toggle_iYfV" style="margin-left:2px">☀️</span></div><div class="react-toggle-thumb"></div></div><input type="checkbox" checked="" 
class="react-toggle-screenreader-only" aria-label="Switch between dark and light mode"></div><div class="searchBox_NKBi"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><img src="/img/pinot-navbar-logo-722f37.svg" alt="Pinot" class="themedImage_TMUO themedImage--light_4Vu1 navbar__logo"><img src="/img/pinot-navbar-logo-722f37.svg" alt="Pinot" class="themedImage_TMUO themedImage--dark_uzRr navbar__logo"></a></div><div class="navbar-sidebar__items"><div class="menu"><ul class="menu__list"><li class="menu__list-item"><a href="https://docs.pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="menu__link">Docs</a></li><li class="menu__list-item"><a class="menu__link" href="/download">Download</a></li><li class="menu__list-item"><a aria-current="page" class="menu__link navbar__link--active" href="/blog">Blog</a></li><li class="menu__list-item"><a href="https://github.com/apache/pinot" target="_blank" rel="noopener noreferrer" class="menu__link"><span>GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_wgqa"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 
4.172v-11z"></path></svg></span></a></li></ul></div></div></div></nav><div class="main-wrapper blog-wrapper blog-tags-post-page"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_q+wC thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_9G5K margin-bottom--md">All our posts</div><ul class="sidebarItemList_6T4b"><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/09/19/Annoucing-Apache-Pinot-1-0">Announcing Apache Pinot 1.0™</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/08/04/segment-compaction-for-upsert-enabled-tables-in-apache-pinot-3f30657aa077">Segment Compaction for Upsert Enabled Tables in Apache Pinot</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/07/12/star-tree-index-in-apache-pinot-part-3-understanding-the-impact-in-real-customer">Star-Tree Index in Apache Pinot - Part 3 - Understanding the Impact in Real Customer Scenarios</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/06/01/real-time-mastodon-usage-with-apache-kafka-apache-pinot-and-streamlit">Real-Time Mastodon Usage with Apache Kafka, Apache Pinot, and Streamlit</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/05/30/how-to-ingest-streaming-data-from-kafka-to-apache-pinot">How to Ingest Streaming Data from Kafka to Apache Pinot™</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/05/23/change-data-capture-with-apache-pinot-how-does-it-work">Change Data Capture with Apache Pinot - How Does It Work?</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/05/18/apache-pinot-tutorial-for-getting-started-a-step-by-step-guide">Apache Pinot Tutorial for Getting Started - A Step-by-Step Guide</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" 
href="/blog/2023/05/16/star-tree-indexes-in-apache-pinot-part-1-understanding-the-impact-on-query-performance">StarTree Indexes in Apache Pinot Part-1 - Understanding the Impact on Query Performance</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/05/11/Geospatial-Indexing-in-Apache-Pinot">Geospatial Indexing in Apache Pinot</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/2023/03/30/Apache-Pinot-0-12-Consumer-Record-Lag">Apache Pinot™ 0.12 - Consumer Record Lag</a></li></ul></nav></aside><main class="col col--7"><header class="margin-bottom--xl"><h1>19 posts tagged with "data"</h1><a href="/blog/tags">View All Tags</a></header><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/09/19/Annoucing-Apache-Pinot-1-0">Announcing Apache Pinot 1.0™</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-09-19T00:00:00.000Z">September 19, 2023</time> · 13 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Hubert Dulay"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Hubert Dulay</a></div><small class="avatar__subtitle">Hubert Dulay, Mayank Shrivastava, Neha Pawar</small></div></div></header><div class="markdown"><p>By: Hubert Dulay, Mayank Shrivastava, Neha Pawar</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="what-makes-a-10-release"></a>What Makes a “1.0 Release?”<a class="hash-link" href="#what-makes-a-10-release" title="Direct link to heading">#</a></h2><p>Apache Pinot has continuously evolved since the project’s inception within LinkedIn in 2013. 
Back then it was developed at a single company with a single use case in mind: to power “who viewed my profile?” Over the ensuing decade the Apache Pinot community expanded to be embraced by many other organizations, and those organizations have expanded its capabilities to address new use cases. Apache Pinot in 2023 is continuously evolving to address emerging needs in the real-time analytics community. Let’s look at how much innovation has gone into Apache Pinot over the years:</p><ul><li>Upserts — data-in-motion tends to stay in motion, and one of the cornerstone capabilities of Apache Pinot is upsert support to handle upsert mutations in real-time.</li><li>Query-time Native JOINs — it was important to get this right, so that they were performant and scalable, allowing high QPS. This we will discuss in more detail below.</li><li>Pluggable architecture — a broad user base requires the ability to extend the database with new customizable index types, routing strategies and storage options</li><li>Handling Semi-Structured/Unstructured Data — Pinot can easily index JSON and text data types at scale.</li><li>Improving ANSI SQL Compliance — to that end, we’ve added better NULL handling, window functions, and as stated above, the capability for native JOINs.</li></ul><p>With all of these features and capabilities, Apache Pinot moves farther and farther from mere database status, and becomes more of a complete platform that can tackle entire new classes of use cases that were beyond its capabilities in earlier days.</p><p>First let’s look at what Apache Pinot 1.0 itself is delivering. The first foundational pillar of what makes something worthy of a “1.0” release is software quality. Over the past year, since September 2022, engineers across the Apache Pinot community have closed over 300 issues to provide new features, optimize performance, expand test coverage, and squash bugs.</p><p>Features are also a key thing that makes a new release worthy of “1.0” status. 
The most critical part of the 1.0 release is undoubtedly the <a href="https://docs.pinot.apache.org/developers/advanced/v2-multi-stage-query-engine" target="_blank" rel="noopener noreferrer">Multi-Stage Query Engine</a>, which permits Apache Pinot users to do <a href="https://startree.ai/blog/apache-pinot-native-join-support" target="_blank" rel="noopener noreferrer">performant and scalable query-time JOINs</a>.</p><p>The original engine works very well for simpler filter-and-aggregate queries, but the broker could become a bottleneck for more complex queries. The new engine also resolves this by introducing intermediary compute stages on the query servers, and brings Apache Pinot closer to full ANSI SQL semantics. While this query engine has been available within Apache Pinot already (since release 0.11.0), with the release of Apache Pinot 1.0 this feature is functionally complete.</p><p>(While you can read more below, check out the accompanying blog by Apache Pinot PMC Neha Pawar about using query-time JOINs <a href="https://startree.ai/blog/query-time-joins-in-apache-pinot-1-0" target="_blank" rel="noopener noreferrer">here</a>).</p><p>This post is a summary of the high points, but you can find a full list of everything included in the release notes. 
And if you’d like a <a href="https://youtu.be/2cwRHM4J7kI?si=hEtl6W2eNlMkWqag" target="_blank" rel="noopener noreferrer">video treatment of many of the main features in 1.0</a>, including some helpful animations, watch here:</p><iframe width="560" height="315" src="https://www.youtube.com/embed/2cwRHM4J7kI?si=BMVZanJIuXv9o0du" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"></iframe><p>Otherwise, let’s have a look at some of the highlighted changes:</p><ul><li>Join Support - Part of the Multi-Stage Query Engine </li><li>Improved Upserts - Deletion and Compaction Support</li><li>Encode User-Specified Compressed Log Processor (CLP) During Ingestion</li><li>NULL Support</li><li>Pluggable Index Types [Index Service Provider Interface (SPI)]</li><li>Improved Pinot-Spark Integration - Spark3 Compatibility</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="join-support"></a>Join Support<a class="hash-link" href="#join-support" title="Direct link to heading">#</a></h2><p>Apache Pinot 1.0 introduces native query-time JOIN support equipping Pinot to handle a broad spectrum of JOIN scenarios providing full coverage from user-facing analytics all the way up to ad hoc analytics. Underpinning this innovation is the multi-stage query engine, introduced a year ago, which efficiently manages complex analytical queries, including JOIN operations. This engine alleviates computational burdens by offloading tasks from brokers to a dedicated intermediate compute stage. Additionally, a new planner supporting full SQL semantics enhances Pinot's analytical capabilities.</p><p>JOIN optimization strategies play a pivotal role in Apache Pinot 1.0. 
These include predicate push-down to individual tables and using indexing and pruning to reduce scanning which speeds up query processing, smart data layout considerations to minimize data shuffling, and query hints for fine-tuning JOIN operations. With support for all JOIN types and three JOIN algorithms, including broadcast join, shuffle distributed hash join, and lookup join, Apache Pinot delivers versatility and scalability. By significantly reducing query latency and simplifying architecture, Apache Pinot 1.0 is a game-changer for real-time OLAP systems. </p><p>For more detailed information on JOINs, please visit this blog <a href="https://startree.ai/blog/query-time-joins-in-apache-pinot-1-0" target="_blank" rel="noopener noreferrer">post</a>.</p><p>Discover How Uber is using Joins in Apache Pinot |
| For a real-world use case, Uber is already using the new join capabilities of Apache Pinot at scale in production. You can watch this video to learn more.</p><iframe width="560" height="315" src="https://www.youtube.com/embed/z4Chhref1BM?si=eCOfxuw8Y_ZP8ZHN" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"></iframe><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="upsert-improvements"></a>Upsert Improvements<a class="hash-link" href="#upsert-improvements" title="Direct link to heading">#</a></h2><p>Support for upserts is one of the key capabilities Apache Pinot offers that differentiates it from other real-time analytics databases. It is a vital feature when real-time streaming data is prone to frequent updates. While upserts have been available in Apache Pinot since 0.6.0, with 1.0 they include two major new enhancements: segment compaction and delete support for upsert tables.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="segment-compaction-for-upsert-tables"></a>Segment Compaction for Upsert Tables<a class="hash-link" href="#segment-compaction-for-upsert-tables" title="Direct link to heading">#</a></h3><p>Pinot’s Upsert tables store all versions of a record ingested into immutable segments on disk. Older records unnecessarily consume valuable storage space when they’re no longer used in query results. 
Pinot’s Segment Compaction reclaims this valuable storage space by introducing a periodic process that replaces completed segments with compacted segments which only contain the latest version of the records.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"task"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"taskTypeConfigsMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"UpsertCompactionTask"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schedule"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"0 */5 * ? 
* *"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"bufferTimePeriod"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"7d"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"invalidRecordsThresholdPercent"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"30"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"invalidRecordsThresholdCount"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"100000"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The example above, bufferTimePeriod is set to “7d” which means that 
any segment that was completed over 7 days ago may be eligible for compaction. However, if you want to ensure that segments are compacted without any additional delay, this config can be set to “0d”.</p><p>invalidRecordsThresholdPercent is an optional limit on the number of older records allowed in the completed segment, expressed as a percentage of the total number of records in the segment (i.e. old records / total records). In the example, this property is set to “30”, which means that if more than 30% of the records in the completed segment are old, then the segment may be selected for compaction.</p><p>invalidRecordsThresholdCount is a similar limit, but it lets you express the threshold as a record count. In the example above, this property is set to “100000”, which means that if the segment contains more than 100K old records, then it may be selected for compaction.</p><p><a href="https://robert-zych.medium.com/segment-compaction-for-upsert-enabled-tables-in-apache-pinot-3f30657aa077" target="_blank" rel="noopener noreferrer">Read more</a> about the design of this feature.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="delete-support-for-upsert-tables"></a>DELETE Support for Upsert Tables<a class="hash-link" href="#delete-support-for-upsert-tables" title="Direct link to heading">#</a></h3><p>Apache Pinot upsert tables now support deleting records. Supporting delete with upsert avoids the need for the user to explicitly filter out invalid records in the query. <code>SELECT * FROM table WHERE deleted_column != true</code> becomes as simple as <code>SELECT * FROM table</code>. Pinot will only return the latest non-deleted records from the table. 
This feature opens up support for ingesting Change Data Capture (CDC) data, such as that produced by Debezium, where the changes from a (typically mutable) source will contain DELETE events.</p><p>Deletion itself is implemented as a soft delete in Apache Pinot, with a dedicated boolean column that serves as a delete marker for the record. Pinot automatically filters out records that are marked in this column. For more details, please see the <a href="https://docs.pinot.apache.org/basics/data-import/upsert#delete-column" target="_blank" rel="noopener noreferrer">documentation</a>.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="null-value-support"></a>NULL Value Support<a class="hash-link" href="#null-value-support" title="Direct link to heading">#</a></h2><p>This feature enables Postgres-compatible NULL semantics in Apache Pinot queries. NULL semantics are important for usability and for full SQL compatibility, which many BI applications like Tableau rely upon when invoking queries to render dashboards. Previously, Pinot could not represent NULL. The workaround was to use special values like Integer.MIN_VALUE to stand in for NULL. Now Pinot 1.0 has full support for representing NULL values. 
By adding NULL support, Pinot 1.0 has increased the Tableau certification pass rate by 90%.</p><p>Here are some examples of how NULLs will work in Pinot 1.0.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="aggregations"></a>Aggregations<a class="hash-link" href="#aggregations" title="Direct link to heading">#</a></h3><p>Given the following table below, aggregating columns with NULL values will have this behavior.</p><table><thead><tr><th>col1</th><th>col2</th></tr></thead><tbody><tr><td>1</td><td>NULL</td></tr><tr><td>2</td><td>NULL</td></tr><tr><td>3</td><td>1</td></tr></tbody></table><p>Since col1 does not contain NULL values, all the values are included in the aggregation.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">col1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token comment" style="color:rgb(98, 114, 164)">-- returns 6</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">col1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token 
plain"> </span><span class="token comment" style="color:rgb(98, 114, 164)">-- returns 3</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>In the select statement below, the NULL values in col2 are not included in the aggregation.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">col2</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token comment" style="color:rgb(98, 114, 164)">-- returns 1</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">col2</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token comment" style="color:rgb(98, 114, 164)">-- returns 1</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="group-by"></a>Group By<a class="hash-link" href="#group-by" title="Direct link to 
heading">#</a></h3><p>Pinot now supports grouping by NULL. In the example below, we are grouping by col1, which contains a NULL value. Given the table below, grouping by columns with NULL values will have this behavior.</p><table><thead><tr><th>col1</th></tr></thead><tbody><tr><td>a</td></tr><tr><td>NULL</td></tr><tr><td>b</td></tr><tr><td>a</td></tr></tbody></table><p>The select statement below produces the following result.</p><p><code>select col1, count(*) from table group by col1</code></p><table><thead><tr><th>col1</th><th>count(*)</th></tr></thead><tbody><tr><td>a</td><td>2</td></tr><tr><td>b</td><td>1</td></tr><tr><td>NULL</td><td>1</td></tr></tbody></table><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="sorting"></a>Sorting<a class="hash-link" href="#sorting" title="Direct link to heading">#</a></h3><p>Pinot now allows you to specify the location of NULL values when sorting records. The default is to act as though NULLs are larger than non-NULLs.</p><p>Given this list of values, sorting them will result in the following.</p><p><code>values: 1, 2, 3, NULL</code></p><p>Example 1:</p><p>NULL values sort BEFORE all non-NULL values.</p><p>SQL:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> col </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">table</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" 
style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> col NULLS </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FIRST</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><code>RESULT: NULL, 1, 2, 3 </code></p><p>Example 2:</p><p>NULL values sort AFTER all non-NULL values.</p><p>SQL: </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> col </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">table</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> col </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ASC</span><span class="token plain"> NULLS </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LAST</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><code>RESULT: 1, 2, 3, NULL</code></p><p>Example 3:</p><p>Default behavior is NULL LAST.</p><p>SQL: </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" 
style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> col </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">table</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> col</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><code>RESULT: 1, 2, 3, NULL</code></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="index-pluggability"></a>Index Pluggability<a class="hash-link" href="#index-pluggability" title="Direct link to heading">#</a></h2><p>Today, Pinot supports multiple index types, like forward index, bloom filter, and range index. Before Pinot 1.0, index types were all statically defined, which means that in order to create a new index type, you’d need to rebuild Pinot from source. Ideally that shouldn’t be the case.</p><p>To increase speed of development, <a href="https://github.com/apache/pinot/issues/10183" target="_blank" rel="noopener noreferrer">Index Service Provider Interface (SPI)</a>, or index-spi, reduces friction by adding the ability to include new index types at runtime in Pinot. This opens the ability of adding third party indexes by including an external jar in the classpath and adding some configuration. This opens up Pinot indexing to lower-friction innovation from the community.</p><p>For now, SPI-accessible indexes are limited to single field index types. 
Features like the star-tree index or other multi-column approaches are not yet supported.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="apache-pinot-spark-3-connector-and-passing-pinot-options"></a>Apache Pinot Spark 3 Connector and Passing Pinot Options<a class="hash-link" href="#apache-pinot-spark-3-connector-and-passing-pinot-options" title="Direct link to heading">#</a></h2><p>Apache Spark users can now take advantage of Pinot’s ability to provide high scalability, low latency, and high concurrency within the context of a Spark 3 cluster using the <a href="https://github.com/apache/pinot/blob/master/pinot-connectors/pinot-spark-3-connector/README.md" target="_blank" rel="noopener noreferrer">Apache Pinot Spark 3 Connector</a>.</p><p>This connector supports Apache Spark (2.x and 3.x) as a processor to create and push segment files to the database and can read realtime, offline, and hybrid tables from Pinot.</p><p>Now you can merge your streaming and batch datasets together in Spark to provide a full view of real-time and historical data for your machine learning algorithms and feature stores.</p><p>Performance Features</p><ul><li>Distributed, parallel scan</li><li>Streaming reads using gRPC (optional)</li><li>Column and filter push down to optimize performance</li><li>Support for Pinot’s Query Options that include: maxExecutionThreads, enableNullHandling, skipUpsert, etc.</li></ul><p>Usability Features</p><ul><li>SQL support instead of PQL</li><li>Overlap between realtime and offline segments is queried exactly once for hybrid tables</li><li>Schema discovery - If schema is not specified, the <a href="https://github.com/apache/pinot/blob/master/pinot-connectors/pinot-spark-3-connector/documentation/read_model.md" target="_blank" rel="noopener noreferrer">connector reads the table schema</a> from the Pinot controller, and then converts to the Spark schema.</li></ul><p>Here is an example that reads a Pinot table, by setting the format to “pinot” 
Spark will automatically load the Pinot connector and read the “airlineStats” table. The queryOptions property allows you to provide <a href="https://docs.pinot.apache.org/users/user-guide-query/query-options" target="_blank" rel="noopener noreferrer">Pinot Query Options</a>.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI scala"><pre tabindex="0" class="prism-code language-scala codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">val data = spark.read</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .format("pinot")</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option("table", "airlineStats")</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option("tableType", "offline")</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .option("queryOptions", "enableNullHandling=true,maxExecutionThreads=1")</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .load()</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> .sql("SELECT * FROM airlineStats WHERE DEST = 'SFO'")</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">data.show(100)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="petabyte-scale-log-storage-and-search-in-pinot-with-clp"></a>Petabyte-Scale Log Storage and Search in Pinot with CLP<a class="hash-link" href="#petabyte-scale-log-storage-and-search-in-pinot-with-clp" title="Direct link to heading">#</a></h2><p>Compressed Log Processor (CLP) is a tool capable of losslessly compressing text logs and searching them in their compressed state. It achieves a better compression ratio than general purpose compressors alone, while retaining the ability to search the compressed log events without incurring the performance penalty of fully decompressing them. Part of CLP’s algorithm was deployed within <a href="https://www.uber.com/blog/reducing-logging-cost-by-two-orders-of-magnitude-using-clp/" target="_blank" rel="noopener noreferrer">Uber</a> to compress unstructured Spark logs, as they are generated, achieving an unprecedented compression of 169×.</p><p>Log events are often generated as JSON objects with user-defined schemas, meaning each event may have different keys. Such user-defined schemas make these events challenging to store in a table with a set schema. With Log Storage and Search in Pinot with CLP, users would be able to:</p><ul><li>Store their log events losslessly (without dropping fields)</li><li>Store their logs with some amount of compression</li><li>Query their logs efficiently</li></ul><p>The CLP ingestion pipeline can be used on log events from a stream, such as JSON log events ingested from Kafka. 
The plugin takes two inputs: a JSON record and a list of fields to encode with CLP.</p><p>The fields to encode can be configured as shown:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ...</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.inputformat.clplog.CLPLogMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
</span><span class="token property">"stream.kafka.decoder.prop.fieldsForClpEncoding"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"&lt;field-name-1&gt;,&lt;field-name-2&gt;"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><code>&lt;field-names-1 and 2&gt;</code> are a comma-separated list of fields you wish to encode with CLP.</p><p>You can read the design <a href="https://docs.google.com/document/d/1nHZb37re4mUwEA258x3a2pgX13EWLWMJ0uLEDk1dUyU/edit" target="_blank" rel="noopener noreferrer">document</a> for more details on why and how this feature was implemented.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>Apache Pinot’s evolution is expressly due to the humans behind the code, and in reaching 1.0 release status it is proper and fitting to give credit to the open source project’s key committers. 
Since its early days, over <a href="https://github.com/apache/pinot/graphs/contributors" target="_blank" rel="noopener noreferrer">three hundred contributors</a> have produced more than 1.3 million source lines of code (SLOC).</p><p><img src="https://pinot.apache.org/blogs/apache-pinot-1-0-name-cloud.png" alt="alt"></p><p>The introduction of Apache Pinot 1.0 represents an exceptional stride forward in real-time online analytical processing (OLAP) capabilities, marking a watershed moment in the evolution of real-time analytics systems. This release redefines the limits of what can be achieved in the realm of instant data analysis, presenting a game-changing solution for organizations seeking high throughput and low latency in their OLAP queries. If you would like to get started with Apache Pinot 1.0, you can check out the documentation, and download it now.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="resources"></a>Resources<a class="hash-link" href="#resources" title="Direct link to heading">#</a></h2><p>If you want to try out Apache Pinot, the following resources will help you get started:</p><p>Download page: <a href="https://pinot.apache.org/download/" target="_blank" rel="noopener noreferrer">https://pinot.apache.org/download/</a> </p><p>Getting started: <a href="https://docs.pinot.apache.org/getting-started" target="_blank" rel="noopener noreferrer">https://docs.pinot.apache.org/getting-started</a> </p><p>Join our Slack channel: <a href="https://communityinviter.com/apps/apache-pinot/apache-pinot" target="_blank" rel="noopener noreferrer">https://communityinviter.com/apps/apache-pinot/apache-pinot</a> </p><p>See our upcoming events: <a href="https://www.meetup.com/apache-pinot" target="_blank" rel="noopener noreferrer">https://www.meetup.com/apache-pinot</a> </p><p>Follow us on social media: <a href="https://twitter.com/ApachePinot" target="_blank" rel="noopener noreferrer">https://twitter.com/ApachePinot</a></p></div><footer class="row 
docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/joins">joins</a><a class="margin-horiz--sm" href="/blog/tags/compression">compression</a><a class="margin-horiz--sm" href="/blog/tags/null-support">null support</a><a class="margin-horiz--sm" href="/blog/tags/pluggable-index">pluggable index</a><a class="margin-horiz--sm" href="/blog/tags/spark-integration">spark integration</a></div><div class="col text--right"><a aria-label="Read more about Announcing Apache Pinot 1.0™" href="/blog/2023/09/19/Annoucing-Apache-Pinot-1-0"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/08/04/segment-compaction-for-upsert-enabled-tables-in-apache-pinot-3f30657aa077">Segment Compaction for Upsert Enabled Tables in Apache Pinot</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-08-04T00:00:00.000Z">August 4, 2023</time> · 4 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/robertzych/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Robert Zych"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/robertzych/" target="_blank" rel="noopener noreferrer">Robert Zych</a></div><small class="avatar__subtitle">Software Engineer</small></div></div></header><div class="markdown"><p>I’m happy to share that my 1st feature contribution to the Apache Pinot project (<a href="https://github.com/apache/pinot/pull/10463" target="_blank" rel="noopener noreferrer">Segment compaction for upsert 
enabled real-time tables</a>) was merged recently! In this post, I will briefly discuss the problem segment compaction addresses, how to configure it, and what it looks like in action. If you’re unfamiliar with Pinot’s Upsert features, I recommend reviewing <a href="https://dev.startree.ai/docs/pinot/recipes/upserts-full" target="_blank" rel="noopener noreferrer">Full Upserts in Pinot</a> to get started and <a href="https://docs.pinot.apache.org/basics/data-import/upsert" target="_blank" rel="noopener noreferrer">Stream Ingestion with Upsert</a> for more information.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="context-and-configuration"></a>Context and Configuration<a class="hash-link" href="#context-and-configuration" title="Direct link to heading">#</a></h2><p>As Pinot’s Upsert stores all versions of the record ingested into immutable segments on disk, older records unnecessarily consume valuable storage space when they’re no longer used in query results. Pinot’s Segment Compaction reclaims this valuable storage space by introducing a periodic process that replaces the completed segments with compacted segments which only contain the latest version of the records. 
I recommend reviewing the Minion documentation if you’re unfamiliar with Pinot’s ability to run periodic processes.</p><p>With task scheduling enabled and an available Minion, you can configure segment compaction by adding the following to your table’s config.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"task"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"taskTypeConfigsMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"UpsertCompactionTask"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schedule"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"0 */5 * ? 
* *"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"bufferTimePeriod"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"7d"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"invalidRecordsThresholdPercent"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"30"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"invalidRecordsThresholdCount"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"100000"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>All the configs above (excluding schedule) determine which 
completed segments are selected for compaction.</p><p>bufferTimePeriod is the amount of time that must have elapsed since the segment was completed (i.e. stopped consuming). In the example above, this has been set to “7d” which means that any segment that was completed over 7 days ago may be eligible for compaction. However, if you want to ensure that segments are compacted without any additional delay this config can be set to “0d”.</p><p>invalidRecordsThresholdPercent is a limit to the amount of older records allowed in the completed segment represented as a percentage of the total number of records in the segment (i.e. old records / total records). In the example above, this has been set to “30” which means that if more than 30% of the records in the completed segment are old, then the segment may be selected for compaction. As segment compaction is an expensive operation, it is not recommended to set this config (or invalidRecordsThresholdCount) too close to 1. This config is optional on the condition that invalidRecordsThresholdCount has been set and can be used in conjunction with invalidRecordsThresholdCount.</p><p>invalidRecordsThresholdCount is also a limit similar to invalidRecordsThresholdPercent, but allows you to express the threshold as a record count. In the example above, this has been set to “100000” which means that if the segment contains more than 100K old records then it may be selected for compaction.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="example-use-case"></a>Example Use Case<a class="hash-link" href="#example-use-case" title="Direct link to heading">#</a></h2><p>I’ve created a data set that includes 24M records. The data set contains 240K unique keys that have each been duplicated 100 times.</p><p><img src="https://miro.medium.com/v2/resize:fit:4800/0*gEyUq_Tp4Fycgp2r" alt="alt"></p><p>After ingesting the data set there are 6 segments (5 completed segments + 1 consuming segment) with a total estimated size of 22.8MB. 
Submitting the query “set skipUpsert=true; select count(*) from transcript_upsert” before compaction produces the following query result.</p><p><img src="https://miro.medium.com/v2/resize:fit:1064/0*GE3P1fqAcsr0Xs5A" alt="alt"></p><p>After the compaction tasks are complete, the Minion Task Manager UI reports the following.</p><p><img src="https://miro.medium.com/v2/resize:fit:2000/0*SMxDZNndFwpoeNMI" alt="alt"></p><p>Segment compaction generates a task for each segment to be compacted. 5 tasks were generated in this case because 90% of the records (3.6–4.5M records) are old in all 5 of the completed segments, therefore exceeding the configured thresholds. If a completed segment only contains old records, it is deleted immediately and a task isn’t generated to compact it.</p><p><img src="https://miro.medium.com/v2/resize:fit:1068/0*LB1itt-wCohpz42i" alt="alt"></p><p>Submitting the query again we now see that count matches the set of 240K unique keys.</p><p><img src="https://miro.medium.com/v2/resize:fit:2000/0*cmx4zxoMsD4-tR_u" alt="alt"></p><p>Once compaction has completed and the original segments have been replaced with their compacted counterparts we see that the total number of segments remained the same, but the total estimated size dropped to only 2.77MB! Since compaction can yield very small segments, one improvement would be to merge smaller segments into larger ones as this would improve query latency.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="conclusion"></a>Conclusion<a class="hash-link" href="#conclusion" title="Direct link to heading">#</a></h2><p>In this brief overview of Segment Compaction I covered the problem it addresses, how you can configure it, and demonstrated its ability to reclaim storage space. I’d like to thank Ankit Sultana, Seunghyun Lee, and especially Jackie Jiang for their feedback and support throughout the design and development stages. 
If you have any questions or feedback, I’m available on the Apache Pinot Slack.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/blog-post">blog post</a><a class="margin-horiz--sm" href="/blog/tags/feature-contribution">feature contribution</a><a class="margin-horiz--sm" href="/blog/tags/segment-compaction">segment compaction</a><a class="margin-horiz--sm" href="/blog/tags/apache-pinot-project">Apache Pinot project</a><a class="margin-horiz--sm" href="/blog/tags/older-records">older records</a><a class="margin-horiz--sm" href="/blog/tags/storage-space">storage space</a><a class="margin-horiz--sm" href="/blog/tags/configuration">configuration</a><a class="margin-horiz--sm" href="/blog/tags/impact">impact</a><a class="margin-horiz--sm" href="/blog/tags/freeing-up-storage">freeing up storage</a></div><div class="col text--right"><a aria-label="Read more about Segment Compaction for Upsert Enabled Tables in Apache Pinot" href="/blog/2023/08/04/segment-compaction-for-upsert-enabled-tables-in-apache-pinot-3f30657aa077"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/07/12/star-tree-index-in-apache-pinot-part-3-understanding-the-impact-in-real-customer">Star-Tree Index in Apache Pinot - Part 3 - Understanding the Impact in Real Customer Scenarios</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-07-12T00:00:00.000Z">July 12, 2023</time> · 8 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img 
src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Sandeep Dabade, Kulbir Nijjer"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Sandeep Dabade, Kulbir Nijjer</a></div><small class="avatar__subtitle">Solutions Engineers</small></div></div></header><div class="markdown"><p>In <a href="https://startree.ai/blog/star-tree-indexes-in-apache-pinot-part-1-understanding-the-impact-on-query-performance" target="_blank" rel="noopener noreferrer">part 1 of this blog series</a>, we looked at how a star-tree index brought down standalone query latency on a sizable dataset of ~633M records from 1,513ms to 4ms! — nearly 380x faster. </p><p>In <a href="https://startree.ai/blog/star-tree-indexes-in-apache-pinot-part-2-understanding-the-impact-during-high-concurrency" target="_blank" rel="noopener noreferrer">part 2 of this blog series</a>, we imitated a real production scenario by firing hundreds of concurrent queries using JMeter and showcased how using a star-tree index helped achieve a >95% drop in p90th / p95th / p99th latencies and 126x increase in Throughput.</p><p>In this part, we will cover some real customer stories that have seen 95% to 99% improvement in query performance using Star-Tree Index.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="adtech-use-case"></a>AdTech Use Case<a class="hash-link" href="#adtech-use-case" title="Direct link to heading">#</a></h2><p>This was for a leading AdTech platform and a somewhat typical use case; users of the platform (advertisers, publishers, and influencers) wanted to see fresh metrics on how their activities (such as online content, ad, and email campaigns) were performing in real-time so they could tweak things as needed. 
The application team wanted to provide a rich analytical interface to these users so that not only can they see the current performance but also do custom slicing and dicing of data over a period of time. For example, compare their current campaign performance to one they ran two weeks back, do cohort analysis, and so on.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="why-was-the-existing-system-not-working"></a>Why was the existing system not working?<a class="hash-link" href="#why-was-the-existing-system-not-working" title="Direct link to heading">#</a></h3><p>Their existing tech stack was a mix of OSS and custom-built in-house code, which was both operationally difficult to manage and costly to maintain. Yet more importantly, it wasn’t able to meet the basic throughput and latency requirements required by the platform to sustain user growth as well as provide richer analytic capabilities in the product.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="the-problem-and-challenges"></a>The Problem and Challenges?<a class="hash-link" href="#the-problem-and-challenges" title="Direct link to heading">#</a></h3><p>When the StarTree Sales Engineering team was engaged, the requirements were very clear:</p><ul><li>Throughput: Support 50+ QPS during POC (and 200+ for production)</li><li>Latency: P95th latency of 2s, including the query that needed aggregation of ~2 billion rows</li><li>Scalability: Ability to scale efficiently with future growth in QPS in a non-linear manner</li></ul><p>The biggest challenge was the size of data — 20+ TB and growing — and on top of that, a complex aggregation query driving the summary view for users so they can drill further in to get more details. </p><p>This particular query needed to aggregate close to 2 Billion records at read time and then would be fired for every active user interacting with the platform (so high concurrent QPS). 
In this case, despite applying all relevant indexes available in their existing system, out-of-the-box query performance was still in the 6-8 seconds range, which is expected given that bulk of the work for the query is happening in the aggregation phase and not during the filtering phase (indexing helps with this).</p><p>In other OLAP systems they explored, the only option available to handle this use case was doing ingestion time rollups. In other words, changing the data to higher granularity. However, this obviously means losing access to raw data and also potentially re-bootstrapping if new use cases come down the road that need raw data access.</p><p>This is exactly the type of scenario that the <a href="https://docs.pinot.apache.org/basics/indexing/star-tree-index" target="_blank" rel="noopener noreferrer">Star-Tree Index</a>, unique to Apache Pinot, is designed to address - handle large aggregation queries at scale that need sub-second performance. The best part is you can apply it anytime without any need to reprocess the data or plan any system downtime. (Segment reload to apply table config changes run as a background task in Apache Pinot.) In this specific case, the same query latencies with the star-tree index applied went down to 15 ms. This implicitly meant that with the same infrastructure footprint, StarTree was able to support ~70 QPS (Queries Per Second) vs < 1 QPS for this most complex query; while still keeping the raw data intact.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="data-size-and-infra-footprint-for-the-pilot"></a>Data Size and Infra Footprint for the Pilot: <a class="hash-link" href="#data-size-and-infra-footprint-for-the-pilot" title="Direct link to heading">#</a></h3><ul><li>Total # of records: ~2 Trillion</li><li>Data Size: ~20 TB</li><li>Capacity: 72 vCPUs across 9 Pinot servers (8 vCPU, 64GB per node). 
</li></ul><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="impact-summary"></a>Impact Summary:<a class="hash-link" href="#impact-summary" title="Direct link to heading">#</a></h3><ul><li>99.76% reduction in latency vs. no Star-Tree Index (6.3 seconds to 15 ms)</li><li>99.99999% reduction in amount of data scanned/aggregated per query (>1.8B docs to <2,400)</li></ul><p><img src="https://www.datocms-assets.com/75153/1689174701-image1.png" alt="Visualization of the impact of star-tree index for an AdTech use case with Apache Pinot"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="cybersecurity-use-case"></a>CyberSecurity Use Case:<a class="hash-link" href="#cybersecurity-use-case" title="Direct link to heading">#</a></h2><p>This customer is a cybersecurity company that provides their customers with a real-time threat detection platform with AI, allowing them to analyze network flow logs in real-time with a sophisticated reporting/analytical UI. The initial landing page inside the customer portal is a summary view of everything the platform was monitoring in the user's environment and then provides the capability to drill down into specifics of each. For example, filter requests by a specific application or IP Address.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="why-was-the-existing-system-not-working-1"></a>Why was the existing system not working?<a class="hash-link" href="#why-was-the-existing-system-not-working-1" title="Direct link to heading">#</a></h3><p>Their existing tech stack was a mix of Athena/Presto, which couldn’t meet the throughput and latency requirements with growing data volume across their customers. 
Additionally, operational overhead around managing some of these systems in-house led to increased cost.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="the-problem-and-challenges-1"></a>The Problem and Challenges?<a class="hash-link" href="#the-problem-and-challenges-1" title="Direct link to heading">#</a></h3><p>Some of the key requirements that the StarTree Cloud cluster had to meet:</p><ul><li>Throughput: Up to 200 QPS (200 projected by end of year)</li><li>Latency: &lt;1 second P99</li><li>High ingestion rate: 300k events/sec</li><li>ROI: Provide better cost efficiencies</li></ul><p>Similar to Use case #1, the customer wanted to retain data at the lowest granularity (so no ingestion roll-ups) and, given the time column granularity, faced a similar challenge with running the complex aggregation query that powers the summary view. Additionally, the requirement to get double-digit throughput (QPS) for the POC with the most efficient compute footprint made it quite challenging.</p><p>Given the overhead while doing complex aggregations, efficient filtering (indexes) wasn’t enough - in this case, with 3 * 4-core/32GB nodes, the query took more than 15 seconds. We immediately updated the table config to add a star-tree index and did a segment reload, and the results were phenomenal — query latency was reduced to 10ms. 
</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="data-size-and-infra-footprint-for-the-pilot-1"></a>Data Size and Infra Footprint for the Pilot: <a class="hash-link" href="#data-size-and-infra-footprint-for-the-pilot-1" title="Direct link to heading">#</a></h3><ul><li>Total # of records: ~8 Billion</li><li>Data Size: 500+ GB</li><li>Capacity: 12 vCPUs across 3 Pinot servers (4-core/32GB) </li></ul><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="impact-summary-1"></a>Impact Summary:<a class="hash-link" href="#impact-summary-1" title="Direct link to heading">#</a></h3><ul><li>99.94% reduction in query latency (achieving 100 QPS for the same query with no extra hardware)</li><li>99.9998% reduction in data scanned/aggregated per query</li><li>Happy Customer 😃</li></ul><p><img src="https://www.datocms-assets.com/75153/1689175033-image4.png" alt="Visualization of the impact of star-tree index for a Cybersecurity use case with Apache Pinot"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="multiplayer-game-leaderboard-use-case"></a>Multiplayer Game Leaderboard Use Case<a class="hash-link" href="#multiplayer-game-leaderboard-use-case" title="Direct link to heading">#</a></h2><p>A global leader in the interactive entertainment field has an A/B Testing / Experimentation use case that tracks players’ activities to measure the player engagement on the new features being rolled out.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="the-problem-and-challenges-2"></a>The Problem and Challenges?<a class="hash-link" href="#the-problem-and-challenges-2" title="Direct link to heading">#</a></h3><p>Some of the key requirements that StarTree Cloud cluster had to meet:</p><ul><li>Throughput: = 200 QPS </li><li>Latencies: <1 second P99</li><li>Ingestion rate: 50K events/sec</li></ul><p>Given the overhead while doing complex aggregations, efficient filtering (indexes) wasn’t enough - in this case, with 1 * 4-core/32GB nodes query took 163 
milliseconds. After switching to a star-tree index, the query latency was reduced to 7ms (a reduction of 95.7%). </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="data-size-and-infra-footprint-for-the-pilot-2"></a>Data Size and Infra Footprint for the Pilot: <a class="hash-link" href="#data-size-and-infra-footprint-for-the-pilot-2" title="Direct link to heading">#</a></h3><ul><li>Total # of records: ~34 Million</li><li>Data Size: 500+ GB</li><li>Capacity: 4 vCPUs - 1 Pinot server (4-cores / 32 GB) </li></ul><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="impact-summary-2"></a>Impact Summary:<a class="hash-link" href="#impact-summary-2" title="Direct link to heading">#</a></h3><ul><li>95.70% improvement in query performance as a result of 99.9962% reduction in number of documents and entries scanned. </li></ul><p><img src="https://www.datocms-assets.com/75153/1689175176-image2.png" alt="Visualization of the impact of star-tree index for a Gaming use case with Apache Pinot"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="quick-recap-star-tree-index-performance-improvements"></a>Quick Recap: Star-Tree Index Performance Improvements<a class="hash-link" href="#quick-recap-star-tree-index-performance-improvements" title="Direct link to heading">#</a></h2><p><img src="https://www.datocms-assets.com/75153/1689175271-image3.png" alt="Recap Table of the Impact that star-tree index had on three real-world customers using Apache Pinot™"></p><ul><li>99.99% reduction in data scanned/aggregated per query</li><li>95 to 99% improvement in query performance</li></ul><p>Disk IO is the most expensive operation in query processing. The star-tree index reduces Disk IO significantly. Instead of scanning raw documents from the disk and computing aggregates on the fly, star-tree index scans pre-aggregated documents for the combination of dimensions specified in the query from the disk. 
</p><p>In part 1 of the series, we saw that the star-tree index reduced the disk reads by 99.999% from 584 Million entries (in case of an inverted index) to 2,045. Query latency came down 99.67% from 1,513 ms to 4 ms! This, in itself, was a HUGE benefit. </p><p>In addition to the drastic improvement in query latency, the memory and CPU usage decreased significantly, freeing up resources for taking up more concurrent workloads. The cumulative effect was a 126x increase in QPS on this small 4 vCPU Pinot Server, as we saw in part 2 of this blog series. </p><p>And finally, in part 3 of this blog series, we covered three real production use cases that have seen 95% to 99% improvement in query performance using Star-Tree Index.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="intrigued-by-what-youve-read"></a>Intrigued by What You’ve Read?<a class="hash-link" href="#intrigued-by-what-youve-read" title="Direct link to heading">#</a></h2><p>The next step is to load your data into an open-source <a href="https://docs.pinot.apache.org/basics/getting-started" target="_blank" rel="noopener noreferrer">Apache Pinot</a> cluster or, if you prefer, a fully-managed database-as-a-service (DBaaS). Sign up today for a <a href="https://startree.ai/saas-signup" target="_blank" rel="noopener noreferrer">StarTree Cloud account</a>, free for 30 days. 
If you have more questions, sign up for the <a href="https://communityinviter.com/apps/startreedata/startree-community" target="_blank" rel="noopener noreferrer">StarTree Community Slack</a>.</p><p><a href="https://startree.ai/saas-signup" target="_blank" rel="noopener noreferrer">GET STARTED ON STARTREE CLOUD</a></p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/blog-post">blog post</a><a class="margin-horiz--sm" href="/blog/tags/implementing">implementing</a><a class="margin-horiz--sm" href="/blog/tags/startree-index">startree index</a><a class="margin-horiz--sm" href="/blog/tags/query-performance">query performance</a><a class="margin-horiz--sm" href="/blog/tags/ad-tech-platform">AdTech platform</a><a class="margin-horiz--sm" href="/blog/tags/reducing-latency">reducing latency</a><a class="margin-horiz--sm" href="/blog/tags/cybersecurity-threat-detection">cybersecurity threat detection</a><a class="margin-horiz--sm" href="/blog/tags/multiplayer-game-leaderboard-tracking">multiplayer game leaderboard tracking</a><a class="margin-horiz--sm" href="/blog/tags/improved-query-performance">improved query performance</a><a class="margin-horiz--sm" href="/blog/tags/cost-savings">cost savings</a><a class="margin-horiz--sm" href="/blog/tags/real-production-use-cases">real production use cases</a><a class="margin-horiz--sm" href="/blog/tags/star-tree-cloud">StarTree Cloud</a><a class="margin-horiz--sm" href="/blog/tags/realtime-analytics">realtime analytics</a><a class="margin-horiz--sm" href="/blog/tags/95-to-99-improvement">95% to 99% improvement</a></div><div class="col text--right"><a aria-label="Read more about Star-Tree 
Index in Apache Pinot - Part 3 - Understanding the Impact in Real Customer Scenarios" href="/blog/2023/07/12/star-tree-index-in-apache-pinot-part-3-understanding-the-impact-in-real-customer"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/06/01/real-time-mastodon-usage-with-apache-kafka-apache-pinot-and-streamlit">Real-Time Mastodon Usage with Apache Kafka, Apache Pinot, and Streamlit</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-06-01T00:00:00.000Z">June 1, 2023</time> · 7 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Developer Advocate</small></div></div></header><div class="markdown"><p>I recently came across a fascinating blog post written by Simon Aubury that shows <a href="https://simonaubury.com/posts/202302_mastodon_duckdb/" target="_blank" rel="noopener noreferrer">how to analyze user activity, server popularity, and language usage on Mastodon</a>, a decentralized social networking platform that has become quite popular in the last six months. </p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="the-existing-solution-kafka-connect-parquet-seaborn-and-duckdb"></a>The Existing Solution: Kafka Connect, Parquet, Seaborn and DuckDB <a class="hash-link" href="#the-existing-solution-kafka-connect-parquet-seaborn-and-duckdb" title="Direct link to heading">#</a></h2><p>To start, Simon wrote a listener to collect the messages, which he then published into Apache Kafka®. 
He then wrote a Kafka Connect configuration that consumes messages from Kafka and flushes them after every 1,000 messages into Apache Parquet files stored in an Amazon S3 bucket. </p><p>Finally, he queried those Parquet files using DuckDB and created some charts using the Seaborn library, as reflected in the architecture diagram below:</p><p><img src="https://www.datocms-assets.com/75153/1685637607-image1.png" alt="Flowchart of data collection to data processing" title="Flowchart of data collection to data processing"></p><p>Fig: <a href="https://simonaubury.com/posts/202302_mastodon_duckdb/" target="_blank" rel="noopener noreferrer">Data Collection Architecture</a></p><p>The awesome visualizations that Simon created make me wonder whether we can change what happens downstream of Kafka to make our queries even more real-time. Let’s find out!</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="going-real-time-with-apache-pinot"></a>Going Real-Time with Apache Pinot™<a class="hash-link" href="#going-real-time-with-apache-pinot" title="Direct link to heading">#</a></h2><p>Now <a href="https://startree.ai/resources/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Apache Pinot</a> comes into the picture. 
Instead of using Kafka Connect to batch Mastodon toots into groups of 1,000 messages to generate Parquet files, we can stream the data immediately and directly, toot-by-toot into Pinot and then build a real-time dashboard using Streamlit:</p><p><img src="https://www.datocms-assets.com/75153/1685637507-image4.png" alt="Data collection in Mastodon, followed by processing in Apache Kafka, Apache Pinot, and Streamlit" title="Data collection in Mastodon, followed by processing in Apache Kafka, Apache Pinot, and Streamlit"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="setup"></a>Setup<a class="hash-link" href="#setup" title="Direct link to heading">#</a></h2><p>To follow along, first clone my fork of Simon’s GitHub repository:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">git</span><span class="token plain"> clone git@github.com:mneedham/mastodon-stream.git</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">cd</span><span class="token plain"> mastodon-stream</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Then launch all of the components using Docker Compose:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker-compose</span><span class="token 
plain"> up</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="pinot-schema-and-table"></a>Pinot Schema and Table<a class="hash-link" href="#pinot-schema-and-table" title="Direct link to heading">#</a></h2><p>Similar to what Simon did with DuckDB, we’ll ingest the Mastodon events into a table. Pinot tables have a schema that’s defined in a schema file. </p><p>To come up with a schema file, we need to know the structure of the ingested events. For example:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"m_id"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">110146691030544274</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"created_at"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">1680705124</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"created_at_str"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2023 
04 05 15:32:04"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"app"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">""</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"url"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"https://mastodon.social/@Xingcat/110146690810165414"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"base_url"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"https://techhub.social"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"language"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"en"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"favourites"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">0</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"username"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Xingcat"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"bot"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tags"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"characters"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">196</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"words"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">36</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"mastodon_text"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Another, “I don’t know what this is yet,” paintings. Many, many layers that look like distressed metal or some sort of rock crosscut. Liking it so far, need to figure out what it’ll wind up being."</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Mapping these fields directly to columns is easiest and will result in a schema file that looks like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token 
property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"m_id"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"LONG"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"created_at_str"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"app"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"url"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"base_url"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"language"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 
198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"username"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"bot"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"BOOLEAN"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon_text"</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"favourites"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"words"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span 
class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"characters"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"tags"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"created_at"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"LONG"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Next up: our table config, shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"created_at"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MILLISECONDS"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span 
class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowLevel"</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon-topic"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.inputformat.avro.confluent.KafkaConfluentSchemaRegistryAvroMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.prop.format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"AVRO"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"stream.kafka.decoder.prop.schema.registry.rest.url"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://schema-registry:8081"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.prop.schema.registry.schema.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon-topic-value"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"broker:9093"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"customConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"routing"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"instanceSelectorType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"strictReplicaGroup"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 
242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The following configs represent the most important ones for ingesting Apache Avro™ messages into Pinot:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.inputformat.avro.confluent.KafkaConfluentSchemaRegistryAvroMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token property">"stream.kafka.decoder.prop.format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"AVRO"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token property">"stream.kafka.decoder.prop.schema.registry.rest.url"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://schema-registry:8081"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token property">"stream.kafka.decoder.prop.schema.registry.schema.name"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mastodon-topic-value"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The KafkaConfluentSchemaRegistryAvroMessageDecoder decoder calls the Schema Registry with the schema name to get back the schema that it will use to decode messages.</p><p>We can create the Pinot table by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network mastodon </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/pinot:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.12.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema.json 
</span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost </span><span class="token string" style="color:rgb(255, 121, 198)">"pinot-controller"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can then navigate to the table page of the Pinot UI: </p><p><a href="http://localhost:9000/#/tenants/table/mastodon_REALTIME" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/tenants/table/mastodon_REALTIME</a></p><p>Here, we’ll see the following:</p><p><img src="https://www.datocms-assets.com/75153/1685637837-image6.png" alt="Apache Pinot table config and schema" title="Apache Pinot table config and schema"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="ingest-data-into-kafka"></a>Ingest Data into Kafka<a class="hash-link" href="#ingest-data-into-kafka" title="Direct link to heading">#</a></h2><p>Now, we need to start ingesting data into Kafka. 
Simon created a script that accomplishes this for us, so we just need to indicate which Mastodon servers to query.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python mastodonlisten.py --baseURL https://data-folks.masto.host </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --public --enableKafka --quiet</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">python mastodonlisten.py --baseURL https://fosstodon.org/ </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --public --enableKafka --quiet</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">python mastodonlisten.py --baseURL https://mstdn.social/ </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --public --enableKafka --quiet</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can then check the ingestion of messages with the <a href="https://docs.confluent.io/platform/current/clients/kafkacat-usage.html" target="_blank" rel="noopener noreferrer">kcat</a> command line tool:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" 
style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">kcat -C -b localhost:9092 -t mastodon-topic </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -s </span><span class="token assign-left variable" style="color:rgb(189, 147, 249);font-style:italic">value</span><span class="token operator">=</span><span class="token plain">avro -r http://localhost:8081 -e</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="query-pinot"></a>Query Pinot<a class="hash-link" href="#query-pinot" title="Direct link to heading">#</a></h2><p>Now, let’s go to the Pinot UI to see what data we’ve got to play with:</p><p><a href="http://localhost:9000/" target="_blank" rel="noopener noreferrer">http://localhost:9000</a></p><p>We’ll see the following preview of the data in the mastodon table:</p><p><img src="https://www.datocms-assets.com/75153/1685637772-image5.png" alt="SQL Editor, query response stats, and query result in Apache Pinot" title="SQL Editor, query response stats, and query result in Apache Pinot"></p><p>We can then write a query to find the number of messages posted in the last minute:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Num toots"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">distinct</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">username</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Num users"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">distinct</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">url</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Num urls"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> mastodon</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">where</span><span class="token plain"> created_at</span><span class="token operator">*</span><span class="token number">1000</span><span class="token plain"> </span><span class="token operator">></span><span class="token plain"> ago</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'PT1M'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token number">1</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><img 
src="https://www.datocms-assets.com/75153/1685637909-image8.png" alt="Query results for toots, users, and urls" title="Query results for toots, users, and urls"></p><p>We can also query Pinot via the Python client, which we can install by running the following:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">pip </span><span class="token function" style="color:rgb(80, 250, 123)">install</span><span class="token plain"> pinotdb</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Once we’ve done that, let’s open the Python REPL and run the following code:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> pinotdb </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> connect</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> pandas </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> pd</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">conn </span><span class="token operator">=</span><span class="token plain"> connect</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">host</span><span class="token operator">=</span><span class="token string" style="color:rgb(255, 121, 198)">'localhost'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> port</span><span class="token operator">=</span><span class="token number">8099</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> path</span><span class="token operator">=</span><span class="token string" style="color:rgb(255, 121, 198)">'/query/sql'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> scheme</span><span class="token operator">=</span><span class="token string" style="color:rgb(255, 121, 198)">'http'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">curs </span><span class="token operator">=</span><span class="token plain"> conn</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">cursor</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">st</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">header</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"Daily Mastodon Usage"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">query </span><span class="token operator">=</span><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">"""</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">select count(*) as "Num toots"</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">, count(distinct(username)) as "Num users"</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">, count(distinct(url)) as "Num urls"</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">from mastodon</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">where created_at*1000 > ago('PT1M')</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">order by 1 DESC;</span></span><span class="token-line" style="color:#F8F8F2"><span class="token triple-quoted-string string" style="color:rgb(255, 121, 198)">"""</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain">curs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">execute</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">query</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">df </span><span class="token operator">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">DataFrame</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">curs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> columns</span><span class="token operator">=</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">item</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">for</span><span class="token plain"> item </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">in</span><span class="token plain"> curs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">description</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This produces the resulting DataFrame:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain"> Num toots Num users Num urls</span></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">0 552 173 192</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="streamlit"></a>Streamlit<a class="hash-link" href="#streamlit" title="Direct link to heading">#</a></h2><p>Next, we’ll create a Streamlit dashboard to package up these queries. We’ll visualize the results using Plotly, which you can install using:</p><p>pip install streamlit plotly</p><p>I’ve created a Streamlit app in the file <a href="https://github.com/mneedham/mastodon-stream/blob/main/app.py" target="_blank" rel="noopener noreferrer">app.py</a>, which you can find in the GitHub repository. Let’s have a look at the kinds of visualizations that we can generate. </p><p>First, we’ll create metrics to show the number of toots, users, and URLs in the last <em>n</em> minutes. <em>n</em> will be configurable from the app as shown in the screenshot below:</p><p><img src="https://www.datocms-assets.com/75153/1685637876-image7.png" alt="Chart of real-time Mastodon usage" title="Chart of real-time Mastodon usage"></p><p>From the screenshot, we can identify mastodon.cloud as the most active server, though it produces only 1,800 messages in 10 minutes or three messages per second. The values in green indicate the change in values compared to the previous 10 minutes.</p><p>We can also create a chart showing the number of messages per minute for the last 10 minutes:</p><p><img src="https://www.datocms-assets.com/75153/1685637945-image9.png" alt="Time of day Mastodon usage" title="Time of day Mastodon usage"></p><p>Based on this chart, we can see that we’re creating anywhere from 200–900 messages per minute. 
Part of the reason lies in the fact that the Mastodon servers sometimes disconnect our listener, and at the moment, I have to manually reconnect.</p><p>Finally, we can look at the toot length by language:</p><p><img src="https://www.datocms-assets.com/75153/1685637644-image2.png" alt="Chart of toot length by language usage" title="Chart of toot length by language usage"></p><p>We see much bigger ranges here than Simon saw in his analysis. He saw a maximum length of 200 characters, whereas we see some messages of up to 4,200 characters. </p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>We hope you enjoyed following along as we explored this fun use case for <a href="https://startree.ai/resources/what-is-real-time-analytics" target="_blank" rel="noopener noreferrer">real-time analytics</a>. As you can see, even though we’re pulling the data from many of the popular Mastodon servers, it’s still not all that much data!</p><p>Give the code a try and let us know how it goes. 
If you have any questions, feel free to <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">join us on Slack</a>, where we’ll gladly do our best to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/blog-post">blog post</a><a class="margin-horiz--sm" href="/blog/tags/analyzing-user-activity">analyzing user activity</a><a class="margin-horiz--sm" href="/blog/tags/server-popularity">server popularity</a><a class="margin-horiz--sm" href="/blog/tags/mastodon">Mastodon</a><a class="margin-horiz--sm" href="/blog/tags/kafka-connect">Kafka Connect</a><a class="margin-horiz--sm" href="/blog/tags/parquet">Parquet</a><a class="margin-horiz--sm" href="/blog/tags/seaborn">Seaborn</a><a class="margin-horiz--sm" href="/blog/tags/duck-db">DuckDB</a><a class="margin-horiz--sm" href="/blog/tags/potential">potential</a><a class="margin-horiz--sm" href="/blog/tags/apache-pinot">Apache Pinot</a><a class="margin-horiz--sm" href="/blog/tags/realtime-data-streaming">realtime data streaming</a><a class="margin-horiz--sm" href="/blog/tags/dashboard">dashboard</a><a class="margin-horiz--sm" href="/blog/tags/instructions">instructions</a><a class="margin-horiz--sm" href="/blog/tags/ingesting">ingesting</a><a class="margin-horiz--sm" href="/blog/tags/apache-avro-messages">Apache Avro messages</a><a class="margin-horiz--sm" href="/blog/tags/pinot-table">Pinot table</a><a class="margin-horiz--sm" href="/blog/tags/querying-data">querying data</a></div><div class="col text--right"><a aria-label="Read more about Real-Time Mastodon Usage with Apache Kafka, Apache Pinot, and Streamlit" 
href="/blog/2023/06/01/real-time-mastodon-usage-with-apache-kafka-apache-pinot-and-streamlit"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/05/30/how-to-ingest-streaming-data-from-kafka-to-apache-pinot">How to Ingest Streaming Data from Kafka to Apache Pinot™</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-05-30T00:00:00.000Z">May 30, 2023</time> · 9 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Barkha Herman"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Barkha Herman</a></div><small class="avatar__subtitle">Developer Advocate</small></div></div></header><div class="markdown"><p>We previously walked through getting started with <a href="https://startree.ai/resources/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Apache Pinot™</a> using batch data, and now we will learn how to ingest streaming data using Apache Kafka® topics. </p><p>As the story goes, Apache Pinot was created at LinkedIn to provide a platform that could ingest a high number of incoming events (kafka) and provide “fresh” (sub second) analytics to a large number (20+ million) of users, fast (sub second latency). 
So, really, consuming events is part of the reason why Pinot was created.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="the-obligatory-what-is-apache-pinot-and-startree-section"></a>The obligatory “What is Apache Pinot and StarTree?” section<a class="hash-link" href="#the-obligatory-what-is-apache-pinot-and-startree-section" title="Direct link to heading">#</a></h3><p><a href="https://docs.pinot.apache.org/" target="_blank" rel="noopener noreferrer">Pinot</a> is a real-time, distributed, open source, and free-to-use OLAP datastore, purpose-built to provide ultra low-latency analytics at extremely high throughput. It is open source and free to use.</p><p>How does StarTree come in? StarTree offers a <a href="https://startree.ai/saas-signup" target="_blank" rel="noopener noreferrer">fully managed version of the Apache Pinot real-time analytics system</a>, plus other tools around it that you can try for free. The system includes <a href="https://dev.startree.ai/docs/startree-enterprise-edition/startree-dataset-manager/" target="_blank" rel="noopener noreferrer">StarTree Dataset Manager</a> and <a href="https://dev.startree.ai/docs/procedures/get-started-with-thirdeye/" target="_blank" rel="noopener noreferrer">StarTree ThirdEye</a>, a UI-based data ingestion tool and a real-time anomaly detection and root cause analysis tool, respectively.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-to-install-kafka-alongside-pinot"></a>How to install Kafka alongside Pinot <a class="hash-link" href="#how-to-install-kafka-alongside-pinot" title="Direct link to heading">#</a></h2><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="prerequisite"></a>Prerequisite<a class="hash-link" href="#prerequisite" title="Direct link to heading">#</a></h3><p>Complete the steps outlined in the <a href="https://startree.ai/blog/apache-pinot-tutorial-for-getting-started-a-step-by-step-guide" target="_blank" rel="noopener noreferrer">introduction to Apache 
Pinot</a>. </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-1-install-kafka-on-your-pinot-docker-image"></a>Step 1: Install Kafka on your Pinot Docker image<a class="hash-link" href="#step-1-install-kafka-on-your-pinot-docker-image" title="Direct link to heading">#</a></h3><p>Make sure you have completed the first article in the series.</p><p>We will be installing Apache Kafka onto our already existing Pinot docker image. To start the Docker image, run the following command:</p><p>docker run -it --entrypoint /bin/bash -p 9000:9000 apachepinot/pinot:0.12.0</p><p><img src="https://www.datocms-assets.com/75153/1685462020-image1.png" alt="PowerShell 7.3.4 docker run Apache Pinot" title="PowerShell 7.3.4 docker run Apache Pinot"></p><p>We want to override the ENTRYPOINT and run Bash script within the Docker image. If you already have a container running, you can skip this step. I tend to tear down containers after use, so in my case, I created a brand new container.</p><p>Now, start each of the components one at a time like we did in the previous session:</p><p>bin/pinot-admin.sh StartZookeeper &</p><p>bin/pinot-admin.sh StartController &</p><p>bin/pinot-admin.sh StartBroker &</p><p>bin/pinot-admin.sh StartServer &</p><p>Run each of the commands one at a time. The & allows you to continue using the same Bash shell session. 
If you like, you can create different shells for each service:</p><ol><li>Get the container ID by running docker ps</li><li>Run <code>docker exec -it DOCKER_CONTAINER_ID bash</code> where DOCKER_CONTAINER_ID is the ID received from step 1.</li><li>Run the pinot-admin.sh command to start the desired service</li></ol><p>It should look like this:</p><p><img src="https://www.datocms-assets.com/75153/1685462274-image7.png" alt="Docker with container ID, Image, Command, and Created" title="Docker with container ID, Image, Command, and Created"></p><p>You can now browse to <a href="http://localhost:9000/#/zookeeper" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/zookeeper</a> to see the running cluster:</p><p><img src="https://www.datocms-assets.com/75153/1685462203-image5.png" alt="Empty Zookeeper Browser" title="Empty Zookeeper Browser"></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-2-install-kafka-on-the-docker-container"></a>Step 2: Install Kafka on the Docker container<a class="hash-link" href="#step-2-install-kafka-on-the-docker-container" title="Direct link to heading">#</a></h3><p>Next, let's install Kafka. We will be installing Kafka on the existing docker container. 
For this step, download the TAR file, extract the contents, and start Kafka.</p><p><em>Apache Kafka is an open source software platform that provides a unified, high-throughput, low-latency platform for handling real-time data feeds.</em></p><p>Use the following command to download the Kafka image:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token builtin class-name" style="color:rgb(189, 147, 249)">cd</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">..</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> https://downloads.apache.org/kafka/3.4.0/kafka_2.12-3.4.0.tgz --output kafka.tgz</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It should look like this:</p><p><img src="https://www.datocms-assets.com/75153/1685462322-image8.png" alt="Code with Apache Pinot speed results" title="Code with Apache Pinot speed results"></p><p>Note that we’ve changed the directory to keep the Kafka folder separate from the Pinot folder.</p><p>Now, let’s expand the downloaded TAR file, rename the folder for convenience, and delete the downloaded file:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 
123)">tar</span><span class="token plain"> -xvf kafka.tgz</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token function" style="color:rgb(80, 250, 123)">mv</span><span class="token plain"> kafka_2.12-3.4.0 kafka</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token function" style="color:rgb(80, 250, 123)">rm</span><span class="token plain"> -rf kafka.tgz</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It should look like this:</p><p><img src="https://www.datocms-assets.com/75153/1685462061-image2.png" alt="Code with Apache Kafka" title="Code with Apache Kafka"></p><p><img src="https://www.datocms-assets.com/75153/1685462480-image12.png" alt="Code with kafka version" title="Code with kafka version"> </p><p>Now, Kafka and Pinot reside locally on our Docker container with Pinot up and running. Let’s run the Kafka service. 
Kafka will use the existing ZooKeeper for configuration management.</p><p>Use the following command to run Kafka:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token builtin class-name" style="color:rgb(189, 147, 249)">cd</span><span class="token plain"> kafka</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./bin/kafka-server-start.sh config/server.properties</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It should look like this:</p><p><img src="https://www.datocms-assets.com/75153/1685462400-image10.png" alt="Code with cd kafka" title="Code with cd kafka"></p><p>To verify that Kafka is running, let’s look at our ZooKeeper configs by browsing to <a href="http://localhost:9000/#/zookeeper" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/zookeeper:</a></p><p><img src="https://www.datocms-assets.com/75153/1685462099-image3.png" alt="Zookeeper Browser" title="Zookeeper Browser"></p><p>You may have to refresh the page, after which you will find that many more configuration items appear than expected. These are Kafka configurations. </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-3-ingest-data-into-kafka"></a>Step 3: Ingest data into Kafka<a class="hash-link" href="#step-3-ingest-data-into-kafka" title="Direct link to heading">#</a></h3><p>In this step, we will ingest data into Kafka. We will be using Wikipedia events since they are easily accessible. 
We will use a node script to ingest the Wikipedia events, then add them to a Kafka Topic.</p><p>Let’s first create some folders like this:</p><p>cd /opt</p><p>mkdir realtime</p><p>cd realtime</p><p>mkdir events</p><p>It should look like this:</p><p><img src="https://www.datocms-assets.com/75153/1685462150-image4.png" alt="Code with realtime" title="Code with realtime"></p><p>You may have to start a new PowerShell window and connect to Docker for this. Now, let’s install Node.js and any dependencies we might need for the event consumption script:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -fsSL https://deb.nodesource.com/setup_14.x </span><span class="token operator">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">bash</span><span class="token plain"> -</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token function" style="color:rgb(80, 250, 123)">apt</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">install</span><span class="token plain"> nodejs</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Node.js takes a few minutes to install. Next, we will create a script to consume the events called wikievents.js. 
Cut and paste the following code to this file:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI javascript"><pre tabindex="0" class="prism-code language-javascript codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">var</span><span class="token plain"> </span><span class="token maybe-class-name">EventSource</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">require</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"eventsource"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">var</span><span class="token plain"> fs </span><span class="token operator">=</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">require</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"fs"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">var</span><span class="token plain"> path </span><span class="token operator">=</span><span class="token plain"> 
</span><span class="token function" style="color:rgb(80, 250, 123)">require</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"path"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">const</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span><span class="token maybe-class-name">Kafka</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">require</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"kafkajs"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">var</span><span class="token plain"> url </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"https://stream.wikimedia.org/v2/stream/recentchange"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">const</span><span class="token plain"> kafka </span><span class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">new</span><span class="token plain"> </span><span class="token class-name">Kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token literal-property property">clientId</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikievents"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token literal-property property">brokers</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token string" style="color:rgb(255, 121, 198)">"localhost:9092"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">const</span><span class="token plain"> producer </span><span class="token operator">=</span><span class="token plain"> kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">producer</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">async</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">function</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">start</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword control-flow" style="color:rgb(189, 147, 249);font-style:italic">await</span><span class="token plain"> producer</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">connect</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">startEvents</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">function</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">startEvents</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token console class-name">console</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">log</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token template-string template-punctuation string" style="color:rgb(255, 121, 198)">`</span><span class="token template-string string" style="color:rgb(255, 121, 198)">Connecting to EventStreams at </span><span class="token template-string interpolation interpolation-punctuation punctuation" style="color:rgb(248, 248, 242)">${</span><span class="token template-string interpolation">url</span><span class="token template-string interpolation interpolation-punctuation punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token template-string template-punctuation string" style="color:rgb(255, 121, 198)">`</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">var</span><span class="token plain"> 
eventSource </span><span class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">new</span><span class="token plain"> </span><span class="token class-name">EventSource</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">url</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> eventSource</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method-variable function-variable method function property-access" style="color:rgb(80, 250, 123)">onopen</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">function</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token console class-name">console</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">log</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"--- Opened connection."</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> eventSource</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method-variable function-variable method function property-access" style="color:rgb(80, 250, 123)">onerror</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">function</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token parameter">event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token console class-name">console</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">error</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"--- Encountered error"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" 
style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> eventSource</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method-variable function-variable method function property-access" style="color:rgb(80, 250, 123)">onmessage</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">async</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">function</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token parameter">event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">const</span><span class="token plain"> data </span><span class="token operator">=</span><span class="token plain"> </span><span class="token known-class-name class-name">JSON</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">parse</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">const</span><span class="token plain"> eventPath </span><span class="token operator">=</span><span class="token plain"> path</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">join</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">__dirname</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"./events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">wiki</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">existsSync</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">eventPath</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token operator">||</span><span class="token plain"> fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">mkdirSync</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">eventPath</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">writeFileSync</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">path</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">join</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">eventPath</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">meta</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">id</span><span class="token plain"> </span><span class="token operator">+</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">".json"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword control-flow" style="color:rgb(189, 147, 
249);font-style:italic">await</span><span class="token plain"> producer</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token method function property-access" style="color:rgb(80, 250, 123)">send</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token literal-property property">topic</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikipedia-events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token literal-property property">messages</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token literal-property property">key</span><span class="token operator">:</span><span class="token plain"> data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">meta</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
</span><span class="token literal-property property">value</span><span class="token operator">:</span><span class="token plain"> event</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token property-access">data</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token function" style="color:rgb(80, 250, 123)">start</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You can use vi to create the file and save it. You can also use Docker Desktop to edit the file.</p><p>To install the two modules referenced in the file above, kafkajs and eventsource, run the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">npm</span><span class="token plain"> i eventsource kafkajs</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Let’s run the program. This will result in the download of many files, so I recommend running the program for just a few minutes. 
You can stop the run by using Ctrl-C.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">node</span><span class="token plain"> wikievents.js</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Once you have stopped the program with Ctrl-C, navigate to the events folder. You will see new folders, one per wiki (for example, enwiki), each containing the events downloaded from Wikipedia.</p><p><img src="https://www.datocms-assets.com/75153/1685462366-image9.png" alt="Wikievents node in code" title="Wikievents node in code"></p><p>Navigate to the enwiki folder and review some of the downloaded JSON files.</p><p><img src="https://www.datocms-assets.com/75153/1685462441-image11.png" alt="Code with realtime wikievents" title="Code with realtime wikievents"></p><p>At http://localhost:9000/#/zookeeper, you can find the Kafka topic by locating the ZooKeeper config and expanding config &gt; topics. You may have to refresh your browser.</p><p><img src="https://www.datocms-assets.com/75153/1685462510-image13.png" alt="Zookeeper browser in Apache Pinot topics" title="Zookeeper browser in Apache Pinot topics"></p><p>Here, you should see the wikipedia-events topic that we created using the Node.js script. So far, so good.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-4-connect-kafka-to-pinot"></a>Step 4: Connect Kafka to Pinot<a class="hash-link" href="#step-4-connect-kafka-to-pinot" title="Direct link to heading">#</a></h3><p>With Kafka installed and configured to receive events, we can connect it to Pinot. 
</p><p>To create a real-time table in Pinot that can consume the Kafka topic, create a schema and a table configuration. The schema is very much like the one that we created for our batch example. You can use vi to create a file named realtime.schema.json and copy and paste the content below.</p><p>Here’s the JSON for the wikievents schema:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikievents"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"id"</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wiki"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"user"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"title"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"comment"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"stream"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"domain"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span 
class="token string" style="color:rgb(255, 121, 198)">"topic"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"type"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"metaJson"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span 
class="token string" style="color:rgb(255, 121, 198)">"timestamp"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"LONG"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Creating the table config 
file is where the magic happens. Use vi (or your favorite editor) to create realtime.tableconfig.json and copy and paste the following content:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikievents_REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"timestamp"</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikievents"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"broker"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">"DefaultTenant"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"server"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DefaultTenant"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tagOverrideConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"invertedIndexColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"rangeIndexColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"autoGeneratedInvertedIndex"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"createInvertedIndexDuringSegmentGeneration"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"sortedColumn"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"bloomFilterColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"wikipedia-events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"localhost:9092"</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"realtime.segment.flush.threshold.rows"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"0"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"realtime.segment.flush.threshold.time"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"24h"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"realtime.segment.flush.segment.size"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"100M"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"noDictionaryColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"onHeapDictionaryColumns"</span><span class="token operator">:</span><span class="token 
plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"varLengthDictionaryColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"enableDefaultStarTree"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"enableDynamicStarTreeCreation"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"aggregateMetrics"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"nullHandlingEnabled"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"quota"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"routing"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"query"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"metaJson"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"JSONFORMAT(meta)"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"id"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"JSONPATH(metaJson, '$.id')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"stream"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"JSONPATH(metaJson, '$.stream')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"domain"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"JSONPATH(metaJson, '$.domain')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"topic"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"JSONPATH(metaJson, '$.topic')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"isDimTable"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Notice the section called streamConfigs, where we define the source as a Kafka stream, located at localhost:9092, and consume the topic wikipedia-events. 
That’s all it takes to consume a Kafka Topic into Pinot.</p><p>Don’t believe me? Give it a try!</p><p>Create the table by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">/opt/pinot/bin/pinot-admin.sh AddTable -schemaFile /opt/realtime/realtime.schema.json -tableConfigFile /opt/realtime/realtime.tableconfig.json -exec</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Now, browse to the following location <a href="http://localhost:9000/#/tables" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/tables</a>, and you should see the newly created table. However, where’s the real-time data, you say?</p><p>Run the node wikievents.js command, then query the newly created wikievents table to see the totalDocs increase in real time:</p><p><img src="https://www.datocms-assets.com/75153/1685462248-image6.png" alt="Apache Pinot query console" title="Apache Pinot query console"></p><p>To avoid running out of space on your computer, make sure to stop the wikievents.js script when you’re done :-D</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="conclusion"></a>Conclusion<a class="hash-link" href="#conclusion" title="Direct link to heading">#</a></h2><p>Congratulations! Using only the table config, we simultaneously consumed Kafka topics directly into Pinot tables and queried events. We also transformed JSON to map to the Pinot table. In the transformConfigs portion of the Pinot table config file, we consumed the nested block meta into a field called metaJson. 
In the subsequent steps, we referenced the metaJson field with jsonPath to extract fields such as id, stream, domain, and topic. </p><p>Not only does Pinot support easy ingestion from Kafka topics, but it also provides a robust way to transform JSON to OLAP tables. </p><p>In summary, we have:</p><ul><li>Installed and run Kafka</li><li>Consumed events from Wikipedia into Kafka</li><li>Created a real-time table schema and a table in Pinot</li><li>Streamed events from Wikipedia into Pinot tables via Kafka topics</li><li>Run multiple queries</li><li>Performed JSON transformations</li></ul><p>In some upcoming blog posts, we will explore more advanced topics, such as indexes and transformations, not to mention real-time anomaly detection with <a href="https://dev.startree.ai/docs/procedures/get-started-with-thirdeye/" target="_blank" rel="noopener noreferrer">ThirdEye</a>.</p><p>In the meantime, run more queries, load more data, and don’t forget to <a href="https://dev.startree.ai/slack-invite" target="_blank" rel="noopener noreferrer">join the community Slack for support</a> if you get stuck or would like to request a topic for me to write about—you know where to find us!</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/kafka">kafka</a><a class="margin-horiz--sm" href="/blog/tags/streaming">streaming</a><a class="margin-horiz--sm" href="/blog/tags/json">json</a></div><div class="col text--right"><a aria-label="Read more about How to Ingest Streaming Data from Kafka to Apache Pinot™" href="/blog/2023/05/30/how-to-ingest-streaming-data-from-kafka-to-apache-pinot"><b>Read More</b></a></div></footer></article><article 
class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/05/23/change-data-capture-with-apache-pinot-how-does-it-work">Change Data Capture with Apache Pinot - How Does It Work?</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-05-23T00:00:00.000Z">May 23, 2023</time> · 10 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Hubert Dulay"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Hubert Dulay</a></div><small class="avatar__subtitle">Developer Advocate</small></div></div></header><div class="markdown"><p>Change Data Capture (CDC) is the process of capturing and communicating changes made to records in a data store, including INSERT, UPDATE, and DELETE transactions. </p><p>CDC implementations vary across different types of transactional databases, whether SQL or NoSQL. However, the means to ingest and analyze that data in <a href="https://startree.ai/resources/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Apache Pinot™</a> will generally remain the same.</p><p>As your applications interact with their data stores, the data stores automatically log each transaction in a construct called a write-ahead log (WAL) in real time. In fact, each transaction reflects an event that has been recorded, naturally giving the WAL event streaming properties. This approach is typically used by relational OLTP databases like PostgreSQL. </p><p>NOTE: NoSQL databases also have the ability to perform CDC but may use other mechanisms than a WAL. 
CDC for NoSQL databases is outside the scope of this post.</p><p>The WAL is an append-only, immutable stream of events designed to replicate its data to another instance of the data store for high availability in disaster recovery scenarios (see diagram below). The transactions occurring on the left data store (primary) get replicated to the data store to the right (secondary). The applications connect to the primary data store, which replicates its data to the secondary data store. If the primary data store goes down, the applications switch to the secondary data store.</p><p><img src="https://www.datocms-assets.com/75153/1684857872-image3.png" alt="Primary data store transactions being replicated to a secondary data store" title="Primary data store transactions being replicated to a secondary data store"></p><p>The following diagram shows an example of a WAL in a data store. New transactions get appended to the end of the WAL. The old transactions are on the left, and the newer transactions are on the right.</p><p><img src="https://www.datocms-assets.com/75153/1684857250-image5.png" alt="WAL in a data store with new transactions appended to the end of the WAL" title="WAL in a data store with new transactions appended to the end of the WAL"></p><p>Change data capture enables you to listen to this WAL by capturing these transactions and sending them downstream for processing. The data processing occurs in a different system where we can view the latest version of each record in other applications. Because of the real-time nature of the data, the applications subscribing to the stream of transactions receive real-time transaction events.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="pre-image-post-image-or-diffs"></a>Pre-Image, Post-Image, or Diffs?<a class="hash-link" href="#pre-image-post-image-or-diffs" title="Direct link to heading">#</a></h2><p>An important consideration for CDC is what specific elements of change it captures. 
Not all CDC implementations are the same. Some provide only the <em>post-image</em> — the complete state to which the record changes after an update. Some only provide the <em>diffs</em> (or <em>deltas</em>) — the specific changes made to the record at the time of the update, not the complete current state of the record. And others can provide the pre-image as well — what the state of the record was before the changes were applied.</p><p>Different transactional databases may only provide one or two of these elements. Usually, a database will provide the complete post-image or the diffs (or deltas) to the record. In other cases, a CDC implementation might provide all three data elements — pre-, post-, <em>and</em> diffs. It is very important for you to understand what specific CDC data elements your transactional database provides because of how it limits the kind of analytics you can perform.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-to-capture-change-data-with-debezium"></a>How to Capture Change Data with Debezium<a class="hash-link" href="#how-to-capture-change-data-with-debezium" title="Direct link to heading">#</a></h2><p>Capturing change events requires specific knowledge of the database in which the changes are occurring, and there are many transactional databases. Debezium, an open source project, provides a set of connectors that can subscribe to WALs in many different data stores, such as PostgreSQL, SQL Server, and MongoDB. Their implementation involves the Kafka Connect framework, an open source framework that enables integrations to Apache Kafka®. Two types of connectors exist: source and sink. Debezium connectors are source-only connectors.</p><p>Kafka connectors must run in a Kafka Connect cluster, a highly available and distributed system for running connectors. Kafka connectors cannot run on their own and require a server. 
The Debezium project provides a Debezium server that can also run Debezium connectors capable of writing to other event streaming platforms besides Kafka, for instance, Amazon Kinesis. The diagram below shows a Debezium connector reading the WAL and writing to a Debezium server. The Debezium server can then write to either Kafka or Kinesis.</p><p><img src="https://www.datocms-assets.com/75153/1684857201-image4.png" alt="Diagram showing a Debezium connector reading the WAL and writing to a Debezium server" title="Diagram showing a Debezium connector reading the WAL and writing to a Debezium server"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="debezium-data-format"></a>Debezium Data Format<a class="hash-link" href="#debezium-data-format" title="Direct link to heading">#</a></h2><p>For details on the Debezium format, <a href="https://debezium.io/documentation/reference/stable/tutorial.html" target="_blank" rel="noopener noreferrer">check out the tutorial</a>. Below, you’ll find an example of a transaction event encoded in JSON coming from the Debezium connector.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schema"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain">...</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"payload"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"before"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"user_id"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">1004</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"first_name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Anne"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"last_name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Kretchmar"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"email"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">"annek@noanswer.org"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"after"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"user_id"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">1004</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"first_name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Anne Marie"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"last_name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Kretchmar"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"email"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token 
string" style="color:rgb(255, 121, 198)">"annek@noanswer.org"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"source"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"version"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2.2.0.Final"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"dbserver1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"server_id"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">223344</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ts_sec"</span><span class="token operator">:</span><span class="token plain"> 
</span><span class="token number">1486501486</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"gtid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token null keyword" style="color:rgb(189, 147, 249);font-style:italic">null</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"file"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"mysql-bin.000003"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"pos"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">364</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"row"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"snapshot"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token null keyword" style="color:rgb(189, 147, 249);font-style:italic">null</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"thread"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">3</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"db"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"inventory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"table"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"customers"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"op"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"u"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ts_ms"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">1486501486308</span><span class="token 
plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>A few elements to note:</p><ul><li><p>The schema element never changes and defines the schema of the payload</p></li><li><p>The payload element holds the change data, including these three elements:</p><ul><li>before: shows the state of the record before it was changed; if this is null, then you can assume that the transaction is an INSERT</li><li>after: shows the state of the record after the record was changed; if this is null, then you can assume that the transaction is a DELETE</li><li>source: constitutes metadata that describes the source of the data</li></ul></li><li><p>The op element, also found in the payload, defines the type of operation</p><ul><li><p>Values:</p><ul><li>c for CREATE (or INSERT)</li><li>r for READ (in the case of a snapshot)</li><li>u for UPDATE</li><li>d for DELETE</li></ul></li></ul></li><li><p>The ts_ms element, also found in the payload, refers to the timestamp in milliseconds of when the transaction occurred</p></li></ul><p>In the op element, a value of r indicates that the record originated from a snapshot of the entire table in the data store. When the Debezium connector first starts, you could encounter existing records. You can configure the connector to first take a snapshot of the entire table to send as events downstream to its eventual destination. This will affect the treatment of records in the destination, in our case, Apache Pinot.</p><p>In Apache Pinot, we will have to create a schema that corresponds to the Debezium format. This could be defined in a number of ways. 
I chose to bring the contents of the after field up to the top level so users can access the latest values for any customer. I also kept the op at the top level. Since there are no metrics, that section of the schema is an empty array. I also preserved the after and before fields. Notice they are of type STRING. In Apache Pinot, you can assign a JSON index to any field containing multi-level JSON data. Apache Pinot will index all the values in the JSON payload so that any query referencing data in those JSON fields will be fast. This will allow users to see previous values of the record in cases where the operation was a change. Lastly, I have a date time field to indicate when the last change was made. </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"customers"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"user_id"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"first_name"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"last_name"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"email"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"op"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span 
class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"before"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"after"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"source"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts_ms"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"LONG"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"primaryKeyColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"user_id"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You may have an alternative schema depending on your use case. You don’t need any of the fields I preserved. If at the end you only want the latest version, you can do that easily by only preserving the columns that matter to you.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="materialized-views"></a>Materialized Views<a class="hash-link" href="#materialized-views" title="Direct link to heading">#</a></h2><p>When looking up your record in Pinot, you only need to provide a WHERE clause with the primary key. 
Pinot will only return one record—the latest version of the record, not the history of the record—as a true materialized view should. Otherwise, you would have to provide more logic in the SQL statement that selects for the latest record. This adds latency to the query and may make downstream aggregations less accurate. Pinot provides a materialized view by implementing upsert for real-time tables with a primary key.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="upsert-in-apache-pinot"></a>Upsert in Apache Pinot<a class="hash-link" href="#upsert-in-apache-pinot" title="Direct link to heading">#</a></h2><p>Unlike any other real-time OLAP, <a href="https://docs.pinot.apache.org/basics/data-import/upsert" target="_blank" rel="noopener noreferrer">Pinot offers native support for upsert</a> for real-time ingestion. Upsert logic says, “If the record exists, update it or otherwise insert it.” </p><p>You need upsert capabilities for dimensional data to simply SELECT for the record’s primary key when retrieving it. Without upsert, you will need to find the latest version of a record by comparing the latest timestamps, which leaves room for error. </p><p>This JSON document shows a schema snippet in Pinot that contains a primaryKeyColumns property. By applying this property, Pinot automatically enables the upsert feature. 
Upsert is completely transparent to the sender and therefore no specific programming is required.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"primaryKeyColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token string" style="color:rgb(255, 121, 198)">"user_id"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You can further configure the behavior of the upsert to allow for different behaviors: FULL or PARTIAL.</p><p>A FULL upsert means that a new record will replace the older record completely if they share the same primary key.</p><p>PARTIAL only allows updates to specific columns and employs additional strategies.</p><p><img src="https://www.datocms-assets.com/75153/1684857317-image6.png" alt="Table describing the strategy and descriptions of stream ingestion with upsert" title="Table describing the strategy and descriptions of stream ingestion with upsert"></p><p>Source: <a href="https://docs.pinot.apache.org/basics/data-import/upsert" target="_blank" rel="noopener noreferrer">Stream Ingestion with Upsert</a></p><p>Here is a 
sample snippet of a table configuration containing the property that configures the upsert strategy:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"upsertConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span><span class="token property">"mode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FULL"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Upsert simplifies client queries in an extremely powerful way. More importantly, upsert assures the accuracy of any aggregations applied to updated columns, which proves especially important when the analytics lead to critical decisions. </p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>Change data capture is the best way to capture changes in a database. Other options require comparing snapshots or applying complex modified timestamp logic. 
Other solutions only emulate real-time, but change data capture embodies the only genuine real-time event streaming solution.</p><p><a href="https://debezium.io/documentation/reference/stable/index.html" target="_blank" rel="noopener noreferrer">Debezium provides many other CDC connectors</a> that you can find in their documentation. If you do not have a Kafka Connect cluster or do not use Kafka at all, you can use the Debezium server to run the CDC connectors and write to an alternative streaming system, such as Amazon Kinesis, Pub/Sub from Google Cloud, Apache® Pulsar™, Azure Event Hubs, and RabbitMQ.</p><p>Lastly, Apache Pinot enables upsert for any client sinking into it, which means the client does not need to implement upsert logic. Any client can generate a materialized view in Pinot. This makes the resulting table faster to query and provides more accurate analytics.</p><p>To try Pinot in the cloud, <a href="https://startree.ai/saas-signup" target="_blank" rel="noopener noreferrer">visit startree.ai for a free trial</a>.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/cdc">CDC</a><a class="margin-horiz--sm" href="/blog/tags/debezium">Debezium</a></div><div class="col text--right"><a aria-label="Read more about Change Data Capture with Apache Pinot - How Does It Work?" 
href="/blog/2023/05/23/change-data-capture-with-apache-pinot-how-does-it-work"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/05/18/apache-pinot-tutorial-for-getting-started-a-step-by-step-guide">Apache Pinot Tutorial for Getting Started - A Step-by-Step Guide</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-05-18T00:00:00.000Z">May 18, 2023</time> · 8 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Barkha Herman"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Barkha Herman</a></div><small class="avatar__subtitle">Developer Advocate</small></div></div></header><div class="markdown"><p>How do you get started with <a href="https://startree.ai/resources/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Apache Pinot™</a>? Good question! 
To save you the hassle of trying to tackle this on your own, here’s a handy guide that overviews all of the components that make up Pinot and how to set Pinot up.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="the-obligatory-what-is-apache-pinot-and-startree-section"></a>The Obligatory What is Apache Pinot and StarTree Section<a class="hash-link" href="#the-obligatory-what-is-apache-pinot-and-startree-section" title="Direct link to heading">#</a></h2><p><a href="https://startree.ai/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Pinot</a> is an open source, free-to-use, real-time, and distributed OLAP datastore, purpose built to provide ultra low-latency analytics at extremely high throughput.</p><p>StarTree offers a fully managed version of the Apache Pinot <a href="https://startree.ai/resources/what-is-real-time-analytics" target="_blank" rel="noopener noreferrer">real-time analytics</a> system and other tools around it, such as a real-time anomaly detection and root cause analysis tool, which you can <a href="https://startree.ai/saas-signup" target="_blank" rel="noopener noreferrer">try for free</a>.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="what-do-you-need-to-run-apache-pinot"></a>What do you need to run Apache Pinot?<a class="hash-link" href="#what-do-you-need-to-run-apache-pinot" title="Direct link to heading">#</a></h2><p>The Docker image that we will use runs multiple services. To accommodate this, we recommend at a minimum the following resources in order to run the sample:</p><ul><li>CPUs: four or more</li><li>Memory: 8 GB or more</li><li>Swap: 2 GB or more</li><li>Disk space: 10 GB or more</li></ul><p>Note: When importing custom data or event streaming, you may need more resources. 
Additionally, note that if not set, Docker will use resources from the host environment as needed and available.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="step-by-step-installation-of-apache-pinot"></a>Step-by-step installation of Apache Pinot<a class="hash-link" href="#step-by-step-installation-of-apache-pinot" title="Direct link to heading">#</a></h2><p>For this intro tutorial, we will use Docker. Alternatively, you can run Pinot locally if you wish. </p><p>The instructions use a Windows 11 computer, but they will work on Macs as well. Also note that I am using VS Code with the Docker extension installed.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-1"></a>Step 1: <a class="hash-link" href="#step-1" title="Direct link to heading">#</a></h3><p>Make sure you have <a href="https://docs.docker.com/get-docker/" target="_blank" rel="noopener noreferrer">Docker installed</a> on your machine.</p><p><em>Docker is a set of platform as a service (PaaS) products that use OS-level virtualization to deliver software in packages called containers.</em></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-2"></a>Step 2:<a class="hash-link" href="#step-2" title="Direct link to heading">#</a></h3><p>Now, let’s download the Docker image. On a Windows machine, start a new PowerShell command window. Note that this is not the same as a Windows PowerShell command window, as shown below. 
</p><p><img src="https://www.datocms-assets.com/75153/1684419409-image7.png" alt="Download Docker image on Windows with PowerShell command window" title="Download Docker image on Windows with PowerShell command window"></p><p>Use the following command to get (pull) the image we are looking for:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> pull apachepinot/pinot:0.12.0</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You can also download the latest version like so:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> pull apachepinot/pinot:latest</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Here, apachepinot is the name of the repository in Docker Hub, pinot is the name of the image, and :latest or :0.12.0 is the version for the image. 
Note that we will be using the 0.12.0 version for this blog post.</p><p><em>Docker Hub is the world’s largest repository of container images.</em> </p><p>You can verify the image was downloaded or pulled by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> images</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It should show you the image like so:</p><p><img src="https://www.datocms-assets.com/75153/1684420004-image3.png" alt="Docker images command" title="Docker images command"></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-3"></a>Step 3:<a class="hash-link" href="#step-3" title="Direct link to heading">#</a></h3><p>Let’s run a container using the Docker image that we downloaded:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run -it --entrypoint /bin/bash -p </span><span class="token number">9000</span><span class="token plain">:9000 apachepinot/pinot:0.12.0</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><img src="https://www.datocms-assets.com/75153/1684420103-image4.png" alt="Running a container with downloaded Docker image" 
title="Running a container with downloaded Docker image"></p><p>The docker run command runs the image. The -p 9000:9000 option maps the Docker container port 9000 to the local machine port 9000. This makes the Pinot UI, which defaults to port 9000, accessible from localhost. We are using the --entrypoint option to override the default entrypoint and replace it with Bash. We want to override the default behavior so that we can start each component one at a time. The next parameter, apachepinot/pinot:0.12.0, is the Docker image we pulled above.</p><p>After running the command, we’ll find ourselves in the Docker container instance running a Bash shell. We can use ls to list the contents of the Docker container as shown above.</p><p>If you’re using VS Code, with the Docker extension installed, you can click on the Docker extension and see our container and its content:</p><p><img src="https://www.datocms-assets.com/75153/1684421493-image11.png" alt="VS Code Docker extension open to see container and content" title="VS Code Docker extension open to see container and content"></p><p>Click on the Docker icon in the left menu, and then on apachepinot/pinot:0.12.0. This should take a few seconds to connect to the running container. Now, you can navigate to the files and see what we have under the opt folder.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-4"></a>Step 4:<a class="hash-link" href="#step-4" title="Direct link to heading">#</a></h3><p>Let’s run the components that are essential to running a Pinot cluster. 
Change directory to the bin folder and list the contents like so:</p><p><img src="https://www.datocms-assets.com/75153/1684421611-image10.png" alt="Running components, directory changed to bin folder and contents listed" title="Running components, directory changed to bin folder and contents listed"></p><p>In order to start the Pinot cluster, we will need to run the following essential components:</p><ul><li>Apache ZooKeeper™</li><li>Controller</li><li>Broker</li><li>Server</li></ul><p>Start ZooKeeper using the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh StartZookeeper </span><span class="token operator">&</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>pinot-admin.sh is a shell script for starting the various components. The & allows us to continue using the Bash shell. 
ZooKeeper is responsible for the configuration for the Pinot cluster and needs to be started first.</p><p>We can start the remaining components using these commands one at a time:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh StartController </span><span class="token operator">&</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh StartBroker </span><span class="token operator">&</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh StartServer </span><span class="token operator">&</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The controller controls the cluster health and coordinates with ZooKeeper for configuration and status changes. The broker is responsible for query distribution and result collation, sometimes called Scatter-Gather. Servers manage individual table segments and perform the actual read/writes. To get a better understanding of each component, read this <a href="https://startree.ai/blog/introduction-to-apache-pinot-a-beginners-guide" target="_blank" rel="noopener noreferrer">intro to Apache Pinot</a>.</p><p>At this time, we should have a running Pinot cluster. We can verify via the Pinot Data Explorer by browsing to localhost:9000. 
You should see something like this:</p><p><img src="https://www.datocms-assets.com/75153/1684419932-image2.png" alt="Pinot data explorer" title="Pinot data explorer"></p><p>What just happened?</p><p>Let’s dive in.</p><p>We have started the four essential components of Pinot; however, you will note that there is not yet any data in our fresh new instance.</p><p>Before we create a table and load data, notice the four navigation menus on the left-hand side. You can look at the cluster status, run queries, inspect ZooKeeper, or launch the Swagger endpoints for the REST API that Pinot supports.</p><p>On the cluster, we notice that we have the essentials deployed: controller, broker, and server. Currently, no tables and no minions—dispatchable components used for task management—exist. Notice also that multi-tenancy support is available in the cluster manager.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="step-5"></a>Step 5:<a class="hash-link" href="#step-5" title="Direct link to heading">#</a></h3><p>Now that we have our Apache Pinot cluster ready, let’s load some data. Of course, before we do that, we have to create a schema. 
</p><p>Let’s navigate to the folder:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token builtin class-name" style="color:rgb(189, 147, 249)">cd</span><span class="token plain"> /opt/pinot/examples/batch/baseballStats</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You will notice that there are the following files listed here:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">baseballStats_offline_table_config.json </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">baseballStats_schema.json </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">ingestionJobSpec.yaml </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">sparkIngestionJobSpec.yaml </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">rawdata</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>From the names, we can see that there is a schema file, a table config file, an ingestion job, and Apache Spark™ ingestion job files as well as a raw data folder.</p><p>The content of the schema file contains both metric and dimension like so (abbreviated):</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code 
language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"playerStint"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> … </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"baseOnBalls"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> 
</span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"playerID"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ….</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"playerName"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"baseballStats"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>To create a schema and table for the baseball stats file, run the following command from the /opt/pinot/bin folder:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh AddTable -schemaFile /opt/pinot/examples/batch/baseballStats/baseballStats_schema.json -tableConfigFile /opt/pinot/examples/batch/baseballStats/baseballStats_offline_table_config.json -exec</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You should now see the schema and table created:</p><p><img src="https://www.datocms-assets.com/75153/1684421406-image12.png" alt="Apache Pinot tables created" title="Apache Pinot tables created"></p><p>Next, we’ll want to load some data into the table that we created. We have some sample data in the folder rawdata that we can use to load. 
We will need a YAML file to perform the actual ingestion job and can use the following command to import data:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">./pinot-admin.sh LaunchDataIngestionJob -jobSpecFile /opt/pinot/examples/batch/baseballStats/ingestionJobSpec.yaml</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If you run into trouble on this step like I did, edit the ingestionJobSpec.yaml file using Docker Desktop to change the inputDirURI from a relative to an absolute path. Then rerun the above command.</p><p><img src="https://www.datocms-assets.com/75153/1684419802-image1.png" alt="Editing the .yaml file with Docker Desktop" title="Editing the .yaml file with Docker Desktop"></p><p>You should now be able to see the table has been populated like so:</p><p><img src="https://www.datocms-assets.com/75153/1684421215-image8.png" alt="Apache Pinot table populated" title="Apache Pinot table populated"></p><p>Now, let’s run some queries. From localhost:9000, select the Query Console in the left-hand menu. Then type in some of these queries:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> baseballStats </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">runs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> playerName </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> baseballStats </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> playerName </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">runs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">desc</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You should see results like so:</p><p><img src="https://www.datocms-assets.com/75153/1684421163-image6.png" alt="Apache Pinot query console" title="Apache Pinot query console"></p><p>And there you have it!</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="whats-under-the-hood"></a>What’s under the hood?<a class="hash-link" href="#whats-under-the-hood" title="Direct link to heading">#</a></h2><p>If you’re curious to go a step further and see what the segments look like and what the actual data on disk looks like, keep reading! 
In the Tables section of localhost:9000, you can scroll down to find a segment:</p><p><img src="https://www.datocms-assets.com/75153/1684421358-image9.png" alt="Apache Pinot data on disk segment" title="Apache Pinot data on disk segment"></p><p>Clicking on this gives the specifics of the segment:</p><p><img src="https://www.datocms-assets.com/75153/1684420155-image5.png" alt="Segment specifics in Pinot UI" title="Segment specifics in Pinot UI"></p><p>Pinot allows you to easily inspect your segments and tables in one easy-to-use UI. You can find what’s where and keep an eye on size, location, number of documents, etc.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="conclusion"></a>Conclusion<a class="hash-link" href="#conclusion" title="Direct link to heading">#</a></h2><p>Congratulations!</p><p>Together, we’ve:</p><ul><li>Installed and ran Apache Pinot components</li><li>Created a table schema and a table</li><li>Loaded data in a table</li><li>Ran a few queries</li><li>Explored the Pinot UI</li></ul><p>In my next article, we’ll consume event streaming data using Apache Pinot and Apache Kafka®.</p><p>In the meantime, run more queries, load more data, and don’t forget to <a href="https://communityinviter.com/apps/startreedata/startree-community" target="_blank" rel="noopener noreferrer">join the Community Slack</a> for support if you get stuck!</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/data-explorer">data explorer</a><a class="margin-horiz--sm" href="/blog/tags/getting-started">getting started</a><a class="margin-horiz--sm" href="/blog/tags/streaming">streaming</a><a class="margin-horiz--sm" 
href="/blog/tags/kafka">kafka</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot Tutorial for Getting Started - A Step-by-Step Guide" href="/blog/2023/05/18/apache-pinot-tutorial-for-getting-started-a-step-by-step-guide"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/05/16/star-tree-indexes-in-apache-pinot-part-1-understanding-the-impact-on-query-performance">StarTree Indexes in Apache Pinot Part-1 - Understanding the Impact on Query Performance</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-05-16T00:00:00.000Z">May 16, 2023</time> · 7 min read</div><div class="avatar margin-vert--md"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Sandeep Dabade"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer">Sandeep Dabade</a></div><small class="avatar__subtitle">Solutions engineer</small></div></div></header><div class="markdown"><p>Star-tree is a <a href="https://docs.pinot.apache.org/basics/indexing/star-tree-index" target="_blank" rel="noopener noreferrer">specialized index</a> in <a href="https://startree.ai/resources/what-is-apache-pinot" target="_blank" rel="noopener noreferrer">Apache Pinot™</a>. This index dynamically builds a tree structure to maintain aggregates for a group of dimensions. With star-tree Index, the query latency becomes a function of just a tree traversal with computational complexity of log(<em>n</em>).</p><p><a href="https://startree.ai/blog/a-tale-of-three-real-time-olap-databases#query" target="_blank" rel="noopener noreferrer">This comprehensive blog</a> explains in depth how the star-tree Index differs from traditional materialized views (MVs). 
In particular, read the section “Star-Tree Index: Pinot’s Intelligent Materialized View,” especially this key passage:</p><p><em>Star-Tree Index: Pinot’s Intelligent Materialized View:</em> </p><p><em>The star-tree index provides an intelligent way to build materialized views within Pinot. Traditional MVs work by fully materializing the computation for each source record that matches the specified predicates. Although useful, this can result in non-trivial storage overhead. On the other hand, the star-tree index allows us to partially materialize the computations and provide the ability to tune the space-time tradeoff by providing a configurable threshold between pre-aggregation and data scans.</em></p><p><img src="https://www.datocms-assets.com/75153/1684246698-image5.png"></p><p>In this three-part blog series, we will compare and contrast query performance of a star-tree index with an inverted index, which most OLAP databases end up using for such queries. </p><p>In this first part, we will showcase how a star-tree index brought down standalone query latency on a sizable dataset of ~633M records from 1,513ms to 4ms — nearly 380x faster!</p><p><img src="https://www.datocms-assets.com/75153/1684246806-image7.png"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="1-the-dataset"></a>1. The Dataset:<a class="hash-link" href="#1-the-dataset" title="Direct link to heading">#</a></h2><p>We used New York City Taxi Data for this comparison. Original source: <a href="https://www.kaggle.com/c/nyc-taxi-trip-duration" target="_blank" rel="noopener noreferrer">the NYC Taxi Trip Duration competition on Kaggle</a>. Below are the high-level details about this dataset. </p><p><img src="https://www.datocms-assets.com/75153/1684246816-image6.png"></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="schema"></a>Schema:<a class="hash-link" href="#schema" title="Direct link to heading">#</a></h3><p>The dataset has 8 dimension fields and 11 metric columns as listed below. 
</p><p><img src="https://www.datocms-assets.com/75153/1684246732-image2.png"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="2-query-pattern"></a>2. Query Pattern<a class="hash-link" href="#2-query-pattern" title="Direct link to heading">#</a></h2><p>The query pattern involved slicing and dicing the data (GROUPING) BY various dimensions (Date, Month and Year), aggregating different metrics (total trips, distance and passengers count) and FILTERING BY a time range that could go as wide as 1 year.</p><p><img src="https://www.datocms-assets.com/75153/1684246872-image10.png"></p><p>Note: A key thing to note is that a single star-tree index covers a wide range of OLAP queries that comprise the dimensions, metrics and aggregate functions specified in it. </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="star-tree-index-config"></a>Star-Tree Index Config:<a class="hash-link" href="#star-tree-index-config" title="Direct link to heading">#</a></h3><p>To support the various query patterns specified above, we defined the following star-tree index.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"starTreeIndexConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionsSplitOrder"</span><span class="token operator">:</span><span 
class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"dropoff_date_str"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"dropoff_month"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"dropoff_year"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"skipStarNodeCreationForDimensions"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"functionColumnPairs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"COUNT__*"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"SUM__passenger_count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"SUM__total_amount"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"SUM__trip_distance"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"AVG__passenger_count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"AVG__total_amount"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"AVG__trip_distance"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MIN__passenger_count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MIN__total_amount"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MIN__trip_distance"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MAX__passenger_count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MAX__total_amount"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MAX__trip_distance"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"maxLeafRecords"</span><span class="token operator">:</span><span 
class="token plain"> </span><span class="token number">10000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This one star-tree index can get us insights to questions such as:</p><ul><li>How many trips were completed in a given day, month or year? </li><li>How many passengers traveled in a given day, month or year? </li><li>What is the daily / monthly / annual average trip revenue? </li><li>What is the daily / monthly / annual average trip revenue, trip duration and distance traveled? </li><li>What is the daily / monthly / annual breakdown of total number of trips, total distance traveled and total revenue generated in 2015?</li><li>And many more…</li></ul><p>We will use one such variant query for this illustration:</p><ul><li>What is the total number of trips, total distance traveled and total revenue generated by day in 2015?</li></ul><p>We used a very small infrastructure footprint for this comparison test. </p><p><img src="https://www.datocms-assets.com/75153/1687549350-screen-shot-2023-06-22-at-1-32-51-pm.png"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="4-query-results-and-stats"></a>4. 
Query Results and Stats<a class="hash-link" href="#4-query-results-and-stats" title="Direct link to heading">#</a></h2><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="iteration-1-wo-any-apache-pinot-optimizations"></a>Iteration #1: w/o any Apache Pinot optimizations:<a class="hash-link" href="#iteration-1-wo-any-apache-pinot-optimizations" title="Direct link to heading">#</a></h3><p>First, we ran the query without any optimizations offered in Apache Pinot. </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token comment" style="color:rgb(98, 114, 164)">-- Iteration #1: w/o optimizations > 120s</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> toDateTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">tpep_dropoff_datetime</span><span class="token operator">/</span><span class="token number">1000</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'yyyy-MM-dd'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Trips"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">trip_distance</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 
121, 198)">"Total distance traveled"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">passenger_count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Passengers"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">total_amount</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total Revenue"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> nyc_taxi_demo</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token operator">BETWEEN</span><span 
class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2015-01-01'</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2015-12-31'</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ASC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">1000</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This was a wide time range query (365 days). It required scanning across ~146M out of ~633M documents. 
In addition, it involved performing an expensive ToDateTime transformation on the tpep_dropoff_datetime entry in each of those ~146M documents during query time. </p><p>Result: The query took 131,425 milliseconds (~131.4s; ~2m 11s) to complete. </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="iteration-2-w-inverted-index"></a>Iteration #2: w/ Inverted Index <a class="hash-link" href="#iteration-2-w-inverted-index" title="Direct link to heading">#</a></h3><p>In this iteration, we used a derived column - dropoff_date_str - which performed the ToDateTime transformation for every record during ingestion time. Since the cardinality of this derived column was much lower (granularity was at Day level instead of milliseconds), this enabled us to use an inverted index on this column.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token comment" style="color:rgb(98, 114, 164)">-- Iteration #2: w/ Ingestion Time Transformation</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> dropoff_date_str </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token 
operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Trips"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">trip_distance</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total distance traveled"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">passenger_count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Passengers"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">total_amount</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total Revenue"</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> nyc_taxi_demo</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token operator">BETWEEN</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2015-01-01'</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2015-12-31'</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ASC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">1000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">option</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">useStarTree</span><span class="token operator">=</span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> timeoutMs</span><span class="token operator">=</span><span class="token number">20000</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><img src="https://www.datocms-assets.com/75153/1684246716-image1.png"></p><p>Result: The query completed in 1,513 milliseconds (~1.5s). Going from ~131s to ~1.5s was a BIG improvement. However, the results still took more than a second — a relatively long time for an OLAP database, especially when it faces multiple concurrent queries.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="iteration-3-w-star-tree-index"></a>Iteration #3: w/ Star-Tree Index: <a class="hash-link" href="#iteration-3-w-star-tree-index" title="Direct link to heading">#</a></h3><p>In this iteration, we ran the same query with the star-tree index enabled. 
</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token comment" style="color:rgb(98, 114, 164)">-- Iteration #3: w/ Ingestion Time Transformation + StarTree Index</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> dropoff_date_str </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Trips"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">trip_distance</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total distance 
traveled"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">passenger_count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total # of Passengers"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">sum</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">total_amount</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Total Revenue"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> nyc_taxi_demo</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token operator">BETWEEN</span><span class="token plain"> 
</span><span class="token string" style="color:rgb(255, 121, 198)">'2015-01-01'</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2015-12-31'</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Date"</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ASC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">1000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">option</span><span class="token punctuation" style="color:rgb(248, 
248, 242)">(</span><span class="token plain">useStarTree</span><span class="token operator">=</span><span class="token boolean">true</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><img src="https://www.datocms-assets.com/75153/1684246852-image9.png" alt="Query result with the star-tree index (Iteration #3)"></p><p>Result: The query completed in 4 milliseconds! Notice in particular that numDocsScanned came down from ~146M to 409! </p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="comparison"></a>Comparison:<a class="hash-link" href="#comparison" title="Direct link to heading">#</a></h3><p>Let’s take a closer look at the <a href="https://docs.pinot.apache.org/users/api/querying-pinot-using-standard-sql/response-format" target="_blank" rel="noopener noreferrer">query response stats</a> across all three iterations to understand the “how” part of this magic of indexing in Apache Pinot. </p><p><img src="https://www.datocms-assets.com/75153/1684246748-image3.png" alt="Query response stats across all three iterations"></p><ol><li><p>The dataset has 633,694,594 records (documents) spread across 130 segments. </p></li><li><p>Query Stats: </p><ol><li>Without any index optimizations (Iteration #1), the query scanned ALL 633,694,594 records (check numEntriesScannedInFilter) during processing. Also, numEntriesScannedPostFilter was 584,147,312 (numDocsScanned = ~146M). All 130 segments were processed, which was very inefficient. </li><li>With an inverted index (Iteration #2), numEntriesScannedInFilter was 0; numEntriesScannedPostFilter was 584,147,312 (numDocsScanned = ~146M), which meant that the query selectivity was low (the query had to scan a lot of records during the post-filter phase; about 92% of all records). This is an indication of when a star-tree index could help.</li><li>With a star-tree index (Iteration #3), numEntriesScannedInFilter was 0; numEntriesScannedPostFilter was only 2,045 (numDocsScanned = 409). 
The star-tree index helped improve query performance tremendously by providing high query selectivity.</li></ol></li></ol><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="5-impact-summary"></a>5. Impact Summary:<a class="hash-link" href="#5-impact-summary" title="Direct link to heading">#</a></h2><p><img src="https://www.datocms-assets.com/75153/1684246766-image4.png" alt="Impact summary of the star-tree index on documents scanned and query latency"></p><ol><li>356,968x improvement (or 99.999% drop) in the number of documents scanned, from ~146M to 409.</li><li>~378x improvement (~99.736% drop) in query latency, from 1,513 ms to 4 ms.</li></ol><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="key-benefits-of-the-star-tree-index"></a>Key Benefits of the Star-Tree Index:<a class="hash-link" href="#key-benefits-of-the-star-tree-index" title="Direct link to heading">#</a></h3><ul><li><p>User controllable: Tune space vs. time overhead.</p></li><li><p>Flexible: Create any number of indexes. The right index is chosen based on the query structure.</p></li><li><p>Transparent: Unlike traditional materialized views (MVs), users don’t need to know about the existence of a star-tree index. The same query will be accelerated with a star-tree index in place.</p></li><li><p>Dynamic: Very easy to generate a new index at any point in time.</p></li><li><p>Disk IO is the most expensive operation in query processing. Latency grows linearly with the number of disk reads a query has to perform. The star-tree index brings the number of disk reads down exponentially. </p><ul><li>In this example, the star-tree index reduced the disk reads by 99.999%, from ~584 million entries (~146 million documents or records) in the case of an inverted index to 2,045 entries (409 documents or records). Query latency came down from 1,513 ms to 4 ms! 
</li></ul></li></ul><p><a href="https://startree.ai/blog/star-tree-indexes-in-apache-pinot-part-2-understanding-the-impact-during-high-concurrency" target="_blank" rel="noopener noreferrer">In part 2 of this series,</a> we will perform throughput tests to measure the impact of star-tree index under high load.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/star-tree-index">star-tree index</a></div><div class="col text--right"><a aria-label="Read more about StarTree Indexes in Apache Pinot Part-1 - Understanding the Impact on Query Performance" href="/blog/2023/05/16/star-tree-indexes-in-apache-pinot-part-1-understanding-the-impact-on-query-performance"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/05/11/Geospatial-Indexing-in-Apache-Pinot">Geospatial Indexing in Apache Pinot</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-05-11T00:00:00.000Z">May 11, 2023</time> · 9 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p><a href="https://youtu.be/J-4iHPolZz0" target="_blank" rel="noopener noreferrer"><img 
src="https://i3.ytimg.com/vi/J-4iHPolZz0/maxresdefault.jpg" alt="Watch the video"></a></p><p>It’s been over 18 months since <a href="https://medium.com/apache-pinot-developer-blog/introduction-to-geospatial-queries-in-apache-pinot-b63e2362e2a9" target="_blank" rel="noopener noreferrer">geospatial indexes were added to Apache Pinot™</a>, giving you the ability to retrieve data based on geographic location—a common requirement in many analytics use cases. Using geospatial queries in combination with time series queries in Pinot, you can perform complex spatiotemporal analysis, such as analyzing changes in weather patterns over time or tracking the movement of objects, vehicles, or people. Pinot's support for geospatial data indexing means fast and efficient querying of large-scale, location-based datasets distributed across multiple nodes.</p><p>In that time, more indexing functionality has been added, so I wanted to take an opportunity to have a look at the current state of things.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="what-is-geospatial-indexing"></a>What is geospatial indexing?<a class="hash-link" href="#what-is-geospatial-indexing" title="Direct link to heading">#</a></h2><p>Geospatial indexing is a technique used in database management systems to store and retrieve spatial data based on its geographic location. It involves creating an index that allows for efficient querying of location-based data, such as latitude and longitude coordinates or geographical shapes.</p><p>Geospatial indexing organizes spatial data in such a way that enables fast and accurate retrieval of data based on its proximity to a specific location or geographic region. This indexing can be used to answer queries such as "What are the restaurants with a 30-minute delivery window to my current location?" 
or "What are the boundaries of this specific city or state?"</p><p>Geospatial indexing is commonly used in geographic information systems (GIS), mapping applications, and location-based services such as ride-sharing apps, social media platforms, and navigation tools. It plays a crucial role in spatial data analysis, spatial data visualization, and decision-making processes.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-do-geospatial-indexes-work-in-apache-pinot"></a>How do geospatial indexes work in Apache Pinot?<a class="hash-link" href="#how-do-geospatial-indexes-work-in-apache-pinot" title="Direct link to heading">#</a></h2><p>We can index points using <a href="https://h3geo.org/" target="_blank" rel="noopener noreferrer">H3</a>, an open source library that originated at Uber. This library provides hexagon-based hierarchical gridding. Indexing a point means that the point is translated to a geoId, which corresponds to a hexagon. Its neighbors in H3 can be approximated by a ring of hexagons. Direct neighbors have a distance of 1, their neighbors are at a distance of 2, and so on.</p><p>For example, if the central hexagon covers the Westminster area of central London, neighbors at distance 1 are colored blue, those at distance 2 are in green, and those at distance 3 are in red.</p><p><img src="https://www.datocms-assets.com/75153/1683813508-image5.png" alt="Geospatial Indexing In Apache Pinot" title="Geospatial Indexing In Apache Pinot"></p><p>Let’s learn how to use geospatial indexing with help from a dataset that captures the latest location of trains moving around the UK. We’re streaming this data into a <code>trains</code> topic in Apache Kafka®. 
Here’s one message from this stream:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">kcat -C -b localhost:9092 -t trains -c1</span><span class="token operator">|</span><span class="token plain"> jq</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trainCompany"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"CrossCountry"</span><span class="token plain">,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"atocCode"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"XC"</span><span class="token plain">,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lat"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> </span><span class="token number">50.692726</span><span class="token plain">,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lon"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> -3.5040767,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2023-03-09 10:57:11.1678359431"</span><span 
class="token plain">,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trainId"</span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"202303096771054"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’re going to ingest this data into Pinot, so let’s create a schema:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trains"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trainCompany"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trainId"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"atocCode"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span 
class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"point"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"BYTES"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The point column will store a point object that represents the current location of a train. We’ll see how that column gets populated from our table config, as shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trains"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span 
class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trains"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"fieldConfigList"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"point"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"encodingType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"RAW"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"indexType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"H3"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"properties"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"resolutions"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"7"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"noDictionaryColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token string" style="color:rgb(255, 121, 198)">"point"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
</span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"trains"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-geospatial:9093"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">"point"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STPoint(lon, lat, 1)"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The point column is populated by the following function under <code>transformConfigs</code>: </p><p><code>STPoint(lon, lat, 1)</code></p><p>In earlier versions of Pinot, you’d need to ensure that the schema included lat and lon columns, but that no longer applies. </p><p>We define the geospatial index on the point column under <code>fieldConfigList</code>. We can configure what H3 calls <a href="https://h3geo.org/docs/core-library/restable" target="_blank" rel="noopener noreferrer">resolutions</a>, which define the size of the hexagons and how many of them there are. A resolution of 7 means that there will be 98,825,150 hexagons, each covering an area of 5.16 km². We also need to add the geospatial column to <code>tableIndexConfig.noDictionaryColumns</code>.</p><p>We can go ahead and create that table using the <code>AddTable</code> command and Pinot will automatically start ingesting data from Kafka.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="when-is-the-geospatial-index-used"></a>When is the geospatial index used?<a class="hash-link" href="#when-is-the-geospatial-index-used" title="Direct link to heading">#</a></h2><p>The geospatial index is used when a WHERE clause in a query calls the StDistance, StWithin, or StContains functions.</p><p><code>ST_Distance</code></p><p>Let’s say we want to find all the trains within a 10 km radius of Westminster. We could write a query to answer this question using the STDistance function. 
The query might look like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> trainId</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> atocCode</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> trainCompany</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> ST_AsText</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">point</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> STDistance</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">point</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> toSphericalGeography</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">ST_GeomFromText</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'POINT (-0.13624 51.499507)'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> distance</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> trains </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> distance </span><span class="token operator"><</span><span class="token plain"> </span><span class="token number">10000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> distance</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> ts </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span 
class="token number">10</span><span class="token plain"> </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The results from running the query are shown below:</p><p><img src="https://www.datocms-assets.com/75153/1683813581-image1.png" alt="Sample Geospatial Indexing In Apache Pinot Query Result" title="Sample Geospatial Indexing In Apache Pinot Query Result"></p><p>Let’s now go into a bit more detail about what happens when we run the query.</p><p>The 10 km radius covers the area inside the white circle on the diagram below:</p><p><img src="https://www.datocms-assets.com/75153/1683813641-image7.png" alt="Geospatial Indexing In Apache Pinot Circle" title="Geospatial Indexing In Apache Pinot Circle"></p><p>Pinot’s query planner will first translate the distance of 10 km into a number of rings, in this case, two. It will then find all the hexagons located two rings away from the white one. Some of these hexagons will fit completely inside the white circle, and some will overlap with the circle.</p><p>If a hexagon fully fits, then we can get all the records inside this hexagon and return them. For those that partially fit, we’ll need to apply the distance predicate before working out which records to return.</p><p><code>ST_Within/ST_Contains</code></p><p>Let’s say that rather than specifying a distance, we instead want to draw a polygon and find the trains that fit inside that polygon. 
We could use either the <code>ST_Within</code> or <code>ST_Contains</code> function to answer this question.</p><p>The query might look like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> trainId</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> atocCode</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> trainCompany</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> ST_AsText</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">point</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> trains </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> StWithin</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">point</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> toSphericalGeography</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">ST_GeomFromText</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'POLYGON((</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.1296371966600418 51.508053828550544,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.1538461446762085 51.497007194317064,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.13032652437686923 51.488276935884414,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.10458670556545259 51.497003019756846,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.10864421725273131 51.50817152245844,</span></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> -0.1296371966600418 51.508053828550544))'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token number">1</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" 
style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> ts </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The results from running the query are shown below:</p><p><img src="https://www.datocms-assets.com/75153/1683813749-image4.png" alt="Sample Geospatial Indexing In Apache Pinot Query Result" title="Sample Geospatial Indexing In Apache Pinot Query Result"></p><p>If we change the query to show trains outside of a central London polygon, we’d see the following results:</p><p><img src="https://www.datocms-assets.com/75153/1683813705-image3.png" alt="Sample Geospatial Indexing In Apache Pinot Query Result" title="Sample Geospatial Indexing In Apache Pinot Query Result"></p><p>So what’s actually happening when we run this query? </p><p>The polygon covers the area inside the white shape as shown below:</p><p><img src="https://www.datocms-assets.com/75153/1683813802-image2.png" alt="Geospatial Indexing In Apache Pinot Polygon" title="Geospatial Indexing In Apache Pinot Polygon"></p><p>Pinot’s query planner will first find all the coordinates on the exterior of the polygon. It will then find the hexagons that fit within that geofence. Those hexagons get added to the potential cells list. </p><p>The query planner then takes each of those hexagons and checks whether they fit completely inside the original polygon. 
If they do, then they get added to the fully contained cells list. If we have any cells in both lists, we remove them from the potential cells list.</p><p>Next, we find the records for the fully contained cells list and those for the potential cells list. </p><p>If we are finding records that fit inside the polygon, we return those in the fully contained list and apply the STWithin/StContains predicate to work out which records to return from the potential list.</p><p>If we are finding records outside the polygon, we will create a new fully contained list, which will actually contain the records that are outside the polygon. This list contains all of the records in the database except the ones in the potential list and those in the initial fully contained list. </p><p>This one was a bit tricky for me to get my head around, so let’s just quickly go through an example. Imagine that we store 10 records in our database and our potential and fully contained lists hold the following values:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">potential </span><span class="token operator">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token number">1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token number">2</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token number">3</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">fullyContained </span><span class="token operator">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token number">4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token number">5</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token number">6</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>First, we compute newFullyContained by finding all the records not in potential: </p><p><code>newFullyContained = [4,5,6,7,8,9]</code></p><p>Then we can remove the values in fullyContained, which gives us:</p><p><code>newFullyContained = [7,8,9]</code></p><p>We will return all the records in <code>newFullyContained</code> and apply the <code>STWithin</code> or <code>StContains</code> predicate to work out which records to return from the potential list.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-do-you-know-the-index-usage"></a>How do you know the index is being used?<a class="hash-link" href="#how-do-you-know-the-index-usage" title="Direct link to heading">#</a></h2><p>We can write queries that use <code>STDistance</code>, <code>STWithin</code>, and <code>STContains</code> without using a geospatial index, but if we’ve got one defined, we’ll want the peace of mind of knowing that it’s actually being used.</p><p>You can check by prefixing a query with <code>EXPLAIN PLAN FOR</code>, which will return a list of the operators in the query plan. </p><p>If our query uses <code>STDistance</code>, we should expect to see the <code>FILTER_H3_INDEX</code> operator. 
If it uses STWithin or STContains, we should expect to see the INCLUSION_FILTER_H3_INDEX operator.</p><p>See this example query plan:</p><p><img src="https://www.datocms-assets.com/75153/1683813851-image6.png" alt="Apache Pinot Geospatial Indexing Query Plan" title="Apache Pinot Geospatial Indexing Query Plan"></p><p>The <a href="https://dev.startree.ai/" target="_blank" rel="noopener noreferrer">StarTree Developer Hub</a> contains a <a href="https://dev.startree.ai/docs/pinot/recipes/geospatial-indexing#how-do-i-check-that-the-geospatial-index-is-being-used" target="_blank" rel="noopener noreferrer">geospatial indexing guide</a> that goes through this in more detail.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>I hope you found this blog post useful and now understand how geospatial indexes work and when to use them in Apache Pinot.</p><p>Give them a try, and let us know how you get on! If you want to use, or are already using geospatial queries in Apache Pinot, we’d love to hear how — feel free to <a href="/contact-us">contact us</a> and tell us more! To help get you started, <a href="/saas-signup">sign up for a free trial of fully managed Apache Pinot</a>. 
And if you run into any technical questions, feel free to find me on the <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">StarTree Community Slack</a>.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/geospatial-indexing">geospatial indexing</a></div><div class="col text--right"><a aria-label="Read more about Geospatial Indexing in Apache Pinot" href="/blog/2023/05/11/Geospatial-Indexing-in-Apache-Pinot"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/03/30/Apache-Pinot-0-12-Consumer-Record-Lag">Apache Pinot™ 0.12 - Consumer Record Lag</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-03-30T00:00:00.000Z">March 30, 2023</time> · 5 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p><a href="https://youtu.be/JJEh_kBfJts" target="_blank" rel="noopener noreferrer"><img src="https://i3.ytimg.com/vi/JJEh_kBfJts/maxresdefault.jpg" alt="Watch the video"></a></p><p>The Apache Pinot community recently released version <a href="https://docs.pinot.apache.org/basics/releases/0.12.0" target="_blank" 
rel="noopener noreferrer">0.12.0</a>, which has lots of goodies for you to play with. I’ve been exploring and writing about those features in a series of blog posts.</p><p>This post will explore a new API endpoint that lets you check how much Pinot is lagging when ingesting from Apache Kafka.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="why-do-we-need-this"></a>Why do we need this?<a class="hash-link" href="#why-do-we-need-this" title="Direct link to heading">#</a></h2><p>A common question in the Pinot community is how to work out the consumption status of real-time tables. </p><p>This was a tricky one to answer, but Pinot 0.12 sees the addition of a new API that lets us see exactly what’s going on.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="worked-example"></a>Worked Example<a class="hash-link" href="#worked-example" title="Direct link to heading">#</a></h2><p>Let’s have a look at how it works with help from a worked example. </p><p>First, we’re going to create a Kafka topic with 5 partitions:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">exec</span><span class="token plain"> -it kafka-lag-blog kafka-topics.sh </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--bootstrap-server localhost:9092 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain">--partitions </span><span class="token number">5</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--topic events </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">--create </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’re going to populate this topic with data from a data generator, which is shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> random</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> json</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> click</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> time</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">@click</span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">command</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">@click</span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token decorator annotation punctuation" style="color:rgb(248, 248, 242)">option</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'--sleep'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> default</span><span class="token operator">=</span><span class="token number">0.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">help</span><span class="token operator">=</span><span class="token string" style="color:rgb(255, 121, 198)">'Sleep between each message'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">generate</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">sleep</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">while</span><span class="token plain"> </span><span class="token boolean">True</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ts </span><span class="token operator">=</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">now</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">strftime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"%Y-%m-%dT%H:%M:%S.%fZ"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">str</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token 
plain">uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">uuid4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> count </span><span class="token operator">=</span><span class="token plain"> random</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">randint</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token number">1000</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">print</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">json</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">dumps</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token string" style="color:rgb(255, 121, 198)">"tsString"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> time</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">sleep</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">sleep</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">if</span><span class="token plain"> __name__ </span><span class="token operator">==</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'__main__'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> generate</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can see an example of the messages generated by this script by running the following:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py --sleep </span><span class="token number">0.01</span><span class="token plain"> </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">head</span><span class="token plain"> -n3 </span><span class="token operator">|</span><span class="token plain"> jq -c</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You should see something like this:</p><div class="codeBlockContainer_J+bg"><div 
class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"2023-03-17T12:10:03.854680Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"f3b7b5d3-b352-4cfb-a5e3-527f2c663143"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"count"</span><span class="token operator">:</span><span class="token number">690</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"2023-03-17T12:10:03.864815Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"eac57622-4b58-4456-bb38-96d1ef5a1ed5"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"count"</span><span class="token operator">:</span><span class="token number">522</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"2023-03-17T12:10:03.875723Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"65926a80-208a-408b-90d0-36cf74c8923a"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token property">"count"</span><span class="token operator">:</span><span class="token number">154</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>So far, so good. Let’s now ingest this data into Kafka:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py --sleep </span><span class="token number">0.01</span><span class="token plain"> </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">jq -cr --arg sep ø </span><span class="token string" style="color:rgb(255, 121, 198)">'[.uuid, tostring] | join($sep)'</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">kcat -P -b localhost:9092 -t events -Kø</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Next we’re going to create a Pinot schema and table. First, the schema config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 
248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token 
plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And 
now, the table config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span 
class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-lag-blog:9093"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"realtime.segment.flush.threshold.rows"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"10000000"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can create both the table and schema using the <em>AddTable</em> command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network lag_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.12.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost </span><span class="token string" style="color:rgb(255, 121, 198)">"pinot-controller-lag-blog"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Now let’s call the /consumingSegmentsInfo endpoint to see what’s going on:</p><p><code>curl 
"http://localhost:9000/tables/events/consumingSegmentsInfo" 2>/dev/null | jq</code></p><p>The output of calling this endpoint is shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"_segmentToConsumingInfoMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"events__0__0__20230317T1133Z"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"serverName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Server_172.29.0.4_8098"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"consumerState"</span><span class="token operator">:</span><span class="token plain"> 
</span><span class="token string" style="color:rgb(255, 121, 198)">"CONSUMING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"lastConsumedTimestamp"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">1679052823350</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"partitionToOffsetMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"0"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"969"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"partitionOffsetInfo"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"currentOffsetsMap"</span><span class="token operator">:</span><span class="token plain"> </span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"0"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"969"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"latestUpstreamOffsetMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"0"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"969"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"recordsLagMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"0"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"0"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"availabilityLagMsMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"0"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"26"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain">…</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we look under <em>partitionOffsetInfo</em>, we can see what’s going on:</p><ul><li>currentOffsetsMap is Pinot’s current offset</li><li>latestUpstreamOffsetMap is Kafka’s offset</li><li>recordsLagMap is the record lag</li><li>availabilityLagMsMap is the time lag</li></ul><p>This output is a bit unwieldy, so let’s create a bash function to tidy up the output into something that’s easier to consume:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">function consuming_info</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> curl </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/consumingSegmentsInfo"</span><span class="token plain"> </span><span class="token number">2</span><span class="token operator">></span><span class="token operator">/</span><span class="token plain">dev</span><span class="token operator">/</span><span class="token plain">null </span><span class="token operator">|</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> jq 
</span><span class="token operator">-</span><span class="token plain">rc '</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">_segmentToConsumingInfoMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> segment</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> $k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">currentOffsetsMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> pinot</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">latestUpstreamOffsetMap 
</span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> recordLag</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">recordsLagMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token 
plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> timeLagMs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">availabilityLagMsMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain">keys_unsorted </span><span class="token operator">|</span><span class="token plain"> @tsv</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token builtin" style="color:rgb(189, 147, 249)">map</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain">@tsv</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">)</span><span class="token plain">' </span><span class="token operator">|</span><span class="token plain"> column </span><span class="token operator">-</span><span class="token plain">t</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> printf </span><span class="token string" style="color:rgb(255, 121, 198)">"\n"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Let’s call the function:</p><p><code>consuming_info</code></p><p>We’ll see the following output:</p><p><img src="https://www.datocms-assets.com/75153/1680190272-image2.png" alt="Consumer record lag output" title="Consumer record lag output"></p><p>Now let’s put it in a script and call the watch command so that it will be refreshed every couple of seconds:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token comment" style="color:rgb(98, 114, 164)">#!/bin/bash</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">function consuming_info</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> curl </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/consumingSegmentsInfo"</span><span class="token plain"> </span><span class="token number">2</span><span class="token operator">></span><span class="token operator">/</span><span class="token plain">dev</span><span class="token operator">/</span><span class="token plain">null </span><span class="token operator">|</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> jq </span><span class="token operator">-</span><span class="token plain">rc '</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">_segmentToConsumingInfoMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span 
class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> segment</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> $k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">currentOffsetsMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> pinot</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">latestUpstreamOffsetMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> recordLag</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">recordsLagMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> timeLagMs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token 
plain">partitionOffsetInfo</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">availabilityLagMsMap </span><span class="token operator">|</span><span class="token plain"> keys</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> $k </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain">$k</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain">keys_unsorted </span><span class="token 
operator">|</span><span class="token plain"> @tsv</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"> </span><span class="token operator">|</span><span class="token builtin" style="color:rgb(189, 147, 249)">map</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain">@tsv</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain">' </span><span class="token operator">|</span><span class="token plain"> column </span><span class="token operator">-</span><span class="token plain">t</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> printf </span><span class="token string" style="color:rgb(255, 121, 198)">"\n"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">export </span><span class="token operator">-</span><span class="token plain">f consuming_info</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">watch bash </span><span class="token operator">-</span><span class="token plain">c consuming_info</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Give permissions to run it as a script:</p><p><code>chmod u+x watch_consuming_info.sh</code></p><p>And finally, run it:</p><p><code>./watch_consuming_info.sh</code></p><p>This will print out a new table every two seconds. Let’s now make things more interesting by removing the sleep from our ingestion command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">jq -cr --arg sep ø </span><span class="token string" style="color:rgb(255, 121, 198)">'[.uuid, tostring] | join($sep)'</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">kcat -P -b localhost:9092 -t events -Kø</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And now if we look at the watch output:</p><p><img src="https://www.datocms-assets.com/75153/1680190286-image1.png" alt="Apache Pinot Consumer Record Lag" title="Apache Pinot Consumer Record Lag"></p><p>We get some transitory lag, but it generally goes away by the next time the command is run. 
</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>I love this feature, and it solves a problem I’ve struggled with when using my datasets. I hope you’ll find it just as useful.</p><p>Give it a try, and let us know how you get on. If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/consumer-record-lag">consumer record lag</a><a class="margin-horiz--sm" href="/blog/tags/kafka">kafka</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.12 - Consumer Record Lag" href="/blog/2023/03/30/Apache-Pinot-0-12-Consumer-Record-Lag"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/02/21/Apache-Pinot-0-12-Configurable-Time-Boundary">Apache Pinot™ 0.12 - Configurable Time Boundary</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-02-21T00:00:00.000Z">February 21, 2023</time> · 4 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener 
noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p><a href="https://youtu.be/lB3RaKJ0Hbs" target="_blank" rel="noopener noreferrer"><img src="https://i3.ytimg.com/vi/lB3RaKJ0Hbs/maxresdefault.jpg" alt="Watch the video"></a></p><p>The Apache Pinot community recently released version <a href="https://docs.pinot.apache.org/basics/releases/0.12.0" target="_blank" rel="noopener noreferrer">0.12.0</a>, which has lots of goodies for you to play with. This is the first in a series of blog posts showing off some of the new features in this release.</p><p>This post will explore the ability to configure the time boundary when working with hybrid tables.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="what-is-a-hybrid-table"></a>What is a hybrid table?<a class="hash-link" href="#what-is-a-hybrid-table" title="Direct link to heading">#</a></h2><p>A hybrid table is the term used to describe a situation where we have an offline and real-time table with the same name. 
The offline table stores historical data, while the real-time table continuously ingests data from a streaming data platform.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-do-you-query-a-hybrid-table"></a>How do you query a hybrid table?<a class="hash-link" href="#how-do-you-query-a-hybrid-table" title="Direct link to heading">#</a></h2><p>When you write a query against a hybrid table, the Pinot query engine needs to work out which records to read from the offline table and which to read from the real-time table.</p><p>It does this by computing the time boundary, determined by looking at the maximum end time of segments in the offline table and the segment ingestion frequency specified for the offline table.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI"><pre tabindex="0" class="prism-code language-undefined codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">timeBoundary = &lt;Maximum end time of offline segments&gt; - &lt;Ingestion Frequency&gt;</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The ingestion frequency can either be 1 hour or 1 day, so one of these values will be used.</p><p>When a query for a hybrid table is received by a Pinot Broker, the broker sends a time boundary annotated version of the query to the offline and real-time tables. 
Any records from or before the time boundary are read from the offline table; anything greater than the boundary comes from the real-time table.</p><p><img src="https://www.datocms-assets.com/75153/1676991003-image2.png" alt="Apache Pinot computing the time boundary" title="Apache Pinot computing the time boundary"> </p><p>For example, if we executed the following query:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> events</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The broker would send the following query to the offline table:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> events_OFFLINE</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> timeColumn </span><span class="token operator">&lt;=</span><span class="token plain"> $timeBoundary</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And the following query to the real-time table:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> events_REALTIME</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 
147, 249);font-style:italic">WHERE</span><span class="token plain"> timeColumn </span><span class="token operator">></span><span class="token plain"> $timeBoundary</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The results of the two queries are merged by the broker before being returned to the client.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="so-whats-the-problem"></a>So, what’s the problem?<a class="hash-link" href="#so-whats-the-problem" title="Direct link to heading">#</a></h2><p>If we have some overlap in the data in our offline and real-time tables, this approach works well, but if we have no overlap, we will end up with unexpected results.</p><p>For example, let’s say that the most recent timestamp in the events offline table is 2023-01-09T18:41:17, our ingestion frequency is 1 hour, and the real-time table has data starting from 2023-01-09T18:41:18.</p><p>This will result in a boundary time of 2023-01-09T17:41:17, which means that any records with timestamps between 17:41 and 18:41 will be excluded from query results.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="and-the-solution"></a>And the solution?<a class="hash-link" href="#and-the-solution" title="Direct link to heading">#</a></h2><p>The 0.12 release sees the addition of the tables/{tableName}/timeBoundary API, which lets us set the time boundary to the maximum end time of all offline segments.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X POST </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span 
class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/{tableName}/timeBoundary"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>In this case, that will result in a new boundary time of 2023-01-09T18:41:17, which is exactly what we need.</p><p>We’ll then be able to query the events table and have it read the offline table to get all records on or before 2023-01-09T18:41:17 and the real-time table for everything else.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="neat-anything-else-i-should-know"></a>Neat, anything else I should know?<a class="hash-link" href="#neat-anything-else-i-should-know" title="Direct link to heading">#</a></h2><p>Something to keep in mind when updating the time boundary is that it’s a one-off operation. 
It won’t be automatically updated if you add a new, more recent segment to the offline table.</p><p>In this scenario, you need to call the tables/{tableName}/timeBoundary API again.</p><p>And if you want to revert to the previous behavior where the time boundary is computed by subtracting the ingestion frequency from the latest end time, you can do that too:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X DELETE </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/{tableName}/timeBoundary"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>I love this feature, and it solves a problem I’ve struggled with when using my datasets. I hope you’ll find it just as useful.</p><p>Give it a try, and let us know how you get on. 
If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/hybrid-tables">hybrid tables</a><a class="margin-horiz--sm" href="/blog/tags/time-boundary">time boundary</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.12 - Configurable Time Boundary" href="/blog/2023/02/21/Apache-Pinot-0-12-Configurable-Time-Boundary"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables">Apache Pinot™ 0.11 - Deduplication on Real-Time Tables</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2023-01-29T00:00:00.000Z">January 29, 2023</time> · 8 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p>Last fall, the Apache Pinot community released version <a href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4" target="_blank" rel="noopener 
noreferrer">0.11.0</a>, which has lots of goodies for you to play with.</p><p>In this post, we’re going to learn about the <a href="https://docs.pinot.apache.org/basics/data-import/dedup" target="_blank" rel="noopener noreferrer">deduplication for the real-time tables feature</a>. </p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="why-do-we-need-deduplication-on-real-time-tables"></a>Why do we need deduplication on real-time tables?<a class="hash-link" href="#why-do-we-need-deduplication-on-real-time-tables" title="Direct link to heading">#</a></h2><p>This feature was built to deal with duplicate data in the streaming platform. </p><p>Users have previously used the upsert feature to de-duplicate data, but this has the following limitations:</p><ul><li>It forces us to keep redundant records that we don’t want to keep, which increases overall storage costs.</li><li>We can’t yet use the StarTree index with upserts, so the speed benefits we get from using that indexing technique are lost.</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-does-dedup-differ-from-upserts"></a>How does dedup differ from upserts?<a class="hash-link" href="#how-does-dedup-differ-from-upserts" title="Direct link to heading">#</a></h2><p>Both upserts and dedup keep track of multiple documents that have the same primary key. They differ as follows:</p><ul><li>Upserts are used when we want to get the latest copy of a document for a given primary key. It’s likely that some or all of the other fields will be different. Pinot stores all documents it receives, but when querying it will only return the latest document for each primary key.</li><li>Dedup is used when we know that multiple documents with the same primary key are identical. 
Only the first event received for a given primary key is stored in Pinot—any future events with the same primary key are thrown away.</li></ul><p>Let’s see how to use this functionality with help from a worked example.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="setting-up-apache-kafka-and-apache-pinot"></a>Setting up Apache Kafka and Apache Pinot<a class="hash-link" href="#setting-up-apache-kafka-and-apache-pinot" title="Direct link to heading">#</a></h2><p>We’re going to spin up Kafka and Pinot using the following Docker Compose config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI yaml"><pre tabindex="0" class="prism-code language-yaml codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token key atrule">version</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"3"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">services</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">zookeeper</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">image</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> zookeeper</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">3.8.0</span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">hostname</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> zookeeper</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">container_name</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> zookeeper</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">dedup</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">ports</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2181:2181"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">ZOOKEEPER_CLIENT_PORT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token number">2181</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">ZOOKEEPER_TICK_TIME</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token number">2000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">networks</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> dedup_blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">image</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> wurstmeister/kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">latest</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">restart</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> unless</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">stopped</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">container_name</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-dedup-blog"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"> </span><span class="token key atrule">ports</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"9092:9092"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">expose</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"9093"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">depends_on</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> zookeeper</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_ZOOKEEPER_CONNECT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> zookeeper</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">-</span><span class="token plain">dedup</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">blog</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">2181/kafka</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_BROKER_ID</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token number">0</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_ADVERTISED_HOST_NAME</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">dedup</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_ADVERTISED_LISTENERS</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> PLAINTEXT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">//kafka</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">dedup</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">blog</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token number">9093</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain">OUTSIDE</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">:</span><span class="token plain">//localhost</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token number">9092</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_LISTENERS</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> PLAINTEXT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">//0.0.0.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token number">9093</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain">OUTSIDE</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">//0.0.0.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token number">9092</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">KAFKA_LISTENER_SECURITY_PROTOCOL_MAP</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> PLAINTEXT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">PLAINTEXT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain">OUTSIDE</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">PLAINTEXT</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">networks</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> 
</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> dedup_blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">pinot-controller</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">image</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> apachepinot/pinot</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">0.11.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">arm64</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">command</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"QuickStart -type EMPTY"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">container_name</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"pinot-controller-dedup-blog"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">volumes</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span 
class="token plain"> ./config</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain">/config</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">restart</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> unless</span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain">stopped</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">ports</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"9000:9000"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">networks</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> dedup_blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">networks</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">dedup_blog</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> dedup_blog</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block">
</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can spin up our infrastructure using the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker-compose</span><span class="token plain"> up</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="data-generation"></a>Data Generation<a class="hash-link" href="#data-generation" title="Direct link to heading">#</a></h2><p>Let’s imagine that we want to ingest events generated by the following Python script:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> datetime</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> uuid</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> random</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token 
keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> json</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">while</span><span class="token plain"> </span><span class="token boolean">True</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ts </span><span class="token operator">=</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">now</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">strftime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"%Y-%m-%dT%H:%M:%S.%fZ"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">str</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">uuid4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> count </span><span class="token operator">=</span><span class="token plain"> random</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">randint</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token number">1000</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">print</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> json</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">dumps</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token string" style="color:rgb(255, 121, 198)">"tsString"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token number">3</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can view the data generated by this script by pasting the above code into a file called datagen.py and then running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">head</span><span class="token plain"> -n3 </span><span class="token operator">|</span><span class="token plain"> jq</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’ll see the following 
output:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2023-01-03T10:59:17.355081Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"f94"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">541</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token 
operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2023-01-03T10:59:17.355125Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"057"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">96</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2023-01-03T10:59:17.355141Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"d7b"</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">288</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we generate only 25,000 events, we’ll get some duplicates, which we can see by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">jq -r </span><span class="token string" style="color:rgb(255, 121, 198)">'.uuid'</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">head</span><span class="token plain"> -n25000 </span><span class="token operator">|</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">uniq</span><span class="token plain"> -cd</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB 
clean-btn">Copy</button></div></div><p>The results of running that command are shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI text"><pre tabindex="0" class="prism-code language-text codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">2 3a2</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2 a04</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2 433</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2 291</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">2 d73</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’re going to pipe this data into a Kafka stream called events, like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">python datagen.py </span><span class="token operator file-descriptor important">2</span><span class="token operator">></span><span class="token plain">/dev/null </span><span class="token operator">|</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">jq -cr --arg sep 😊 </span><span class="token string" style="color:rgb(255, 121, 198)">'[.uuid, tostring] | join($sep)'</span><span class="token plain"> </span><span class="token operator">|</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">kcat -P -b localhost:9092 -t events 
-K😊</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The construction of the key/value structure comes from Robin Moffatt’s <a href="https://rmoff.net/2020/09/30/setting-key-value-when-piping-from-jq-to-kafkacat/" target="_blank" rel="noopener noreferrer">excellent blog post</a>. Since that blog post was written, kcat has started supporting multi-byte separators, which is why we can use a smiley face to separate our key and value.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="pinot-schematable-config"></a>Pinot Schema/Table Config<a class="hash-link" href="#pinot-schematable-config" title="Direct link to heading">#</a></h2><p>Next, we’re going to create a Pinot table and schema with the same name. Let’s first define a schema:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span 
class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token 
operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Note that the timestamp field is called ts, not tsString as it is in the Kafka stream. We’re going to transform the DateTime string held in tsString into a proper timestamp using a transformation function.</p><p>Our table config is described below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 
248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-dedup-blog:9093"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FromDateTime(tsString, 
'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Let’s create the table using the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI 
bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network dedup_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/pinot/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost </span><span class="token string" style="color:rgb(255, 121, 198)">"pinot-controller-dedup-blog"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span 
class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Now we can navigate to <a href="http://localhost:9000/" target="_blank" rel="noopener noreferrer">http://localhost:9000</a> and run a query that will return a count of the number of each uuid:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> events </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> uuid</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The results of this query are shown below:</p><p><img src="https://www.datocms-assets.com/75153/1673273173-image4.png" alt="Sample Apache Pinot real-time query response stats including duplicates" title="Sample Apache Pinot real-time query response stats including duplicates"></p><p>We can see loads of duplicates! 
</p><p>Now let’s add a table and schema that uses the de-duplicate feature, starting with the schema:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events_dedup"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"primaryKeyColumns"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The main difference between this schema and the events schema is that we need to specify a primary key. This key can be any number of fields, but in this case, we’re only using the uuid field.</p><p>Next, the table config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events_dedup"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events_dedup"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token 
operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-dedup-blog:9093"</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"routing"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"instanceSelectorType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"strictReplicaGroup"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dedupConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"dedupEnabled"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">true</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"hashFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"NONE"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token 
operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The changes to notice here are:</p><ul><li>"dedupConfig": {"dedupEnabled": true, "hashFunction": "NONE"} - This enables the feature and indicates that we won’t use a hash function on our primary key.</li><li>"routing": {"instanceSelectorType": "strictReplicaGroup"} - This makes sure that all segments of the same partition are served from the same server to ensure data consistency across the segments. 
</li></ul><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network dedup_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/pinot/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema-dedup.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table-dedup.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost </span><span class="token string" style="color:rgb(255, 121, 198)">"pinot-controller-dedup-blog"</span><span class="token plain"> 
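</span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div>
<p>Now let’s run the same query against the events_dedup table:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> events_dedup</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> uuid</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><img src="https://www.datocms-assets.com/75153/1673273248-image3.png" alt="Sample Apache Pinot real-time query response stats deduplicated" title="Sample Apache Pinot real-time query response stats deduplicated"></p><p>We have every combination of hex values (16^3=4096) and no duplicates! Pinot’s de-duplication feature has done its job.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-does-it-work"></a>How does it work? 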
<a class="hash-link" href="#how-does-it-work" title="Direct link to heading">#</a></h2><p>When we’re not using the deduplication feature, events are ingested from our streaming platform into Pinot, as shown in the diagram below:</p><p><img src="https://www.datocms-assets.com/75153/1673273272-pinot_0-11-de-duplication-diagram_1-v2.png" alt="Events ingested from a streaming platform into Apache Pinot without using the deduplication feature" title="Events ingested from a streaming platform into Apache Pinot without using the deduplication feature"></p><p>When de-dup is enabled, we have to check whether records can be ingested, as shown in the diagram below:</p><p><img src="https://www.datocms-assets.com/75153/1673273289-pinot_0-11-de-duplication-diagram_2-v3.png" alt="Events ingested from a streaming platform into Apache Pinot using the deduplication feature" title="Events ingested from a streaming platform into Apache Pinot using the deduplication feature"></p><p>De-dup works out whether a primary key has already been ingested by using an in-memory map of (primary key -&gt; corresponding segment reference).</p><p>We need to take that into account when using this feature; otherwise, we’ll end up using all the available memory on the Pinot Server. Below are some tips for using this feature:</p><ul><li>Try to use a simple primary key type and avoid composite keys. If you don’t have a simple primary key, consider using one of the available hash functions to reduce the space taken up.</li><li>Create more partitions in the streaming platform than you might otherwise create. The number of partitions in the stream determines the number of partitions in the Pinot table. 
The more partitions you have in the streaming platform, the more Pinot servers you can distribute the Pinot table to, and the more horizontally scalable the table will be.</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>This feature makes it easier to ensure that we don’t end up with duplicate data in our Apache Pinot estate. </p><p>So give it a try and let us know how you get on. If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p><p>And if you’re interested in how this feature was implemented, you can look at the <a href="https://github.com/apache/pinot/pull/8708" target="_blank" rel="noopener noreferrer">pull request on GitHub</a>.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/deduplication">deduplication</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.11 - Deduplication on Real-Time Tables" href="/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/11/28/Apache-Pinot-Pausing-Real-Time-Ingestion">Apache Pinot™ 0.11 - Pausing Real-Time Ingestion</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-11-28T00:00:00.000Z">November 28, 2022</time> · 7 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" 
target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p><a href="https://youtu.be/u9CwDpMZRog" target="_blank" rel="noopener noreferrer"><img src="https://i3.ytimg.com/vi/u9CwDpMZRog/maxresdefault.jpg" alt="Watch the video"></a></p><p>The Apache Pinot community recently released version <a href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4" target="_blank" rel="noopener noreferrer">0.11.0</a>, which has lots of goodies for you to play with.</p><p>In this post, we will learn about a feature that lets you pause and resume real-time data ingestion. Sajjad Moradi has <a href="https://medium.com/apache-pinot-developer-blog/pause-stream-consumption-on-apache-pinot-772a971ef403" target="_blank" rel="noopener noreferrer">also written a blog post about this feature</a>, so you can treat this post as a complement to that one.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-does-real-time-ingestion-work"></a>How does real-time ingestion work?<a class="hash-link" href="#how-does-real-time-ingestion-work" title="Direct link to heading">#</a></h2><p>Before we get into this feature, let’s first recap how real-time ingestion works.</p><p>This only applies to tables that have the REALTIME type. These tables ingest data that comes in from a streaming platform (e.g., Kafka). </p><p>Pinot servers ingest rows into consuming segments that reside in volatile memory. 
</p><p>Once a segment reaches the <a href="https://dev.startree.ai/docs/pinot/recipes/configuring-segment-threshold" target="_blank" rel="noopener noreferrer">segment threshold</a>, it will be persisted to disk as a completed segment, and a new consuming segment will be created. This new segment takes over the ingestion of new events from the streaming platform.</p><p>The diagram below shows what things might look like when we’re ingesting data from a Kafka topic that has 3 partitions:</p><p><img src="https://www.datocms-assets.com/75153/1669733133-pinot_0-11-realtime_injestion-diagram-v1.png" alt="Apache Pinot 0.11 Real Time Data Ingestion" title="Apache Pinot 0.11 Real Time Data Ingestion"></p><p>A table has one consuming segment per partition but would have many completed segments.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="why-do-we-need-to-pause-and-resume-ingestion"></a>Why do we need to pause and resume ingestion?<a class="hash-link" href="#why-do-we-need-to-pause-and-resume-ingestion" title="Direct link to heading">#</a></h2><p>There are many reasons why you might want to pause and resume ingestion of a stream. Some of the common ones are described below:</p><ul><li>There’s a problem with the underlying stream, and we need to restart the server, reset offsets, or recreate a topic.</li><li>We want to ingest data from different streams into the same table.</li><li>We made a mistake in our ingestion config in Pinot, and it’s now throwing exceptions and isn’t able to ingest any more data.</li></ul><p>The 0.11 release adds the following REST API endpoints:</p><ul><li>/tables/{tableName}/pauseConsumption</li><li>/tables/{tableName}/resumeConsumption</li></ul><p>As the names suggest, these endpoints can be used to pause and resume streaming ingestion for a specific table. 
This release also adds the /tables/{tableName}/pauseStatus endpoint, which returns the pause status for a table.</p><p>Let’s see how to use this functionality with help from a worked example.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="data-generation"></a>Data Generation<a class="hash-link" href="#data-generation" title="Direct link to heading">#</a></h2><p>Let’s imagine that we want to ingest events generated by the following Python script:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI python"><pre tabindex="0" class="prism-code language-python codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> datetime</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> uuid</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> random</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">import</span><span class="token plain"> json</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">while</span><span class="token plain"> </span><span class="token boolean">True</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ts </span><span class="token operator">=</span><span class="token plain"> datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">datetime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">now</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">strftime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">"%Y-%m-%dT%H:%M:%S.%fZ"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">str</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">uuid</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">uuid4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> count </span><span class="token operator">=</span><span class="token plain"> random</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">randint</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token number">0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token number">1000</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">print</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> json</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">dumps</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token string" style="color:rgb(255, 121, 198)">"tsString"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(189, 147, 249)">id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span 
class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We can view the data generated by this script by pasting the above code into a file called datagen.py and then running the following command:</p><p>python datagen.py 2>/dev/null | head -n3 | jq</p><p>We’ll see the following output:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2022-11-23T12:08:44.127481Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">"e1c58795-a009-4e21-ae76-cdd66e090797"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">203</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2022-11-23T12:08:44.127531Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"4eedce04-d995-4e99-82ab-6f836b35c580"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">216</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span 
class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tsString"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"2022-11-23T12:08:44.127550Z"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"uuid"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"6d72411b-55f5-4f9f-84e4-7c8c5c4581ff"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"count"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">721</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’re going to pipe this data into a Kafka stream called ‘events’ like this:</p><p>python datagen.py | kcat -P -b localhost:9092 -t events</p><p>We’re not setting a key for these messages in Kafka for simplicity’s sake, but Robin Moffatt has an <a 
href="https://rmoff.net/2020/09/30/setting-key-value-when-piping-from-jq-to-kafkacat/" target="_blank" rel="noopener noreferrer">excellent blog post that explains how to do it</a>.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="pinot-schematable-config"></a>Pinot Schema/Table Config<a class="hash-link" href="#pinot-schematable-config" title="Direct link to heading">#</a></h2><p>We want to ingest this data into a Pinot table with the same name. Let’s first define a schema:</p><p>Schema:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dimensionFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"uuid"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"STRING"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metricFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"count"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"INT"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dateTimeFieldSpecs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"dataType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"format"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS:EPOCH"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularity"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"1:MILLISECONDS"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Note that the timestamp field is called ts and not tsString, as it is in the Kafka stream. We will transform the DateTime string value held in that field into a proper timestamp using a transformation function. </p><p>Our table config is described below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableName"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"REALTIME"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"segmentsConfig"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timeColumnName"</span><span class="token 
operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"schemaName"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replication"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"replicasPerPartition"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"1"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token 
punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamConfigs"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"streamType"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"kafka"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.topic.name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"events"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.broker.list"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"kafka-pause-resume:9093"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.type"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"lowlevel"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token 
property">"stream.kafka.consumer.prop.auto.offset.reset"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"smallest"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.consumer.factory.class.name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"stream.kafka.decoder.class.name"</span><span class="token operator">:</span><span class="token string" style="color:rgb(255, 121, 198)">"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span 
class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SS''Z''')"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"tenants"</span><span class="token 
operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"metadata"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Our transformation has a subtle error. The second parameter passed to the FromDateTime function describes the format of the DateTime string, which we defined as:</p><p>YYYY-MM-dd''T''HH:mm:ss.SS''Z''</p><p>But tsString has values in the following format:</p><p>2022-11-23T12:08:44.127550Z</p><p>i.e., we don’t have enough S values - the fractional seconds in tsString have 6 digits, so there should be 6 rather than 2. 
</p><p>If we create the table using the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network pause-resume </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/pinot/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost pinot-controller-pause-resume </span><span class="token punctuation" style="color:rgb(248, 
248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Pinot will immediately start trying to ingest data from Kafka, and it will throw a lot of exceptions that look like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI log"><pre tabindex="0" class="prism-code language-log codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">java.lang.RuntimeException: Caught exception while executing function: fromDateTime(tsString,'YYYY-MM-dd'T'HH:mm:ss.SS'Z'')</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">…</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">Caused by: java.lang.IllegalStateException: Caught exception while invoking method: public static long org.apache.pinot.common.function.scalar.DateTimeFunctions.fromDateTime(java.lang.String,java.lang.String) with arguments: [2022-11-23T11:12:34.682504Z, YYYY-MM-dd'T'HH:mm:ss.SS'Z']</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>At this point, we’d usually be stuck and would need to fix the transformation function and then restart the Pinot server. With the pause/resume feature, we can fix this problem without resorting to such drastic measures. </p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="the-pauseresume-flow"></a>The Pause/Resume Flow<a class="hash-link" href="#the-pauseresume-flow" title="Direct link to heading">#</a></h2><p>Instead, we can follow these steps:</p><ul><li>Pause ingestion for the table</li><li>Fix the transformation function</li><li>Resume ingestion</li><li>Profit $$$</li></ul><p>We can pause ingestion by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X POST </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/pauseConsumption"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The response 
should be something like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"pauseFlag"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">true</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"consumingSegments"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events__0__0__20221123T1106Z"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"description"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Pause flag is set. Consuming segments are being committed. 
Use /pauseStatus endpoint in a few moments to check if all consuming segments have been committed."</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Let’s follow the response’s advice and check the consuming segments status:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X GET </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/pauseStatus"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’ll see the following response:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" 
style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"pauseFlag"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">true</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"consumingSegments"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>So far, so good. Now we need to fix the table. We have a config, table-fixed.json, that contains a working transformation config. 
These are the lines of interest:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"ingestionConfig"</span><span class="token operator">:</span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformConfigs"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"columnName"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"transformFunction"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')"</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We now have six S values rather than two, which should sort out our ingestion.</p><p>Update the table config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X PUT </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"Content-Type: application/json"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -d @pinot/config/table-fixed.json</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And then resume ingestion. You can pass in the query string parameter consumeFrom, which takes a value of smallest or largest. We’ll pass in smallest since no data has been consumed yet:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X POST </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/resumeConsumption?consumeFrom=smallest"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>The response will be like this:</p><div 
class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"pauseFlag"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"consumingSegments"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"description"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"Pause flag is cleared. Consuming segments are being created. 
Use /pauseStatus endpoint in a few moments to double check."</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Again, let’s check the consuming segments status:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">curl</span><span class="token plain"> -X GET </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"http://localhost:9000/tables/events/pauseStatus"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -H </span><span class="token string" style="color:rgb(255, 121, 198)">"accept: application/json"</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This time we will see some consuming segments:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token punctuation" 
style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"pauseFlag"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"consumingSegments"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"events__0__22__20221123T1124Z"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Navigate to <a href="http://localhost:9000/#/query" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/query</a> and click on the events table. You should see the following:</p><p><img src="https://www.datocms-assets.com/75153/1669668611-image2.png" alt="Sample events table containing records" title="Sample events table containing records"></p><p>We have records! 
We can also run our data generator again, and more events will be ingested.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>This feature makes real-time data ingestion a bit more forgiving when things go wrong, which has got to be a good thing in my book.</p><p>When you look at the name of this feature, it can seem a bit esoteric and perhaps not something that you’d want to use, but I think you’ll find it to be extremely useful.</p><p>So give it a try and let us know how you get on. If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/pause">pause</a><a class="margin-horiz--sm" href="/blog/tags/resume">resume</a><a class="margin-horiz--sm" href="/blog/tags/real-time-ingestion">real-time ingestion</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.11 - Pausing Real-Time Ingestion" href="/blog/2022/11/28/Apache-Pinot-Pausing-Real-Time-Ingestion"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/11/22/Apache-Pinot-Timestamp-Indexes">Apache Pinot™ 0.11 - Timestamp Indexes</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-11-22T00:00:00.000Z">November 22, 2022</time> · 8 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" 
target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p><a href="https://youtu.be/DetGpHZuzDU?si=f0ejecqPBbBK21z-" target="_blank" rel="noopener noreferrer"><img src="https://i3.ytimg.com/vi/DetGpHZuzDU/maxresdefault.jpg" alt="Watch the video"></a></p><p>The recent Apache <a href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4" target="_blank" rel="noopener noreferrer">Pinot™ 0.11.0</a> release has lots of goodies for you to play with. This is the third in a series of blog posts showing off some of the new features in this release.</p><p>Pinot introduced the TIMESTAMP data type in the 0.8 release, which stores the time in millisecond epoch long format internally. The community feedback has been that the queries they’re running against timestamp columns don’t need this low-level granularity. </p><p>Instead, users write queries that use the datetrunc function to filter at a coarser granularity. Unfortunately, this approach results in scanning data and time value conversion work that takes a long time at large data volumes.</p><p>The <a href="https://docs.pinot.apache.org/basics/indexing/timestamp-index" target="_blank" rel="noopener noreferrer">timestamp index</a> solves that problem! 
In this blog post, we’ll use it to get an almost 5x query speed improvement on a relatively small dataset of only 7m rows.</p><p><img src="https://www.datocms-assets.com/75153/1669133004-image1.png" alt="Time in milliseconds with and without timestamp indexes bar chart" title="Time in milliseconds with and without timestamp indexes bar chart"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="spinning-up-pinot"></a>Spinning up Pinot<a class="hash-link" href="#spinning-up-pinot" title="Direct link to heading">#</a></h2><p>We’re going to be using the Pinot Docker container, but first, we’re going to create a network, as we’ll need that later on:</p><p>docker network create timestamp_blog</p><p>We’re going to spin up the empty <a href="https://docs.pinot.apache.org/basics/getting-started/quick-start" target="_blank" rel="noopener noreferrer">QuickStart</a> in a container named pinot-timestamp-blog:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">8000</span><span class="token plain">:8000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">9000</span><span class="token plain">:9000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> --name pinot-timestamp-blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> QuickStart -type EMPTY</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Or if you’re on a Mac M1, change the name of the image to have the -arm64 suffix, like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">8000</span><span class="token plain">:8000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">9000</span><span class="token plain">:9000 </span><span class="token punctuation" style="color:rgb(248, 248, 
242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --name pinot-timestamp-blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> QuickStart -type EMPTY</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Once that’s up and running, we’ll be able to access the Pinot Data Explorer at <a href="http://localhost:9000/" target="_blank" rel="noopener noreferrer">http://localhost:9000</a>, but at the moment, we don’t have any data to play with.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="importing-chicago-crime-dataset"></a>Importing Chicago Crime Dataset<a class="hash-link" href="#importing-chicago-crime-dataset" title="Direct link to heading">#</a></h2><p>The <a href="https://startree.ai/blog/analyzing-chicago-crimes-with-apache-pinot-and-streamlit" target="_blank" rel="noopener noreferrer">Chicago Crime dataset</a> is a small to medium-sized dataset with 7 million records representing reported crimes in the City of Chicago from 2001 until today.</p><p>It contains details of the type of crime, where it was committed, whether an arrest was recorded, which beat it occurred on, and more.</p><p>Each of the crimes has an associated timestamp, which makes it a good dataset to demonstrate timestamp 
indexes.</p><p>You can find the code used in this blog post in the <a href="https://github.com/startreedata/pinot-recipes/tree/main/recipes/analyzing-chicago-crimes" target="_blank" rel="noopener noreferrer">Analyzing Chicago Crimes</a> recipe section of <a href="https://github.com/startreedata/pinot-recipes" target="_blank" rel="noopener noreferrer">Pinot Recipes GitHub repository</a>. From here on, I’m assuming that you’ve downloaded this repository and are in the recipes/analyzing-chicago-crimes directory.</p><p>We’re going to create a schema and table named crimes by running the following command: </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 AddTable </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile 
/config/schema.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost pinot-timestamp-blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We should see the following output: </p><p>2022/11/03 13:07:57.169 INFO [AddTableCommand] [main] {"unrecognizedProperties":{},"status":"TableConfigs crimes successfully added"}</p><p>A screenshot of the schema is shown below:</p><p><img src="https://www.datocms-assets.com/75153/1669132979-image3.png" alt="Chicago crime dataset table schema" title="Chicago crime dataset table schema"></p><p>We won’t go through the table config and schema files in this blog post because we did that in the last post, but you can find them in the <a href="https://github.com/startreedata/pinot-recipes/tree/main/recipes/analyzing-chicago-crimes/config" target="_blank" rel="noopener noreferrer">config</a> directory on GitHub. </p><p>Now, let’s import the dataset. </p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/data:/data </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 LaunchDataIngestionJob </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -jobSpecFile /config/job-spec.yml </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -values </span><span class="token assign-left variable" style="color:rgb(189, 147, 249);font-style:italic">controllerHost</span><span class="token operator">=</span><span class="token plain">pinot-timestamp-blog </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It will take a few minutes to load, but once that command has finished, we’re ready to query the crimes table.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="querying-crimes-by-date"></a>Querying crimes by date<a class="hash-link" href="#querying-crimes-by-date" title="Direct link to heading">#</a></h2><p>The following query finds the number of crimes that happened after 16th January 2017, grouped by week of the year, with the most crime-filled weeks shown first:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token 
keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> datetrunc</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'WEEK'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> DateEpoch</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> tsWeek</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> crimes </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> tsWeek </span><span class="token operator">></span><span class="token plain"> fromDateTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'2017-01-16'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'yyyy-MM-dd'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> tsWeek</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run that query, we’ll see the following results:</p><p><img src="https://www.datocms-assets.com/75153/1669133027-image6.png" alt="Chicago crime dataset query result" title="Chicago crime dataset query result"></p><p>And, if we look above the query result, there’s metadata about the query, including the time that it took to run.</p><p><img src="https://www.datocms-assets.com/75153/1669133059-image5.png" alt="Chicago crime dataset metadata about the query, including the time that it took to run" title="Chicago crime dataset 
metadata about the query, including the time that it took to run"></p><p>The query took 141 ms to execute, so that’s our baseline.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="adding-the-timestamp-index"></a>Adding the timestamp index<a class="hash-link" href="#adding-the-timestamp-index" title="Direct link to heading">#</a></h2><p>We could add a timestamp index directly to this table and then compare query performance, but to make it easier to do comparisons, we’re going to create an identical table with the timestamp index applied. </p><p>The full table config is available in the <a href="https://github.com/startreedata/pinot-recipes/blob/main/recipes/analyzing-chicago-crimes/config/table-index.json" target="_blank" rel="noopener noreferrer">config/table-index.json</a> file, and the main change is that we’ve added the following section to add a timestamp index on the DateEpoch column:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"fieldConfigList"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DateEpoch"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span 
class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"encodingType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DICTIONARY"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"indexTypes"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timestampConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularities"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DAY"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">"WEEK"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MONTH"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p><em>encodingType</em> will always be ‘DICTIONARY’ and <em>indexTypes</em> must contain ‘TIMESTAMP’. 
We should specify granularities based on our query patterns.</p><p>As a rule of thumb, work out which values you most commonly pass as the first argument to the <a href="https://docs.pinot.apache.org/configuration-reference/functions/datetrunc" target="_blank" rel="noopener noreferrer">datetrunc function</a> in your queries and include those values.</p><p>The full list of valid granularities is: <em>millisecond</em>, <em>second</em>, <em>minute</em>, <em>hour</em>, <em>day</em>, <em>week</em>, <em>month</em>, <em>quarter</em>, and <em>year</em>.</p><p>Our new table is called crimes_indexed, and we’re also going to create a new schema with all the same columns called crimes_indexed, as Pinot requires the table and schema names to match.</p><p>We can create the schema and table by running the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 AddTable 
</span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -schemaFile /config/schema-index.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -tableConfigFile /config/table-index.json </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -controllerHost pinot-timestamp-blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -exec </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’ll populate that table by copying the segment that we created earlier for the crimes table. 
</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network timestamp_blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/data:/data </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 LaunchDataIngestionJob </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -jobSpecFile /config/job-spec-download.yml </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -values </span><span class="token assign-left 
variable" style="color:rgb(189, 147, 249);font-style:italic">controllerHost</span><span class="token operator">=</span><span class="token plain">pinot-timestamp-blog </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If you’re curious how that job spec works, I <a href="https://www.markhneedham.com/blog/2021/12/06/apache-pinot-copy-segment-new-table/" target="_blank" rel="noopener noreferrer">wrote a blog post explaining it in a bit more detail</a>.</p><p>Once the Pinot Server has downloaded this segment, it will apply the timestamp index to the DateEpoch column. </p><p>For the curious, we can see this happening in the log files by connecting to the Pinot container and running the following grep command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">docker </span><span class="token builtin class-name" style="color:rgb(189, 147, 249)">exec</span><span class="token plain"> -iti pinot-timestamp-blog </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">grep</span><span class="token plain"> -rni -A10 </span><span class="token string" style="color:rgb(255, 121, 198)">"Successfully downloaded segment:.*crimes_indexed_OFFLINE.*"</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> logs/pinot-all.log</span></span></code></pre><button 
type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We’ll see something like the following (tidied up for brevity):</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI log"><pre tabindex="0" class="prism-code language-log codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token plain">[BaseTableDataManager] Successfully downloaded segment: crimes_OFFLINE_0 of table: crimes_indexed_OFFLINE to index dir: /tmp/1667490598253/quickstart/PinotServerDataDir0/crimes_indexed_OFFLINE/crimes_OFFLINE_0</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[V3DefaultColumnHandler] Starting default column action: ADD_DATE_TIME on column: $DateEpoch$DAY</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[SegmentDictionaryCreator] Created dictionary for LONG column: $DateEpoch$DAY with cardinality: 7969, range: 978307200000 to 1666742400000</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[V3DefaultColumnHandler] Starting default column action: ADD_DATE_TIME on column: $DateEpoch$WEEK</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[SegmentDictionaryCreator] Created dictionary for LONG column: $DateEpoch$WEEK with cardinality: 1139, range: 978307200000 to 1666569600000</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[V3DefaultColumnHandler] Starting default column action: ADD_DATE_TIME on column: $DateEpoch$MONTH</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[SegmentDictionaryCreator] Created dictionary for LONG column: $DateEpoch$MONTH with cardinality: 262, range: 978307200000 to 1664582400000</span></span><span class="token-line" style="color:#F8F8F2"><span 
class="token plain">[RangeIndexHandler] Creating new range index for segment: crimes_OFFLINE_0, column: $DateEpoch$DAY</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[RangeIndexHandler] Created range index for segment: crimes_OFFLINE_0, column: $DateEpoch$DAY</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[RangeIndexHandler] Creating new range index for segment: crimes_OFFLINE_0, column: $DateEpoch$WEEK</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">[RangeIndexHandler] Created range index for segment: crimes_OFFLINE_0, column: $DateEpoch$WEEK</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"> |
| </span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="what-does-a-timestamp-index-do"></a>What does a timestamp index do?<a class="hash-link" href="#what-does-a-timestamp-index-do" title="Direct link to heading">#</a></h2><p>So, the timestamp index has now been created, but what does it actually do?</p><p>When we add a timestamp index on a column, Pinot creates a derived column for each granularity and adds a range index for each new column.</p><p>In our case, that means we’ll have these extra columns: $DateEpoch$DAY, $DateEpoch$WEEK, and $DateEpoch$MONTH. </p><p>We can check if the extra columns and indexes have been added by navigating to the <a href="http://localhost:9000/#/tenants/table/crimes_indexed_OFFLINE/crimes_OFFLINE_0" target="_blank" rel="noopener noreferrer">segment page</a> and typing $DateEpoch in the search box. You should see the following:</p><p><img src="https://www.datocms-assets.com/75153/1669133112-image2.png" alt="Apache Pinot timestamp index on a column" title="Apache Pinot timestamp index on a column"></p><p>These columns will be assigned the following values:</p><ul><li>$DateEpoch$DAY = dateTrunc(‘DAY’, DateEpoch)</li><li>$DateEpoch$WEEK = dateTrunc(‘WEEK’, DateEpoch)</li><li>$DateEpoch$MONTH = dateTrunc(‘MONTH’, DateEpoch)</li></ul><p>Pinot will also rewrite any queries that use the dateTrunc function with DAY, WEEK, or MONTH and the DateEpoch field to use those new columns.</p><p>This means that this query:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span 
class="token plain"> datetrunc</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'WEEK'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> DateEpoch</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> tsWeek</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> crimes_indexed</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> tsWeek</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Would be rewritten as:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre 
tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> $DateEpoch$WEEK </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> tsWeek</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> crimes_indexed</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> tsWeek</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And our query:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" 
style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> datetrunc</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'WEEK'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> DateEpoch</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> tsWeek</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> crimes </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> tsWeek </span><span class="token operator">></span><span class="token plain"> fromDateTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'2017-01-16'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 
198)">'yyyy-MM-dd'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> tsWeek</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Would be rewritten as:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">select</span><span class="token plain"> $DateEpoch$WEEK </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">as</span><span class="token plain"> tsWeek</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> crimes </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> tsWeek </span><span class="token operator">></span><span class="token plain"> fromDateTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">'2017-01-16'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'yyyy-MM-dd'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">group</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> tsWeek</span></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">order</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">by</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">count</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DESC</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">limit</span><span class="token plain"> </span><span class="token number">10</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="re-running-the-query"></a>Re-running the query<a class="hash-link" href="#re-running-the-query" title="Direct link to heading">#</a></h2><p>Let’s now run our initial query against the <em>crimes_indexed</em> table. We’ll get exactly the same results as before, but let’s take a look at the query stats:</p><p><img src="https://www.datocms-assets.com/75153/1669133083-image4.png" alt="Chicago crime dataset updated query stats" title="Chicago crime dataset updated query stats"></p><p>This time the query takes 36 milliseconds rather than 140 milliseconds. 
That’s an almost 4x improvement, thanks to the timestamp index.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>Hopefully, you’ll agree that timestamp indexes are pretty cool, and achieving a roughly 4x query improvement without much work is always welcome!</p><p>If you’re using timestamps in your Pinot tables, be sure to try out this index and let us know how it goes on the <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">StarTree Community Slack</a>. We’re always happy to help out with any questions or problems you encounter.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/timestamp">Timestamp</a><a class="margin-horiz--sm" href="/blog/tags/datetrunc">datetrunc</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.11 - Timestamp Indexes" href="/blog/2022/11/22/Apache-Pinot-Timestamp-Indexes"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/11/17/Apache Pinot-Inserts-from-SQL">Apache Pinot™ 0.11 - Inserts from SQL</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-11-17T00:00:00.000Z">November 17, 2022</time> · 4 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div 
class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p>The Apache Pinot community recently released version <a href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4" target="_blank" rel="noopener noreferrer">0.11.0</a>, which has lots of goodies for you to play with. This is the second in a series of blog posts showing off some of the new features in this release.</p><p>In this post, we’re going to explore the <a href="https://docs.pinot.apache.org/basics/data-import/from-query-console" target="_blank" rel="noopener noreferrer">INSERT INTO clause</a>, which makes ingesting batch data into Pinot as easy as writing a SQL query.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="batch-importing-the-job-specification"></a>Batch importing: The Job Specification<a class="hash-link" href="#batch-importing-the-job-specification" title="Direct link to heading">#</a></h2><p>The power of this new clause is only fully appreciated if we look at what we had to do before it existed. </p><p>In the <a href="https://www.youtube.com/watch?v=1EMBx1XeI9o" target="_blank" rel="noopener noreferrer">Batch Import JSON from Amazon S3 into Apache Pinot | StarTree Recipes</a> video (and <a href="https://dev.startree.ai/docs/pinot/recipes/ingest-csv-files-from-s3" target="_blank" rel="noopener noreferrer">accompanying developer guide</a>), we showed how to ingest data into Pinot from an S3 bucket.</p><p>The contents of that bucket are shown in the screenshot below:</p><p><img src="https://www.datocms-assets.com/75153/1668701275-image4.png" alt="Sample data ingested into Apache Pinot from an S3 bucket" title="Sample data ingested into Apache Pinot from an S3 bucket"></p><p>Let’s quickly recap the steps that we had to do to import those files into Pinot. 
We have a table called events, which has the following schema:</p><p><img src="https://www.datocms-assets.com/75153/1668701353-image1.png" alt="Events schema table" title="Events schema table"></p><p>We first created a job specification file, which contains a description of our import job. The job file is shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI yaml"><pre tabindex="0" class="prism-code language-yaml codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token key atrule">executionFrameworkSpec</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'standalone'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">segmentGenerationJobRunnerClassName</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentGenerationJobRunner'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">segmentTarPushJobRunnerClassName</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentTarPushJobRunner'</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">segmentUriPushJobRunnerClassName</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentUriPushJobRunner'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">jobType</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> SegmentCreationAndTarPush</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">inputDirURI</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'s3://marks-st-cloud-bucket/events/'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">includeFileNamePattern</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'glob:**/*.json'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">outputDirURI</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'/data'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">overwriteOutput</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token boolean important">true</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">pinotFSSpecs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token key atrule">scheme</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> s3</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">className</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> org.apache.pinot.plugin.filesystem.S3PinotFS</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">configs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">region</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'eu-west-2'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token key atrule">scheme</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> file</span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">className</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> org.apache.pinot.spi.filesystem.LocalPinotFS</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">recordReaderSpec</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">dataFormat</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'json'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">className</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'org.apache.pinot.plugin.inputformat.json.JSONRecordReader'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key atrule">tableSpec</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token key atrule">tableName</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'events'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token key 
atrule">pinotClusterSpecs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">-</span><span class="token plain"> </span><span class="token key atrule">controllerURI</span><span class="token punctuation" style="color:rgb(248, 248, 242)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'http://${PINOT_CONTROLLER}:9000'</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>At a high level, this file describes a batch import job that will ingest files from the S3 bucket at s3://marks-st-cloud-bucket/events/ where the files match the glob:**/*.json pattern.</p><p>We can import the data by running the following command from the terminal:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> --network ingest-json-files-s3 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -v </span><span class="token environment constant" style="color:rgb(189, 147, 249)">$PWD</span><span class="token plain">/config:/config </span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -e </span><span class="token assign-left variable" style="color:rgb(189, 147, 249);font-style:italic">AWS_ACCESS_KEY_ID</span><span class="token operator">=</span><span class="token plain">AKIARCOCT6DWLUB7F77Z </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -e </span><span class="token assign-left variable" style="color:rgb(189, 147, 249);font-style:italic">AWS_SECRET_ACCESS_KEY</span><span class="token operator">=</span><span class="token plain">gfz71RX+Tj4udve43YePCBqMsIeN1PvHXrVFyxJS </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0 LaunchDataIngestionJob </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -jobSpecFile /config/job-spec.yml </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -values </span><span class="token assign-left variable" style="color:rgb(189, 147, 249);font-style:italic">PINOT_CONTROLLER</span><span class="token operator">=</span><span class="token plain">pinot-controller</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>And don’t worry, those credentials have already been deleted; I find it easier to understand what values go where if we use real values. 
</p><p>Once we’ve run this command, if we go to the Pinot UI at <a href="http://localhost:9000/" target="_blank" rel="noopener noreferrer">http://localhost:9000</a> and click through to the events table from the Query Console menu, we’ll see that the records have been imported, as shown in the screenshot below:</p><p><img src="https://www.datocms-assets.com/75153/1668701512-image3.png" alt="Sample imported records shown in the Apache Pinot Query Console menu" title="Sample imported records shown in the Apache Pinot Query Console menu"></p><p>This approach works, and we may still prefer to use it when we need fine-grained control over the ingestion parameters, but it is a bit heavyweight for your everyday data import!</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="batch-importing-with-sql"></a>Batch Importing with SQL<a class="hash-link" href="#batch-importing-with-sql" title="Direct link to heading">#</a></h2><p>Now let’s do the same thing in SQL.</p><p>There are some prerequisites to using the SQL approach, so let’s go through those now, so you don’t end up with a bunch of exceptions when you try this out! 
</p><p>First of all, you must have a <a href="https://docs.pinot.apache.org/basics/components/minion" target="_blank" rel="noopener noreferrer">Minion</a> in the Pinot cluster, as this is the component that will do the data import.</p><p>You’ll also need to include the following in your table config:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"task"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"taskTypeConfigsMap"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"> </span><span class="token property">"SegmentGenerationAndPushTask"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>As long as you’ve done those two things, we’re ready to write our import query! 
A query that imports JSON files from my S3 bucket is shown below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INSERT</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INTO</span><span class="token plain"> events</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FILE</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'s3://marks-st-cloud-bucket/events/'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">OPTION</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> taskName</span><span class="token operator">=</span><span class="token plain">events</span><span class="token operator">-</span><span class="token plain">task</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> includeFileNamePattern</span><span class="token operator">=</span><span class="token plain">glob:</span><span class="token operator">*</span><span class="token operator">*</span><span class="token 
operator">/</span><span class="token operator">*</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">json</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> input</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">className</span><span class="token operator">=</span><span class="token plain">org</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">apache</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">pinot</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">plugin</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">filesystem</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">S3PinotFS</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> input</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">prop</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">accessKey</span><span class="token operator">=</span><span class="token plain">AKIARCOCT6DWLUB7F77Z</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> input</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">prop</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">secretKey</span><span class="token operator">=</span><span class="token plain">gfz71RX</span><span class="token operator">+</span><span class="token plain">Tj4udve43YePCBqMsIeN1PvHXrVFyxJS</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> input</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">fs</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">prop</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">region</span><span class="token operator">=</span><span class="token plain">eu</span><span class="token operator">-</span><span class="token plain">west</span><span class="token operator">-</span><span class="token number">2</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run this query, we’ll see the following output:</p><p><img src="https://www.datocms-assets.com/75153/1668701654-image5.png" alt="Sample events_OFFLINE query result" title="Sample events_OFFLINE query result"></p><p>We can check on the state of the ingestion job via 
the Swagger REST API. If we navigate to <a href="http://localhost:9000/help#/Task/getTaskState" target="_blank" rel="noopener noreferrer">http://localhost:9000/help#/Task/getTaskState</a>, paste Task_SegmentGenerationAndPushTask_events-task as our task name, and then click Execute, we’ll see the following:</p><p><img src="https://www.datocms-assets.com/75153/1668701727-image2.png" alt="Checking the state of an ingestion job screen" title="Checking the state of an ingestion job screen"></p><p>If we see the state COMPLETED, this means the data has been ingested, which we can check by going back to the Query Console and clicking on the events table.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>I have to say that batch ingestion of data into Apache Pinot has always felt a bit clunky, but with this new clause, it’s super easy, and it’s gonna save us all a bunch of time.</p><p>Also, anything that means I’m not writing YAML files has got to be a good thing!</p><p>So give it a try and let us know how you get on. 
If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/insert">Insert</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.11 - Inserts from SQL" href="/blog/2022/11/17/Apache Pinot-Inserts-from-SQL"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/11/08/Apache Pinot-How-do-I-see-my-indexes">Apache Pinot™ 0.11 - How do I see my indexes?</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-11-08T00:00:00.000Z">November 8, 2022</time> · 4 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661544338-mark-needham.png" alt="Mark Needham"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/markhneedham/" target="_blank" rel="noopener noreferrer">Mark Needham</a></div><small class="avatar__subtitle">Mark Needham</small></div></div></header><div class="markdown"><p>We recently released <a href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4" target="_blank" rel="noopener noreferrer">Pinot 0.11.0</a>, which has lots of goodies for you to play with. 
This is the first in a series of blog posts showing off some of the new features in this release.</p><p>A common question from the community is: how can you work out which indexes are currently defined on a Pinot table? This information has always been <a href="https://docs.pinot.apache.org/users/api/pinot-rest-admin-interface" target="_blank" rel="noopener noreferrer">available via the REST API</a>, but sometimes you simply want to see it on the UI and not have to parse your way through a bunch of JSON. Let's see how it works!</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="spinning-up-pinot"></a>Spinning up Pinot<a class="hash-link" href="#spinning-up-pinot" title="Direct link to heading">#</a></h2><p>We’re going to spin up the Batch <a href="https://docs.pinot.apache.org/basics/getting-started/quick-start" target="_blank" rel="noopener noreferrer">QuickStart</a> in Docker using the following command:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">8000</span><span class="token plain">:8000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">9000</span><span class="token plain">:9000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> QuickStart -type BATCH</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Or if you’re on a Mac M1, change the name of the image to have the arm-64 suffix, like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">8000</span><span class="token plain">:8000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">9000</span><span class="token plain">:9000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.11.0-arm64 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> QuickStart -type BATCH</span></span></code></pre><button type="button" aria-label="Copy 
code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Once that’s up and running, navigate to <a href="http://localhost:9000/#/" target="_blank" rel="noopener noreferrer">http://localhost:9000/#/</a> and click on Tables. Under the tables section click on airlineStats_OFFLINE. You should see a page that looks like this:</p><p><img src="https://www.datocms-assets.com/75153/1667915561-image1-edittable.png" alt="airlineStats_OFFLINE page" title="airlineStats_OFFLINE page"></p><p>Click on Edit Table. This will show a window with the config for this table.</p><p><img src="https://www.datocms-assets.com/75153/1667915654-image3.png" alt="Window with configuration for airlineStats_OFFLINE table" title="Window with configuration for airlineStats_OFFLINE table"></p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="indexing-config"></a>Indexing Config<a class="hash-link" href="#indexing-config" title="Direct link to heading">#</a></h2><p>We’re interested in the tableIndexConfig and fieldConfigList sections. These sections are responsible for defining indexes, which are applied to a table on a per segment basis. 
</p><ul><li>tableIndexConfig is responsible for inverted, JSON, range, Geospatial, and StarTree indexes.</li><li>fieldConfigList is responsible for timestamp and text indexes.</li></ul><p>tableIndexConfig is defined below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"tableIndexConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"rangeIndexVersion"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">2</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"autoGeneratedInvertedIndex"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"createInvertedIndexDuringSegmentGeneration"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"loadMode"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MMAP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"enableDefaultStarTree"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"enableDynamicStarTreeCreation"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"aggregateMetrics"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"nullHandlingEnabled"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"optimizeDictionaryForMetrics"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token boolean">false</span><span class="token punctuation" 
style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"noDictionarySizeRatioThreshold"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token number">0</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>From reading this config we learn that no indexes have been explicitly defined.</p><p>Now for fieldConfigList, which is defined below:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI json"><pre tabindex="0" class="prism-code language-json codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token property">"fieldConfigList"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"name"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"ts"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"encodingType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DICTIONARY"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"indexType"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"indexTypes"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"TIMESTAMP"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"timestampConfig"</span><span class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">{</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token property">"granularities"</span><span 
class="token operator">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">[</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"DAY"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"WEEK"</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">"MONTH"</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">}</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">]</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>From reading this config we learn that a timestamp index is being applied to the <em>ts</em> column. 
It is applied at DAY, WEEK, and MONTH granularities, which means that the derived columns $ts$DAY, $ts$WEEK, and $ts$MONTH will be created for the segments in this table.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="viewing-indexes"></a>Viewing Indexes<a class="hash-link" href="#viewing-indexes" title="Direct link to heading">#</a></h2><p>Now, close the table config modal, and under the segments section, open airlineStats_OFFLINE_16071_16071_0 and airlineStats_OFFLINE_16073_16073_0 in new tabs.</p><p>If you look at one of those segments, you’ll see the following grid that lists columns/field names against the indexes defined on those fields.</p><p><img src="https://www.datocms-assets.com/75153/1667915996-image7.png" alt="Segment grid that lists columns/field names against the indexes defined on those fields" title="Segment grid that lists columns/field names against the indexes defined on those fields"></p><p>All the fields on display are persisting their values using the dictionary/forward <a href="https://docs.pinot.apache.org/basics/indexing/forward-index" target="_blank" rel="noopener noreferrer">index format</a>. Still, we can also see that the Quarter column is sorted and has an inverted index, neither of which we explicitly defined.</p><p>This is because Pinot will automatically create sorted and inverted indexes for columns whose data is sorted when the segment is created. </p><p>So the data for the Quarter column was sorted, and hence it has a sorted index.</p><p>I’ve written a couple of blog posts explaining how sorted indexes work on offline and real-time tables:</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="adding-an-index"></a>Adding an Index<a class="hash-link" href="#adding-an-index" title="Direct link to heading">#</a></h2><p>Next, let’s see what happens if we add an explicit index. We’re going to add an inverted index to the FlightNum column. 
Go to Edit Table config again and update tableIndexConfig to have the following value:</p><p><img src="https://www.datocms-assets.com/75153/1667916147-image6.png" alt="Inverted index addition" title="Inverted index addition"></p><p>If you go back to the page for segment airlineStats_OFFLINE_16073_16073_0, notice that it does not have an inverted index for this field.</p><p><img src="https://www.datocms-assets.com/75153/1667916232-image2.png" alt="page for segment airlineStats_OFFLINE_16073_16073_0 without an inverted index" title="page for segment airlineStats_OFFLINE_16073_16073_0 without an inverted index"></p><p>This is because indexes are applied on a per segment basis. If we want the inverted index on the FlightNum column in this segment, we can click <em>Reload Segment</em> on this page, or we can go back to the table page and click <em>Reload All Segments</em>. </p><p>If we do that, all the segments in the airlineStats_OFFLINE table will eventually have an inverted index on FlightNum.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="summary"></a>Summary<a class="hash-link" href="#summary" title="Direct link to heading">#</a></h2><p>As I mentioned in the introduction, information about the indexes on each segment has always been available via the REST API, but this feature democratizes that information. 
</p><p>If you have any questions about this feature, feel free to join us on <a href="https://stree.ai/slack" target="_blank" rel="noopener noreferrer">Slack</a>, where we’ll be happy to help you out.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/indexes">Indexes</a></div><div class="col text--right"><a aria-label="Read more about Apache Pinot™ 0.11 - How do I see my indexes?" href="/blog/2022/11/08/Apache Pinot-How-do-I-see-my-indexes"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/08/02/GapFill-Function-For-Time-Series-Datasets-In-Pinot">GapFill Function For Time-Series Datasets In Pinot</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-08-02T00:00:00.000Z">August 2, 2022</time> · 9 min read</div><div class="avatar margin-vert--md"><a href="https://www.linkedin.com/in/lakshmanan-velusamy-a778a517/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://www.datocms-assets.com/75153/1661479772-lakshmanan-portait.jpeg" alt="Weixiang Sun,Lakshmanan Velusamy"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://www.linkedin.com/in/lakshmanan-velusamy-a778a517/" target="_blank" rel="noopener noreferrer">Weixiang Sun,Lakshmanan Velusamy</a></div><small class="avatar__subtitle">Weixiang Sun,Lakshmanan Velusamy</small></div></div></header><div class="markdown"><p>Many real-world datasets are time-series in nature, tracking the value or state changes of entities over time. 
The values may be polled and recorded at constant time intervals, at random irregular intervals, or only when the value/state changes. There are many real-world use cases of time series data. Here are some specific examples:</p><ul><li>Telemetry from sensors monitoring the status of industrial equipment.</li><li>Real-time vehicle data such as speed, braking, and acceleration, to produce the driver's risk score trend.</li><li>Server performance metrics such as CPU, I/O, memory, and network usage over time.</li><li>An automated system tracking the status of a store or items in an online marketplace.</li></ul><p>In this post, let us use an IoT dataset tracking the occupancy status of the individual parking slots in a parking garage using automated sensors. The granularity of recorded data points might be sparse or the events could be missing due to network and other device issues in the IoT environment. The following figure demonstrates entities emitting values at irregular intervals as the value changes. Polling and recording values of all entities regularly at a lower granularity would consume more resources, take up more space on disk and during processing, and incur high costs. 
But analytics applications that are operating on these datasets might be querying for values at a lower granularity than the data recording interval (e.g., a dashboard showing the total number of occupied parking slots at 15-minute granularity over the past week when the sensors are not recording status that frequently).</p><p><img src="https://www.datocms-assets.com/75153/1661700264-entities-emitting-data.png" alt="Entities emitting data over time at irregular intervals" title="Entities emitting data over time at irregular intervals"></p><p>It is important for Pinot to provide on-the-fly interpolation (filling in the missing data) functionality to better handle time-series data.</p><p>Starting from the 0.11.0 release, we introduced a new query syntax with gapfilling functions to interpolate data and perform powerful aggregations and data processing over time series data.</p><p>We will discuss the query syntax with an example and then the internal architecture.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="processing-time-series-data-in-pinot"></a>Processing time series data in Pinot<a class="hash-link" href="#processing-time-series-data-in-pinot" title="Direct link to heading">#</a></h2><p>Let us use the following sample data set tracking the status of parking lots in the parking space to understand this feature in detail.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="sample-dataset"></a>Sample Dataset:<a class="hash-link" href="#sample-dataset" title="Direct link to heading">#</a></h3><p><img src="https://www.datocms-assets.com/75153/1661700333-parking-data-table.png" alt="Sample parking lot dataset" title="Sample parking lot dataset"></p><p>parking_data table</p><p>Use case: We want to find out the total number of parking lots that are occupied over a period of time, which would be a common use case for a company that manages parking spaces.</p><p>Let us take a 30-minute time bucket as an example:</p><p><img 
src="https://www.datocms-assets.com/75153/1661700377-30-min-bucket-example.png" alt="Sample parking lot dataset with 30 minute time bucket" title="Sample parking lot dataset with 30 minute time bucket"></p><p>In the 30-minute aggregation results table above, we can see a lot of missing data as many lots didn't have anything recorded in those 30-minute windows. To calculate the number of occupied parking lots per time bucket, we need to gap-fill the missing data for each of these 30-minute windows.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="interpolating-missing-data"></a>Interpolating missing data<a class="hash-link" href="#interpolating-missing-data" title="Direct link to heading">#</a></h2><p>There are multiple ways to infer and fill the missing values. In the current version, we introduce the following methods, which are more common:</p><ul><li>FILL_PREVIOUS_VALUE can be used to fill time buckets that are missing values for entities with the last observed value. If no previously observed value can be found, the default value is used as an alternative.</li><li>FILL_DEFAULT_VALUE can be used to fill time buckets that are missing values for entities with the default value depending on the data type.</li></ul><p>More advanced gapfilling strategies such as using the next observed value, the value from the previous day or past week, or the value computed using a subquery shall be introduced in the future.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="gapfill-query-with-a-use-case"></a>Gapfill Query with a Use Case:<a class="hash-link" href="#gapfill-query-with-a-use-case" title="Direct link to heading">#</a></h2><p>Let us write a query to <em>get the total number of occupied parking lots every 30 minutes over time on the parking lot dataset</em> discussed above.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="query-syntax"></a>Query Syntax:<a class="hash-link" href="#query-syntax" title="Direct link to heading">#</a></h3><div 
class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> time_col</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">SUM</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> occupied_slots_count</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> GAPFILL</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">time_col</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'1:MILLISECONDS:SIMPLE_DATE_FORMAT:yyyy-MM-dd HH:mm:ss.SSS'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 
198)">'2021-10-01 09:00:00.000'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'2021-10-01 12:00:00.000'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'30:MINUTES'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> FILL</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'FILL_PREVIOUS_VALUE'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> TIMESERIESON</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">lot_id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> lot_id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">FROM</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> DATETIMECONVERT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">event_time</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'1:MILLISECONDS:EPOCH'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'1:MILLISECONDS:SIMPLE_DATE_FORMAT:yyyy-MM-dd HH:mm:ss.SSS'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'30:MINUTES'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> time_col</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> lot_id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> lastWithTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">is_occupied</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> event_time</span><span class="token punctuation" style="color:rgb(248, 
248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'INT'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> parking_data</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> event_time </span><span class="token operator">>=</span><span class="token plain"> </span><span class="token number">1633078800000</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> event_time </span><span class="token operator"><=</span><span class="token plain"> </span><span class="token number">1633089600000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> </span><span class="token number">1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token number">2</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 
147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> </span><span class="token number">1</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LIMIT</span><span class="token plain"> </span><span class="token number">100</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LIMIT</span><span class="token plain"> </span><span class="token number">100</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> </span><span class="token number">1</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LIMIT</span><span class="token plain"> </span><span class="token number">100</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This query suggests three main steps:</p><ol><li>The raw data will be aggregated;</li><li>The aggregated data will be gapfilled;</li><li>The gapfilled data will be aggregated.</li></ol><p>We make one assumption that the raw data is 
sorted by timestamp. The Gapfill and Post-Gapfill Aggregation will not sort the data.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="query-components"></a>Query components:<a class="hash-link" href="#query-components" title="Direct link to heading">#</a></h3><p>The following concepts were added to interpolate and handle time-series data.</p><ul><li>LastWithTime(dataColumn, timeColumn, 'dataType') - To get the last value of dataColumn where the timeColumn is used to define the time of dataColumn. This is useful to pick the latest value when there are multiple values found within a time bucket. Please see <a href="https://docs.pinot.apache.org/users/user-guide-query/supported-aggregations" target="_blank" rel="noopener noreferrer">https://docs.pinot.apache.org/users/user-guide-query/supported-aggregations</a> for more details.</li><li>Fill(column, FILL_TYPE) - To fill the missing data of the column with the FILL_TYPE.</li><li>TimeSeriesOn - To specify the columns to uniquely identify entities whose data will be interpolated.</li><li>Gapfill - Specify the time range, the time bucket size, how to fill the missing data, and entity definition.</li></ul><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="query-workflow"></a>Query Workflow<a class="hash-link" href="#query-workflow" title="Direct link to heading">#</a></h3><p>The innermost sql will convert the raw event table to the following table.</p><p><img src="https://www.datocms-assets.com/75153/1661700439-innermost-sql.png" alt="Sample parking lot query workflow innermost SQL" title="Sample parking lot query workflow innermost SQL"></p><p>The second most nested sql will gap fill the returned data as below:</p><p><img src="https://www.datocms-assets.com/75153/1661700473-second-most.png" alt="Sample parking lot query workflow second most SQL" title="Sample parking lot query workflow second most SQL"></p><p>The outermost query will aggregate the gapfilled data as follows:</p><p><img 
src="https://www.datocms-assets.com/75153/1661700517-outermost.png" alt="Sample parking lot query workflow outermost SQL" title="Sample parking lot query workflow outermost SQL"></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="other-supported-query-scenarios"></a>Other Supported Query Scenarios:<a class="hash-link" href="#other-supported-query-scenarios" title="Direct link to heading">#</a></h3><p>The above example demonstrates the support to aggregate both before and after gapfilling. Pre and/or post aggregations can be skipped if they are not needed. The gapfilling query syntax is flexible to support the following use cases:</p><ul><li>Select/Gapfill - Gapfill the missing data for the time bucket. Just the raw events are fetched, gapfilled, and returned. No aggregation is needed.</li><li>Aggregate/Gapfill - If there are multiple entries within the time bucket we can pick a representative value by applying an aggregate function. Then the missing data for the time buckets will be gap filled.</li><li>Gapfill/Aggregate - Gapfill the data and perform some form of aggregation on the interpolated data.</li></ul><p>For detailed query syntax and how it works, please refer to the documentation here: <a href="https://docs.pinot.apache.org/users/user-guide-query/gap-fill-functions" target="_blank" rel="noopener noreferrer">https://docs.pinot.apache.org/users/user-guide-query/gap-fill-functions</a>.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="how-does-it-work"></a>How does it work?<a class="hash-link" href="#how-does-it-work" title="Direct link to heading">#</a></h2><p>Let us use the sample query given above as an example to understand what's going on behind the scenes and how Pinot executes the gapfill queries.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="request-flow"></a>Request Flow<a class="hash-link" href="#request-flow" title="Direct link to heading">#</a></h3><p>Here is the list of steps in executing the query at a high 
level:</p><ol><li>Pinot Broker receives the gapfill query. It will strip off the gapfill part and send out the stripped SQL query to the pinot server.</li><li>The pinot server will process the query as a normal query and return the result back to the pinot broker.</li><li>The pinot broker will run the DataTableReducer to merge the results from pinot servers. The result will be sent to GapfillProcessor.</li><li>The GapfillProcessor will gapfill the received result and apply the filter against the gap-filled result.</li><li>Post-Gapfill aggregation and filtering will be applied to the result from the last step.</li></ol><p>There are two gapfill-specific steps:</p><ol><li>When the Pinot Broker receives the gapfill SQL query, it will strip out gapfill related information and send out the stripped SQL query to the pinot server.</li><li>GapfillProcessor will process the result from BrokerReducerService. The gapfill logic will be applied to the reduced result.</li></ol><p><img src="https://www.datocms-assets.com/75153/1661700601-gapfill-steps.png" alt="Gapfill steps" title="Gapfill steps"></p><p>Here is the stripped version of the sql query sent to servers for the query shared above:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> DATETIMECONVERT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">event_time</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'1:MILLISECONDS:EPOCH'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token 
plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'1:MILLISECONDS:SIMPLE_DATE_FORMAT:yyyy-MM-dd HH:mm:ss.SSS'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token string" style="color:rgb(255, 121, 198)">'30:MINUTES'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> time_col</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> lot_id</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> lastWithTime</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">is_occupied</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> event_time</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'INT'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> parking_data</span></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> event_time </span><span class="token operator">>=</span><span class="token plain"> </span><span class="token number">1633078800000</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> event_time </span><span class="token operator"><=</span><span class="token plain"> </span><span class="token number">1633089600000</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> </span><span class="token number">1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token number">2</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ORDER</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> </span><span class="token number">1</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LIMIT</span><span class="token plain"> </span><span class="token number">100</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="execution-plan"></a>Execution Plan<a class="hash-link" 
href="#execution-plan" title="Direct link to heading">#</a></h3><p>The sample execution plan for this query is as shown in the figure below:</p><p><img src="https://www.datocms-assets.com/75153/1661700642-execution-plan.png" alt="Sample query execution plan" title="Sample query execution plan"></p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="time-and-space-complexity"></a>Time and Space complexity:<a class="hash-link" href="#time-and-space-complexity" title="Direct link to heading">#</a></h3><p>Let us say there are M entities, R rows returned from servers, and N time buckets. The data is gapfilled time bucket by time bucket to limit the broker memory usage to O(M + N + R). When the data is gapfilled for a time bucket, it will be aggregated and stored in the final result (which has N slots). The previous values for each of the M entities are maintained in memory and carried forward as the gapfilling is performed in sequence. The time complexity is O(M * N) where M is the number of entities and N is the number of time buckets.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor" id="challenges"></a>Challenges<a class="hash-link" href="#challenges" title="Direct link to heading">#</a></h3><p><img src="https://www.datocms-assets.com/75153/1661700716-challenges.png" alt="Sample server challenges graph" title="Sample server challenges graph"></p><p>As the time-series datasets are enormous and partitioned, it's hard to get answers to the following questions:</p><ul><li>How many different entities exist within the query time frame. In the temporal partition scheme demonstrated above, a server/partition may not know the answer.</li><li>What's the previously observed value for entities especially for the first data points in a time bucket where previous time buckets don’t exist in the same server.</li></ul><p>For the scenario shown in the figure above, server2 may not know about the circle entity, as there are no events for the circle in Server2. 
It would also not know the last observed value for the square entity from the beginning of the time bucket till the first observed value timestamp within the partition.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="the-future-work"></a>The Future Work<a class="hash-link" href="#the-future-work" title="Direct link to heading">#</a></h2><p>When doing the gapfill for one or a few entities, there might not be too much data. But when we deal with a large dataset that has multiple entities queried over a long date range without any filtering, this gets tricky. Since gapfill happens at the pinot broker, it will become very slow and the broker will become a bottleneck. The raw data transferred from servers to brokers would be enormous. Data explodes when interpolated. Parallelism is limited as the single broker instance is handling the query.</p><p>The next step of the gapfill project is to remove the pinot broker as a bottleneck. The gapfill logic will be pushed down to the servers and be running where the data live. 
This will reduce the data transmission and increase the parallelism and performance of gapfill.</p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/interpolation">interpolation</a><a class="margin-horiz--sm" href="/blog/tags/gapfilling">gapfilling</a></div><div class="col text--right"><a aria-label="Read more about GapFill Function For Time-Series Datasets In Pinot" href="/blog/2022/08/02/GapFill-Function-For-Time-Series-Datasets-In-Pinot"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2022/04/04/Announcing-Apache-Pinot-0-10">Announcing Apache Pinot 0.10</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-04-04T00:00:00.000Z">April 4, 2022</time> · 5 min read</div><div class="avatar margin-vert--md"><a href="https://twitter.com/ApachePinot" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img src="https://pinot.apache.org/authors/pinot_team.jpg" alt="Apache Pinot Engineering Team"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://twitter.com/ApachePinot" target="_blank" rel="noopener noreferrer">Apache Pinot Engineering Team</a></div><small class="avatar__subtitle">Apache Pinot Engineering Team</small></div></div></header><div class="markdown"><p>We are excited to announce the release this week of Apache Pinot 0.10. |
| Apache Pinot is a real-time distributed datastore designed to answer OLAP queries with high throughput and low latency.</p><p>This release is cut from commit <a href="https://github.com/apache/pinot/commit/fd9c58a11ed16d27109baefcee138eea30132ad3" target="_blank" rel="noopener noreferrer">fd9c58a11ed16d27109baefcee138eea30132ad3</a>. |
| You can find a full list of everything included in the <a href="https://docs.pinot.apache.org/basics/releases/0.10.0" target="_blank" rel="noopener noreferrer">release notes</a>.</p><p>Let’s have a look at some of the changes, with the help of the batch <a href="https://docs.pinot.apache.org/basics/getting-started/running-pinot-in-docker" target="_blank" rel="noopener noreferrer">QuickStart configuration</a>.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="query-plans"></a>Query Plans<a class="hash-link" href="#query-plans" title="Direct link to heading">#</a></h2><p>Amrish Lal implemented the <code>EXPLAIN PLAN</code> clause, which returns the execution plan that will be chosen by the Pinot Query Engine. |
| This lets us see what the query is likely to do without actually having to run it.</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">EXPLAIN</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">PLAN</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FOR</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> baseballStats</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> league </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'NL'</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run this query, we'll see the following 
results:</p><table><thead><tr><th>Operator</th><th>Operator_Id</th><th>Parent_Id</th></tr></thead><tbody><tr><td>BROKER_REDUCE(limit:10)</td><td>0</td><td>-1</td></tr><tr><td>COMBINE_SELECT</td><td>1</td><td>0</td></tr><tr><td>SELECT(selectList:AtBatting, G_old, baseOnBalls, caughtStealing, doules, groundedIntoDoublePlays, hits, hitsByPitch, homeRuns, intentionalWalks, league, numberOfGames, numberOfGamesAsBatter, playerID, playerName, playerStint, runs, runsBattedIn, sacrificeFlies, sacrificeHits, stolenBases, strikeouts, teamID, tripples, yearID)</td><td>2</td><td>1</td></tr><tr><td>TRANSFORM_PASSTHROUGH(AtBatting, G_old, baseOnBalls, caughtStealing, doules, groundedIntoDoublePlays, hits, hitsByPitch, homeRuns, intentionalWalks, league, numberOfGames, numberOfGamesAsBatter, playerID, playerName, playerStint, runs, runsBattedIn, sacrificeFlies, sacrificeHits, stolenBases, strikeouts, teamID, tripples, yearID)</td><td>3</td><td>2</td></tr><tr><td>PROJECT(homeRuns, playerStint, groundedIntoDoublePlays, numberOfGames, AtBatting, stolenBases, tripples, hitsByPitch, teamID, numberOfGamesAsBatter, strikeouts, sacrificeFlies, caughtStealing, baseOnBalls, playerName, doules, league, yearID, hits, runsBattedIn, G_old, sacrificeHits, intentionalWalks, runs, playerID)</td><td>4</td><td>3</td></tr><tr><td>FILTER_FULL_SCAN(operator:EQ,predicate:league = 'NL')</td><td>5</td><td>4</td></tr></tbody></table><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="filter-clauses-for-aggregates"></a>FILTER Clauses for Aggregates<a class="hash-link" href="#filter-clauses-for-aggregates" title="Direct link to heading">#</a></h2><p>Atri Sharma added the filter clause for aggregates. |
| This feature makes it possible to write queries like this:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">SUM</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">homeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> FILTER</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> league </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'NL'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> nlHomeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">SUM</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">homeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> FILTER</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">WHERE</span><span class="token plain"> league </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'AL'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> alHomeRuns</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> baseballStats</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run this query, we'll see the following output:</p><table><thead><tr><th>nlHomeRuns</th><th>alHomeRuns</th></tr></thead><tbody><tr><td>135486</td><td>135990</td></tr></tbody></table><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="greatest-and-least"></a>greatest and least<a class="hash-link" href="#greatest-and-least" title="Direct link to heading">#</a></h2><p>Richard Startin added the <code>greatest</code> and <code>least</code> functions:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> playerID</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> least</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token 
number">5.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">max</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">homeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> homeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> greatest</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token number">5.0</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token function" style="color:rgb(80, 250, 123)">max</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">hits</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">AS</span><span class="token plain"> hits</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> baseballStats</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> league </span><span class="token 
operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'NL'</span><span class="token plain"> </span><span class="token operator">AND</span><span class="token plain"> teamID </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'SFN'</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">GROUP</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> playerID</span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">LIMIT</span><span class="token plain"> </span><span class="token number">5</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run this query, we'll see the following output:</p><table><thead><tr><th>playerID</th><th>homeRuns</th><th>hits</th></tr></thead><tbody><tr><td>ramirju01</td><td>0</td><td>5</td></tr><tr><td>milneed01</td><td>4</td><td>54</td></tr><tr><td>testani01</td><td>0</td><td>5</td></tr><tr><td>shawbo01</td><td>0</td><td>8</td></tr><tr><td>vogelry01</td><td>0</td><td>12</td></tr></tbody></table><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="distinctcountsmarthll"></a>DistinctCountSmartHLL<a class="hash-link" href="#distinctcountsmarthll" title="Direct link to heading">#</a></h2><p> Xiaotian (Jackie) Jiang added the <code>DistinctCountSmartHLL</code> aggregation function, which automatically converts the Set to HyperLogLog if the set size grows too big to protect the servers from running out of memory:</p><div 
class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI sql"><pre tabindex="0" class="prism-code language-sql codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> DISTINCTCOUNTSMARTHLL</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">homeRuns</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">'hllLog2m=8;hllConversionThreshold=10'</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> baseballStats</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>If we run this query, we'll see the following output:</p><table><thead><tr><th>distinctcountsmarthll(homeRuns)</th></tr></thead><tbody><tr><td>66</td></tr></tbody></table><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="ui-updates"></a>UI updates<a class="hash-link" href="#ui-updates" title="Direct link to heading">#</a></h2><p>There were also a bunch of updates to the Pinot Data Explorer, by Sanket Shah and Johan Adami.</p><p>The display of reported size and estimated size is now in a human readable format:</p><p><img alt="Human readable sizes" src="/assets/images/human-readable-sizes-b8c4009dd53d23da3b8637963827a8de.png"></p><p>Fixes for the following issues:</p><ul><li>Error messages weren't showing on the UI when an invalid operation is 
attempted:</li></ul><p><img alt="A backwards incompatible attempted schema change" src="/assets/images/backwards-incompatible-99886dcd0be55a8100a7d6c5f3da3bda.png"></p><ul><li>Query console goes blank on syntax error.</li><li>Query console cannot show query result when multiple columns have the same name.</li><li>Adding extra fields after <code>SELECT *</code> would throw a NullPointerException.</li><li>Some queries were returning <code>--</code> instead of <code>0</code>.</li><li>Query console couldn't show the query result if multiple columns had the same name.</li><li>Pinot Dashboard tenant view showing the incorrect amount of servers and brokers.</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="realtimetooffline-task"></a>RealTimeToOffline Task<a class="hash-link" href="#realtimetooffline-task" title="Direct link to heading">#</a></h2><p>Xiaotian (Jackie) Jiang made some fixes to the <a href="https://dev.startree.ai/docs/pinot/recipes/real-time-offline-job" target="_blank" rel="noopener noreferrer">RealTimeToOffline job</a> to handle time gaps and proceed to the next time window when no segment matches the current one.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="empty-quickstart"></a>Empty QuickStart<a class="hash-link" href="#empty-quickstart" title="Direct link to heading">#</a></h2><p>Kenny Bastani added an empty QuickStart command, which lets you quickly spin up an empty Pinot cluster:</p><div class="codeBlockContainer_J+bg"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#F8F8F2;background-color:#282A36"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#F8F8F2"><span class="token function" style="color:rgb(80, 250, 123)">docker</span><span class="token plain"> run </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">8000</span><span class="token plain">:8000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -p </span><span class="token number">9000</span><span class="token plain">:9000 </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> apachepinot/pinot:0.10.0 QuickStart </span><span class="token punctuation" style="color:rgb(248, 248, 242)">\</span><span class="token plain"></span></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> -type empty</span></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>You can then ingest your own dataset without needing to worry about spinning up each of the Pinot components individually.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="data-ingestion"></a>Data Ingestion<a class="hash-link" href="#data-ingestion" title="Direct link to heading">#</a></h2><ul><li><p>Richard Startin fixed some issues with real-time ingestion where consumption of messages would stop if a bad batch of messages was consumed from Kafka.</p></li><li><p>Mohemmad Zaid Khan added the BoundedColumnValue partition function, which partitions segments based on column values.</p></li><li><p>Xiaobing Li added the fixed name segment generator, which can be used when you want to replace a specific existing segment.</p></li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="other-changes"></a>Other changes<a class="hash-link" href="#other-changes" title="Direct link to heading">#</a></h2><ul><li>Richard Startin set LZ4 compression as the default for all metrics 
fields.</li><li>Mark Needham added the <code>ST_Within</code> geospatial function.</li><li>Rong Rong fixed a bug where query stats wouldn't show if there was an error processing the query (e.g. if the query timed out).</li><li>Prashant Pandey fixed the query engine to handle extra columns added to a <code>SELECT *</code> statement.</li><li>Richard Startin added support for forward indexes on JSON columns.</li><li>Rong Rong added the GRPC broker request handler so that data can be streamed back from the server to the broker when processing queries.</li><li>deemoliu made it possible to add a default strategy when using the <a href="https://dev.startree.ai/docs/pinot/recipes/upserts-partial" target="_blank" rel="noopener noreferrer">partial upsert feature</a>.</li><li>Jeff Moszuti added support for the <code>TIMESTAMP</code> data type in the <a href="https://docs.pinot.apache.org/operators/configuration-recommendation-engine" target="_blank" rel="noopener noreferrer">configuration recommendation engine</a>.</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="dependency-updates"></a>Dependency updates<a class="hash-link" href="#dependency-updates" title="Direct link to heading">#</a></h2><p>The following dependencies were updated:</p><ul><li>async-http-client because the library moved to a different organization.</li><li>RoaringBitmap to 0.9.25</li><li>JsonPath to 2.7.0</li><li>Kafka to 2.8.1</li><li>Prometheus to 0.16.1</li></ul><h2><a aria-hidden="true" tabindex="-1" class="anchor" id="resources"></a>Resources<a class="hash-link" href="#resources" title="Direct link to heading">#</a></h2><p>If you want to try out Apache Pinot, the following resources will help you get started:</p><ul><li>Download page: <a href="https://pinot.apache.org/download/" target="_blank" rel="noopener noreferrer">https://pinot.apache.org/download/</a></li><li>Getting started: <a href="https://docs.pinot.apache.org/getting-started" target="_blank" rel="noopener 
noreferrer">https://docs.pinot.apache.org/getting-started</a></li><li>Apache Pinot Recipes: <a href="https://dev.startree.ai/docs/pinot/recipes/" target="_blank" rel="noopener noreferrer">https://dev.startree.ai/docs/pinot/recipes/</a></li><li>Join our Slack channel: <a href="https://communityinviter.com/apps/apache-pinot/apache-pinot" target="_blank" rel="noopener noreferrer">https://communityinviter.com/apps/apache-pinot/apache-pinot</a></li><li>See our upcoming events: <a href="https://www.meetup.com/apache-pinot" target="_blank" rel="noopener noreferrer">https://www.meetup.com/apache-pinot</a></li><li>Follow us on Twitter: <a href="https://twitter.com/startreedata" target="_blank" rel="noopener noreferrer">https://twitter.com/startreedata</a></li><li>Subscribe to our YouTube channel: <a href="https://www.youtube.com/c/StarTree" target="_blank" rel="noopener noreferrer">https://www.youtube.com/c/StarTree</a></li></ul></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" href="/blog/tags/user-facing-analytics">User-Facing Analytics</a><a class="margin-horiz--sm" href="/blog/tags/releases">Releases</a></div><div class="col text--right"><a aria-label="Read more about Announcing Apache Pinot 0.10" href="/blog/2022/04/04/Announcing-Apache-Pinot-0-10"><b>Read More</b></a></div></footer></article><article class="margin-bottom--xl"><header><h2 class="blogPostTitle_d4p0"><a href="/blog/2021/06/16/LinkedIn-TextAnalytics">Text analytics on LinkedIn Talent Insights using Apache Pinot</a></h2><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2021-06-16T00:00:00.000Z">June 16, 2021</time> · One min read</div><div class="avatar margin-vert--md"><a href="https://engineering.linkedin.com/blog/topic/pinot" target="_blank" rel="noopener 
noreferrer" class="avatar__photo-link avatar__photo"><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/Linkedin_icon.svg/512px-Linkedin_icon.svg.png" alt="LinkedIn"></a><div class="avatar__intro"><div class="avatar__name"><a href="https://engineering.linkedin.com/blog/topic/pinot" target="_blank" rel="noopener noreferrer">LinkedIn</a></div><small class="avatar__subtitle">LinkedIn Engineering Team</small></div></div></header><div class="markdown"><p>LinkedIn Talent Insights (LTI) is a platform that helps organizations understand the external labor market and their internal workforce, and enables the long term success of their employees. Users of LTI have the flexibility to construct searches using the various facets of the LinkedIn Economic Graph (skills, titles, location, company, etc.).</p><p><a href="https://engineering.linkedin.com/blog/2021/text-analytics-on-linkedin-talent-insights-using-apache-pinot" target="_blank" rel="noopener noreferrer">Read More at https://engineering.linkedin.com/blog/2021/text-analytics-on-linkedin-talent-insights-using-apache-pinot</a></p><p><img src="https://content.linkedin.com/content/dam/engineering/site-assets/images/blog/posts/2021/06/ltipinot6.png" alt="Text analytics on LinkedIn Talent Insights using Apache Pinot"></p></div><footer class="row docusaurus-mt-lg"><div class="col"><b>Tags:</b><a class="margin-horiz--sm" href="/blog/tags/pinot">Pinot</a><a class="margin-horiz--sm" href="/blog/tags/linked-in">LinkedIn</a><a class="margin-horiz--sm" href="/blog/tags/data">Data</a><a class="margin-horiz--sm" href="/blog/tags/text-analytics">Text analytics</a><a class="margin-horiz--sm" href="/blog/tags/real-time-data-platform">real-time data platform</a><a class="margin-horiz--sm" href="/blog/tags/realtime">Realtime</a><a class="margin-horiz--sm" href="/blog/tags/third-eye">ThirdEye</a><a class="margin-horiz--sm" href="/blog/tags/analytics">Analytics</a><a class="margin-horiz--sm" 
href="/blog/tags/user-facing-analytics">User-Facing Analytics</a></div><div class="col text--right"><a aria-label="Read more about Text analytics on LinkedIn Talent Insights using Apache Pinot" href="/blog/2021/06/16/LinkedIn-TextAnalytics"><b>Read More</b></a></div></footer></article></main></div></div></div><footer class="footer"><div class="container"><div class="row footer__links"><div class="col footer__col"><h4 class="footer__title">About</h4><ul class="footer__items"><li class="footer__item"><a href="https://docs.pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="footer__link-item">What is Apache Pinot?</a></li><li class="footer__item"><a class="footer__link-item" href="/who_uses">Who uses Apache Pinot?</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/pinot-components" target="_blank" rel="noopener noreferrer" class="footer__link-item">Components</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/basics/architecture" target="_blank" rel="noopener noreferrer" class="footer__link-item">Architecture</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/plugins/plugin-architecture" target="_blank" rel="noopener noreferrer" class="footer__link-item">Plugins Architecture</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Integrations</h4><ul class="footer__items"><li class="footer__item"><a href="https://docs.pinot.apache.org/integrations/trino" target="_blank" rel="noopener noreferrer" class="footer__link-item">Trino</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/integrations/presto" target="_blank" rel="noopener noreferrer" class="footer__link-item">Presto</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/integrations/superset" target="_blank" rel="noopener noreferrer" class="footer__link-item">Superset</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/integrations/thirdeye" target="_blank" 
rel="noopener noreferrer" class="footer__link-item">ThirdEye</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Docs</h4><ul class="footer__items"><li class="footer__item"><a href="https://docs.pinot.apache.org/getting-started" target="_blank" rel="noopener noreferrer" class="footer__link-item">Getting Started</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/pinot-components" target="_blank" rel="noopener noreferrer" class="footer__link-item">Pinot Components</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/users" target="_blank" rel="noopener noreferrer" class="footer__link-item">User Guide</a></li><li class="footer__item"><a href="https://docs.pinot.apache.org/operators/operating-pinot" target="_blank" rel="noopener noreferrer" class="footer__link-item">Administration</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Community</h4><ul class="footer__items"><li class="footer__item"><a href="https://join.slack.com/t/apache-pinot/shared_invite/zt-5z7pav2f-yYtjZdVA~EDmrGkho87Vzw" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack</a></li><li class="footer__item"><a href="https://github.com/apache/pinot" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub</a></li><li class="footer__item"><a href="https://twitter.com/ApachePinot" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter</a></li><li class="footer__item"><a href="mailto:dev-subscribe@pinot.apache.org?Subject=SubscribeToPinot" target="_blank" rel="noopener noreferrer" class="footer__link-item">Mailing List</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Apache</h4><ul class="footer__items"><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events</a></li><li class="footer__item"><a 
href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks</a></li><li class="footer__item"><a href="https://www.apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License</a></li><li class="footer__item"><a href="https://www.apache.org/security" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship</a></li><li class="footer__item"><a href="https://www.apache.org" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://pinot.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_94kH"><img src="/img/logo.svg" alt="Apache Pinot™" class="themedImage_TMUO themedImage--light_4Vu1 footer__logo"><img src="/img/logo.svg" alt="Apache Pinot™" class="themedImage_TMUO themedImage--dark_uzRr footer__logo"></a></div><div class="footerCopyright_-piB">Copyright © 2024 The Apache Software Foundation.<br>Apache Pinot, Pinot, Apache, the Apache feather logo, and the Apache Pinot project logo are registered trademarks of The Apache Software Foundation.<br><br>This page has references to third party software - Presto, PrestoDB, ThirdEye, Trino, TrinoDB, that are not part of the Apache Software Foundation and are not covered under the Apache License.</div></div></div></footer></div> |
| <script src="/assets/js/runtime~main.fa80698b.js"></script> |
| <script src="/assets/js/main.271ac9bf.js"></script> |
| </body> |
| </html> |