blob: 53b61b6f3cae7c89080cb295cb50e3a22604b736 [file] [log] [blame]
<!DOCTYPE html><html lang="en-us" class="__variable_1fc36d scroll-smooth"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" href="/_next/static/media/6905431624c34d00-s.p.woff2" as="font" crossorigin="" type="font/woff2"/><link rel="preload" as="image" href="https://www.datocms-assets.com/75153/1673273173-image4.png"/><link rel="preload" as="image" href="https://www.datocms-assets.com/75153/1673273248-image3.png"/><link rel="preload" as="image" href="https://www.datocms-assets.com/75153/1673273272-pinot_0-11-de-duplication-diagram_1-v2.png"/><link rel="preload" as="image" href="https://www.datocms-assets.com/75153/1673273289-pinot_0-11-de-duplication-diagram_2-v3.png"/><link rel="stylesheet" href="/_next/static/css/9e925a33b1acdac1.css" crossorigin="" data-precedence="next"/><link rel="stylesheet" href="/_next/static/css/c130d1629644f070.css" crossorigin="" data-precedence="next"/><link rel="preload" as="script" fetchPriority="low" href="/_next/static/chunks/webpack-dde39ac7c1b4eb4b.js" crossorigin=""/><script src="/_next/static/chunks/fd9d1056-f172993f20f9bb67.js" async="" crossorigin=""></script><script src="/_next/static/chunks/472-928e738895d89765.js" async="" crossorigin=""></script><script src="/_next/static/chunks/main-app-344a87763a0866a6.js" async="" crossorigin=""></script><script src="/_next/static/chunks/326-3a90a6443b9c824c.js" async=""></script><script src="/_next/static/chunks/980-6e243f9cd384c7d2.js" async=""></script><script src="/_next/static/chunks/702-a2bf9fe707814b79.js" async=""></script><script src="/_next/static/chunks/app/layout-776a485845c720ef.js" async=""></script><script src="/_next/static/chunks/413-f9f40b83f7bb3f22.js" async=""></script><script src="/_next/static/chunks/app/blog/%5B...slug%5D/page-502e08b6677b55da.js" async=""></script><link rel="preload" href="https://www.googletagmanager.com/gtag/js?id=G-ZXG79NJEBY" as="script"/><title>Apache Pinot™ 0.11 - Deduplication on Real-Time Tables | Apache Pinot™</title><meta name="description" content="Learn about the deduplication for the real-time tables feature in Apache Pinot"/><meta name="robots" content="index, follow"/><meta name="googlebot" content="index, follow, max-video-preview:-1, max-image-preview:large, max-snippet:-1"/><link rel="canonical" href="https://pinot.apache.org/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables"/><link rel="alternate" type="application/rss+xml" href="https://pinot.apache.org/feed.xml"/><meta property="og:title" content="Apache Pinot™ 0.11 - Deduplication on Real-Time Tables"/><meta property="og:description" content="Learn about the deduplication for the real-time tables feature in Apache Pinot"/><meta property="og:url" content="https://pinot.apache.org/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables"/><meta property="og:site_name" content="Apache Pinot™"/><meta property="og:locale" content="en_US"/><meta property="og:image" content="https://pinot.apache.org/static/images/twitter-card.png"/><meta property="og:type" content="article"/><meta property="article:published_time" content="2023-01-29T00:00:00.000Z"/><meta property="article:modified_time" content="2023-01-29T00:00:00.000Z"/><meta property="article:author" content="Mark Needham"/><meta name="twitter:card" content="summary_large_image"/><meta name="twitter:title" content="Apache Pinot™ 0.11 - Deduplication on Real-Time Tables"/><meta name="twitter:description" content="Learn about the deduplication for the real-time tables feature in Apache Pinot"/><meta name="twitter:image" content="https://pinot.apache.org/static/images/twitter-card.png"/><meta name="next-size-adjust"/><meta http-equiv="Content-Security-Policy" content="default-src &#x27;self&#x27;;script-src &#x27;self&#x27; &#x27;unsafe-eval&#x27; &#x27;unsafe-inline&#x27; giscus.app analytics.umami.is www.youtube.com www.googletagmanager.com www.google-analytics.com;style-src &#x27;self&#x27; &#x27;unsafe-inline&#x27;;img-src * blob: data:;media-src *.s3.amazonaws.com;connect-src *;font-src &#x27;self&#x27;;frame-src www.youtube.com youtube.com giscus.app youtu.be https://www.youtube.com https://youtube.com;"/><link rel="apple-touch-icon" sizes="76x76" href="/static/favicons/apple-touch-icon.png"/><link rel="icon" type="image/png" sizes="32x32" href="/static/favicons/favicon-32x32.png"/><link rel="icon" type="image/png" sizes="16x16" href="/static/favicons/favicon-16x16.png"/><link rel="manifest" href="/static/favicons/site.webmanifest"/><link rel="mask-icon" href="/static/favicons/safari-pinned-tab.svg" color="#5bbad5"/><meta name="msapplication-TileColor" content="#000000"/><meta name="theme-color" media="(prefers-color-scheme: light)" content="#fff"/><meta name="theme-color" media="(prefers-color-scheme: dark)" content="#000"/><link rel="alternate" type="application/rss+xml" href="/feed.xml"/><script src="/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body class="bg-white text-black antialiased dark:bg-gray-950 dark:text-white"><script>!function(){try{var d=document.documentElement,c=d.classList;c.remove('light','dark');var e=localStorage.getItem('theme');if('system'===e||(!e&&false)){var t='(prefers-color-scheme: dark)',m=window.matchMedia(t);if(m.media!==t||m.matches){d.style.colorScheme = 'dark';c.add('dark')}else{d.style.colorScheme = 'light';c.add('light')}}else if(e){c.add(e|| '')}else{c.add('light')}if(e==='light'||e==='dark'||!e)d.style.colorScheme=e||'light'}catch(e){}}()</script><div class="mx-auto flex max-w-screen-customDesktop flex-col justify-between font-sans"><div class="inset-x-0 top-0 z-50 flex text-center text-base sm:text-left"><div class="flex w-full flex-col items-center justify-center bg-sky-200 pt-1 md:flex-row md:pt-0"><div class="flex flex-wrap items-center justify-center md:justify-start">🎉🎉🎉 Announcing the release of Apache Pinot 1.1.0</div><div class="flex items-center justify-center"><a href="https://github.com/apache/pinot/releases/tag/release-1.1.0" target="_blank" class="inline-flex items-center justify-center whitespace-nowrap rounded-md ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 underline-offset-4 hover:text-vine-120 h-10 px-4 py-2 mr-2 text-base font-semibold leading-tight text-vine-100">learn more<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-arrow-right mr-2 h-5 w-5"><path d="M5 12h14"></path><path d="m12 5 7 7-7 7"></path></svg></a></div></div></div><header class="border-b-1 flex items-center justify-between border-b px-5 py-3 md:px-[4rem] md:py-4"><div class="flex"><a aria-label="" href="/"><div class="flex items-center justify-between"><div class=""><svg xmlns="http://www.w3.org/2000/svg" width="120" height="48" fill="none"><g fill="#C7154A" clip-path="url(#logo_svg__a)"><path d="M42.99 18.448c1.032-.553 2.21-.831 3.535-.831 1.542 0 2.938.38 4.187 1.14 1.248.76 2.236 1.841 2.965 3.241.728 1.402 1.091 3.025 1.091 4.872s-.363 3.482-1.091 4.903c-.729 1.424-1.717 2.525-2.965 3.307-1.25.782-2.645 1.173-4.187 1.173-1.325 0-2.493-.271-3.503-.815-1.01-.543-1.83-1.226-2.46-2.053v14.612H36V17.912h4.562v2.606c.586-.825 1.395-1.515 2.426-2.068l.002-.002m6.452 5.605c-.445-.793-1.032-1.395-1.76-1.808a4.72 4.72 0 0 0-2.362-.618c-.847 0-1.602.211-2.33.635-.728.423-1.315 1.038-1.76 1.841-.445.804-.668 1.749-.668 2.835 0 1.087.221 2.032.668 2.835.445.804 1.032 1.417 1.76 1.842a4.557 4.557 0 0 0 2.33.635 4.57 4.57 0 0 0 2.362-.652c.728-.435 1.313-1.053 1.76-1.856.445-.804.668-1.76.668-2.867s-.223-2.025-.668-2.818v-.004M62.947 17.912v18.051h-4.562V17.912h4.562m.551-6.079a2.833 2.833 0 1 1-5.666 0 2.833 2.833 0 0 1 5.666 0M82.954 19.687c1.325 1.358 1.988 3.253 1.988 5.685v10.59H80.38v-9.97c0-1.434-.358-2.537-1.075-3.307-.717-.772-1.695-1.157-2.933-1.157-1.239 0-2.254.387-2.982 1.157-.728.772-1.091 1.873-1.091 3.307v9.97h-4.562V17.91h4.562v2.248a6.322 6.322 0 0 1 2.33-1.841c.944-.445 1.981-.669 3.111-.669 2.15 0 3.889.68 5.214 2.037v.002M92.892 35.098c-1.39-.77-2.482-1.861-3.275-3.275-.794-1.411-1.19-3.041-1.19-4.888s.406-3.475 1.221-4.888a8.502 8.502 0 0 1 3.34-3.275c1.412-.772 2.987-1.157 4.725-1.157 1.739 0 3.312.387 4.725 1.157a8.5 8.5 0 0 1 3.34 3.275c.815 1.411 1.222 3.041 1.222 4.888s-.418 3.475-1.255 4.888a8.708 8.708 0 0 1-3.388 3.275c-1.424.772-3.014 1.157-4.774 1.157-1.76 0-3.301-.385-4.691-1.157m7.021-3.421c.729-.402 1.309-1.005 1.744-1.809.435-.803.651-1.781.651-2.933 0-1.715-.451-3.035-1.351-3.958-.902-.924-2.004-1.385-3.307-1.385s-2.395.461-3.275 1.385c-.88.923-1.32 2.243-1.32 3.958 0 1.715.428 3.035 1.287 3.958.858.924 1.938 1.385 3.241 1.385.825 0 1.602-.2 2.33-.603v.002M115.96 21.658v8.734c0 .608.147 1.048.44 1.32.293.271.787.406 1.482.406H120v3.845h-2.867c-3.845 0-5.766-1.868-5.766-5.605v-8.7h-2.15v-3.746h2.15V13l4.595-1v5.912h4.04v3.746h-4.042M20.03 46.757l-5.538-1.385A1.97 1.97 0 0 1 13 43.46v-5.462c0-.841.349-1.601.907-2.146a12.212 12.212 0 0 0 6.975-3.644c2.602-2.731 3.627-6.578 2.882-10.251L21 9h-4V4a1 1 0 0 0-2 0v7a1 1 0 0 1-2 0v-1a1 1 0 0 0-2 0v6.758a4.489 4.489 0 0 1 2.694-.755c2.278.095 4.156 1.934 4.297 4.21a4.501 4.501 0 0 1-6.992 4.029V29a1 1 0 0 1-2 0V7a1 1 0 0 0-2 0v2h-4L.237 21.957c-.745 3.675.279 7.52 2.882 10.251a12.202 12.202 0 0 0 6.975 3.644c.558.545.907 1.305.907 2.146V43.4c0 .938-.639 1.757-1.55 1.985l-5.48 1.37c-.57.143-.97.655-.97 1.243h18c0-.588-.4-1.1-.97-1.243v.002"></path><path d="M13.5 23a2.5 2.5 0 1 0 0-5 2.5 2.5 0 0 0 0 5M8 5a1 1 0 1 0 0-2 1 1 0 0 0 0 2M12 8a1 1 0 1 0 0-2 1 1 0 0 0 0 2M16 2a1 1 0 1 0 0-2 1 1 0 0 0 0 2"></path></g><defs><clipPath id="logo_svg__a"><path fill="#fff" d="M0 0h120v48H0z"></path></clipPath></defs></svg></div><div class="hidden h-6 text-2xl font-semibold sm:block"></div></div></a><div class="ml-[4.5rem] flex items-center gap-12 text-lg leading-5"><a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org" class="hidden sm:block font-medium text-gray-900 dark:text-gray-100">Docs</a><a class="hidden sm:block font-medium text-gray-900 dark:text-gray-100" href="/download">Download</a><a class="hidden sm:block font-medium text-gray-900 dark:text-gray-100" href="/powered-by">Powered by</a><a class="hidden sm:block font-medium text-gray-900 dark:text-gray-100" href="/blog">Blog</a></div></div><button aria-label="Toggle Menu" class="sm:hidden"><svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="none"><mask id="menu_svg__a" width="32" height="32" x="0" y="0" maskUnits="userSpaceOnUse" style="mask-type:alpha"><path fill="#D9D9D9" d="M0 0h32v32H0z"></path></mask><g mask="url(#menu_svg__a)"><path fill="#201F1F" d="M4.667 23.513v-2h22.666v2H4.667Zm0-6.513v-2h22.666v2H4.667Zm0-6.513v-2h22.666v2H4.667Z"></path></g></svg></button><div class="hidden gap-3 sm:flex"><button aria-label="Search"><svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="h-6 w-6 text-gray-900 dark:text-gray-100"><path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-5.197-5.197m0 0A7.5 7.5 0 105.196 5.196a7.5 7.5 0 0010.607 10.607z"></path></svg></button><a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/pinot" class="inline-flex items-center justify-center whitespace-nowrap font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 border border-input bg-background hover:bg-accent hover:text-accent-foreground rounded-md px-3 py-2 text-base"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="currentColor" class="mr-2"><g clip-path="url(#github_svg__a)"><path fill-rule="evenodd" d="M12.01 0C5.369 0 0 5.5 0 12.304c0 5.44 3.44 10.043 8.212 11.673.597.122.815-.265.815-.59 0-.286-.02-1.264-.02-2.283-3.34.734-4.036-1.466-4.036-1.466-.537-1.426-1.332-1.793-1.332-1.793-1.094-.754.08-.754.08-.754 1.212.082 1.849 1.263 1.849 1.263 1.073 1.874 2.803 1.345 3.5 1.019.098-.795.417-1.345.755-1.65-2.665-.285-5.468-1.345-5.468-6.07 0-1.345.477-2.445 1.232-3.3-.119-.306-.537-1.57.12-3.26 0 0 1.014-.326 3.3 1.263.98-.27 1.989-.407 3.003-.408 1.014 0 2.048.143 3.002.408 2.287-1.59 3.301-1.263 3.301-1.263.657 1.69.239 2.954.12 3.26.775.855 1.232 1.955 1.232 3.3 0 4.725-2.803 5.764-5.488 6.07.438.387.815 1.12.815 2.281 0 1.65-.02 2.975-.02 3.382 0 .326.22.713.816.59C20.56 22.347 24 17.744 24 12.305 24.02 5.5 18.63 0 12.01 0" clip-rule="evenodd"></path></g><defs><clipPath id="github_svg__a"><path fill="#fff" d="M0 0h24v24H0z"></path></clipPath></defs></svg>3.5k</a><button class="inline-flex items-center justify-center whitespace-nowrap font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 text-primary-foreground hover:bg-vine-120 rounded-md bg-vine-100 px-6 py-2 text-base"><a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org/basics/getting-started">Get Started</a></button></div></header><main><script type="application/ld+json">{"@context":"https://schema.org","@type":"BlogPosting","headline":"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables","datePublished":"2023-01-29T00:00:00.000Z","dateModified":"2023-01-29T00:00:00.000Z","description":"Learn about the deduplication for the real-time tables feature in Apache Pinot","image":"/static/images/twitter-card.png","url":"https://pinot.apache.org/blog/2023-01-29-Apache-Pinot-Deduplication-on-Real-Time-Tables","author":[{"@type":"Person","name":"Mark Needham"}]}</script><section class=" px-5 pt-10 md:px-[13.313rem] md:py-16"><div class="fixed bottom-8 right-8 hidden flex-col gap-3 md:hidden"><button aria-label="Scroll To Top" class="rounded-full bg-gray-200 p-2 text-gray-500 transition-all hover:bg-gray-300 dark:bg-gray-700 dark:text-gray-400 dark:hover:bg-gray-600"><svg class="h-5 w-5" viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M3.293 9.707a1 1 0 010-1.414l6-6a1 1 0 011.414 0l6 6a1 1 0 01-1.414 1.414L11 5.414V17a1 1 0 11-2 0V5.414L4.707 9.707a1 1 0 01-1.414 0z" clip-rule="evenodd"></path></svg></button></div><article class=""><div class="mx-auto lg:flex"><div class="lg:pr-12"><header class="pt-6 md:pr-10"><h1 class="text-4xl font-semibold">Apache Pinot™ 0.11 - Deduplication on Real-Time Tables</h1><p class="pt-2 text-lg">By: <!-- -->Mark Needham</p><p class="py-2 text-sm">January 29th, 2023<!-- --> • <!-- -->8 min read</p></header><div class="flex flex-col lg:flex-row"><main class=""><div class="prose max-w-[45rem] pb-8 pt-10 dark:prose-invert"><p>Last fall, the Apache Pinot community released version <a target="_blank" rel="noopener noreferrer" href="https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4">0.11.0</a>, which has lots of goodies for you to play with.</p><p>In this post, we’re going to learn about the <a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org/basics/data-import/dedup">deduplication for the real-time tables feature</a>.</p><h2 id="why-do-we-need-deduplication-on-real-time-tables"><a href="#why-do-we-need-deduplication-on-real-time-tables" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>Why do we need deduplication on real-time tables?</h2><p>This feature was built to deal with duplicate data in the streaming platform.</p><p>Users have previously used the upsert feature to de-duplicate data, but this has the following limitations:</p><ul><li>It forces us to keep redundant records that we don’t want to keep, which increases overall storage costs.</li><li>We can’t yet use the StarTree index with upserts, so the speed benefits we get from using that indexing technique are lost.</li></ul><h2 id="how-does-dedup-differ-from-upserts"><a href="#how-does-dedup-differ-from-upserts" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>How does dedup differ from upserts?</h2><p>Both upserts and dedup keep track of multiple documents that have the same primary key. They differ as follows:</p><ul><li>Upserts are used when we want to get the latest copy of a document for a given primary key. It’s likely that some or all of the other fields will be different. Pinot stores all documents it receives, but when querying it will only return the latest document for each primary key.</li><li>Dedup is used when we know that multiple documents with the same primary key are identical. Only the first event received for a given primary key is stored in Pinot—any future events with the same primary key are thrown away.</li></ul><p>Let’s see how to use this functionality with help from a worked example.</p><h2 id="setting-up-apache-kafka-and-apache-pinot"><a href="#setting-up-apache-kafka-and-apache-pinot" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>Setting up Apache Kafka and Apache Pinot</h2><p>We’re going to spin up Kafka and Pinot using the following Docker Compose config:</p><div class="relative"><pre><code class="code-highlight language-yaml"><span class="code-line"><span class="token atrule key">version</span><span class="token punctuation">:</span> <span class="token string">&#x27;3&#x27;</span>
</span><span class="code-line"><span class="token atrule key">services</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">zookeeper</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">image</span><span class="token punctuation">:</span> zookeeper<span class="token punctuation">:</span>3.8.0
</span><span class="code-line"> <span class="token atrule key">hostname</span><span class="token punctuation">:</span> zookeeper
</span><span class="code-line"> <span class="token atrule key">container_name</span><span class="token punctuation">:</span> zookeeper<span class="token punctuation">-</span>dedup<span class="token punctuation">-</span>blog
</span><span class="code-line"> <span class="token atrule key">ports</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> <span class="token string">&#x27;2181:2181&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">environment</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">ZOOKEEPER_CLIENT_PORT</span><span class="token punctuation">:</span> <span class="token number">2181</span>
</span><span class="code-line"> <span class="token atrule key">ZOOKEEPER_TICK_TIME</span><span class="token punctuation">:</span> <span class="token number">2000</span>
</span><span class="code-line"> <span class="token atrule key">networks</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> dedup_blog
</span><span class="code-line"> <span class="token atrule key">kafka</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">image</span><span class="token punctuation">:</span> wurstmeister/kafka<span class="token punctuation">:</span>latest
</span><span class="code-line"> <span class="token atrule key">restart</span><span class="token punctuation">:</span> unless<span class="token punctuation">-</span>stopped
</span><span class="code-line"> <span class="token atrule key">container_name</span><span class="token punctuation">:</span> <span class="token string">&#x27;kafka-dedup-blog&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">ports</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> <span class="token string">&#x27;9092:9092&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">expose</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> <span class="token string">&#x27;9093&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">depends_on</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> zookeeper
</span><span class="code-line"> <span class="token atrule key">environment</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">KAFKA_ZOOKEEPER_CONNECT</span><span class="token punctuation">:</span> zookeeper<span class="token punctuation">-</span>dedup<span class="token punctuation">-</span>blog<span class="token punctuation">:</span>2181/kafka
</span><span class="code-line"> <span class="token atrule key">KAFKA_BROKER_ID</span><span class="token punctuation">:</span> <span class="token number">0</span>
</span><span class="code-line"> <span class="token atrule key">KAFKA_ADVERTISED_HOST_NAME</span><span class="token punctuation">:</span> kafka<span class="token punctuation">-</span>dedup<span class="token punctuation">-</span>blog
</span><span class="code-line"> <span class="token atrule key">KAFKA_ADVERTISED_LISTENERS</span><span class="token punctuation">:</span> PLAINTEXT<span class="token punctuation">:</span>//kafka<span class="token punctuation">-</span>dedup<span class="token punctuation">-</span>blog<span class="token punctuation">:</span><span class="token number">9093</span><span class="token punctuation">,</span>OUTSIDE<span class="token punctuation">:</span>//localhost<span class="token punctuation">:</span><span class="token number">9092</span>
</span><span class="code-line"> <span class="token atrule key">KAFKA_LISTENERS</span><span class="token punctuation">:</span> PLAINTEXT<span class="token punctuation">:</span>//0.0.0.0<span class="token punctuation">:</span><span class="token number">9093</span><span class="token punctuation">,</span>OUTSIDE<span class="token punctuation">:</span>//0.0.0.0<span class="token punctuation">:</span><span class="token number">9092</span>
</span><span class="code-line"> <span class="token atrule key">KAFKA_LISTENER_SECURITY_PROTOCOL_MAP</span><span class="token punctuation">:</span> PLAINTEXT<span class="token punctuation">:</span>PLAINTEXT<span class="token punctuation">,</span>OUTSIDE<span class="token punctuation">:</span>PLAINTEXT
</span><span class="code-line"> <span class="token atrule key">networks</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> dedup_blog
</span><span class="code-line"> <span class="token atrule key">pinot-controller</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">image</span><span class="token punctuation">:</span> apachepinot/pinot<span class="token punctuation">:</span>0.11.0<span class="token punctuation">-</span>arm64
</span><span class="code-line"> <span class="token atrule key">command</span><span class="token punctuation">:</span> <span class="token string">&#x27;QuickStart -type EMPTY&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">container_name</span><span class="token punctuation">:</span> <span class="token string">&#x27;pinot-controller-dedup-blog&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">volumes</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> ./config<span class="token punctuation">:</span>/config
</span><span class="code-line"> <span class="token atrule key">restart</span><span class="token punctuation">:</span> unless<span class="token punctuation">-</span>stopped
</span><span class="code-line"> <span class="token atrule key">ports</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> <span class="token string">&#x27;9000:9000&#x27;</span>
</span><span class="code-line"> <span class="token atrule key">networks</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token punctuation">-</span> dedup_blog
</span><span class="code-line"><span class="token atrule key">networks</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">dedup_blog</span><span class="token punctuation">:</span>
</span><span class="code-line"> <span class="token atrule key">name</span><span class="token punctuation">:</span> dedup_blog
</span></code></pre></div><p>We can spin up our infrastructure using the following command:</p><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line"><span class="token function">docker-compose</span> up
</span></code></pre></div><h2 id="data-generation"><a href="#data-generation" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>Data Generation</h2><p>Let’s imagine that we want to ingest events generated by the following Python script:</p><div class="relative"><pre><code class="code-highlight language-python"><span class="code-line"><span class="token keyword">import</span> datetime
</span><span class="code-line"><span class="token keyword">import</span> uuid
</span><span class="code-line"><span class="token keyword">import</span> random
</span><span class="code-line"><span class="token keyword">import</span> json
</span><span class="code-line">
</span><span class="code-line"><span class="token keyword">while</span> <span class="token boolean">True</span><span class="token punctuation">:</span>
</span><span class="code-line"> ts <span class="token operator">=</span> datetime<span class="token punctuation">.</span>datetime<span class="token punctuation">.</span>now<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>strftime<span class="token punctuation">(</span><span class="token string">&quot;%Y-%m-%dT%H:%M:%S.%fZ&quot;</span><span class="token punctuation">)</span>
</span><span class="code-line"> <span class="token builtin">id</span> <span class="token operator">=</span> <span class="token builtin">str</span><span class="token punctuation">(</span>uuid<span class="token punctuation">.</span>uuid4<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
</span><span class="code-line"> count <span class="token operator">=</span> random<span class="token punctuation">.</span>randint<span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">,</span> <span class="token number">1000</span><span class="token punctuation">)</span>
</span><span class="code-line"> <span class="token keyword">print</span><span class="token punctuation">(</span>
</span><span class="code-line"> json<span class="token punctuation">.</span>dumps<span class="token punctuation">(</span><span class="token punctuation">{</span><span class="token string">&quot;tsString&quot;</span><span class="token punctuation">:</span> ts<span class="token punctuation">,</span> <span class="token string">&quot;uuid&quot;</span><span class="token punctuation">:</span> <span class="token builtin">id</span><span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token number">3</span><span class="token punctuation">]</span><span class="token punctuation">,</span> <span class="token string">&quot;count&quot;</span><span class="token punctuation">:</span> count<span class="token punctuation">}</span><span class="token punctuation">)</span>
</span><span class="code-line"> <span class="token punctuation">)</span>
</span></code></pre></div><p>We can view the data generated by this script by pasting the above code into a file called datagen.py and then running the following command:</p><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line">python datagen.py <span class="token operator"><span class="token file-descriptor important">2</span>&gt;</span>/dev/null <span class="token operator">|</span> <span class="token function">head</span> <span class="token parameter variable">-n3</span> <span class="token operator">|</span> jq
</span></code></pre></div><p>We’ll see the following output:</p><div class="relative"><pre><code class="code-highlight language-json"><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;tsString&quot;</span><span class="token operator">:</span> <span class="token string">&quot;2023-01-03T10:59:17.355081Z&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;uuid&quot;</span><span class="token operator">:</span> <span class="token string">&quot;f94&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;count&quot;</span><span class="token operator">:</span> <span class="token number">541</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;tsString&quot;</span><span class="token operator">:</span> <span class="token string">&quot;2023-01-03T10:59:17.355125Z&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;uuid&quot;</span><span class="token operator">:</span> <span class="token string">&quot;057&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;count&quot;</span><span class="token operator">:</span> <span class="token number">96</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;tsString&quot;</span><span class="token operator">:</span> <span class="token string">&quot;2023-01-03T10:59:17.355141Z&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;uuid&quot;</span><span class="token operator">:</span> <span class="token string">&quot;d7b&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;count&quot;</span><span class="token operator">:</span> <span class="token number">288</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span></code></pre></div><p>If we generate only 25,000 events, we’ll get some duplicates, which we can see by running the following command:</p><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line">python datagen.py <span class="token operator"><span class="token file-descriptor important">2</span>&gt;</span>/dev/null <span class="token operator">|</span>
</span><span class="code-line">jq <span class="token parameter variable">-r</span> <span class="token string">&#x27;.uuid&#x27;</span> <span class="token operator">|</span> <span class="token function">head</span> <span class="token parameter variable">-n25000</span> <span class="token operator">|</span> <span class="token function">uniq</span> <span class="token parameter variable">-cd</span>
</span></code></pre></div><p>The results of running that command are shown below:</p><div class="relative"><pre><code class="code-highlight language-text"><span class="code-line">2 3a2
</span><span class="code-line">2 a04
</span><span class="code-line">2 433
</span><span class="code-line">2 291
</span><span class="code-line">2 d73
</span></code></pre></div><p>We’re going to pipe this data into a Kafka stream called events, like this:</p><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line">python datagen.py <span class="token operator"><span class="token file-descriptor important">2</span>&gt;</span>/dev/null <span class="token operator">|</span>
</span><span class="code-line">jq <span class="token parameter variable">-cr</span> <span class="token parameter variable">--arg</span> sep 😊 <span class="token string">&#x27;[.uuid, tostring] | join($sep)&#x27;</span> <span class="token operator">|</span>
</span><span class="code-line">kcat <span class="token parameter variable">-P</span> <span class="token parameter variable">-b</span> localhost:9092 <span class="token parameter variable">-t</span> events -K😊
</span></code></pre></div><p>The construction of the key/value structure comes from Robin Moffatt’s <a target="_blank" rel="noopener noreferrer" href="https://rmoff.net/2020/09/30/setting-key-value-when-piping-from-jq-to-kafkacat/">excellent blog post</a>. Since that blog post was written, kcat has started supporting multi byte separators, which is why we can use a smiley face to separate our key and value.</p><h2 id="pinot-schematable-config"><a href="#pinot-schematable-config" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>Pinot Schema/Table Config</h2><p>Next, we’re going to create a Pinot table and schema with the same name. Let’s first define a schema:</p><div class="relative"><pre><code class="code-highlight language-json"><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;schemaName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dimensionFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span><span class="token punctuation">{</span> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;uuid&quot;</span><span class="token punctuation">,</span> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;STRING&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;metricFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span><span class="token punctuation">{</span> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;count&quot;</span><span class="token punctuation">,</span> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;INT&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dateTimeFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span>
</span><span class="code-line"> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;TIMESTAMP&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;format&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1:MILLISECONDS:EPOCH&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;granularity&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1:MILLISECONDS&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">]</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span></code></pre></div><p>Note that the timestamp field is called ts and not tsString, as it is in the Kafka stream. We’re going to transform the DateTime string value held in that field into a proper timestamp using a transformation function.</p><p>Our table config is described below:</p><div class="relative"><pre><code class="code-highlight language-json"><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;tableName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tableType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;REALTIME&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;segmentsConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;timeColumnName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;schemaName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;replication&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;replicasPerPartition&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tableIndexConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;loadMode&quot;</span><span class="token operator">:</span> <span class="token string">&quot;MMAP&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;streamConfigs&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;streamType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;kafka&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.topic.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.broker.list&quot;</span><span class="token operator">:</span> <span class="token string">&quot;kafka-dedup-blog:9093&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.type&quot;</span><span class="token operator">:</span> <span class="token string">&quot;lowlevel&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.prop.auto.offset.reset&quot;</span><span class="token operator">:</span> <span class="token string">&quot;smallest&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.factory.class.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.decoder.class.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;ingestionConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;transformConfigs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span>
</span><span class="code-line"> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;columnName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;transformFunction&quot;</span><span class="token operator">:</span> <span class="token string">&quot;FromDateTime(tsString, &#x27;YYYY-MM-dd&#x27;&#x27;T&#x27;&#x27;HH:mm:ss.SSSSSS&#x27;&#x27;Z&#x27;&#x27;&#x27;)&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">]</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tenants&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span><span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;metadata&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span><span class="token punctuation">}</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span></code></pre></div><p>Let’s create the table using the following command:</p><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line"><span class="token function">docker</span> run <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">--network</span> dedup_blog <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-v</span> <span class="token constant environment">$PWD</span>/pinot/config:/config <span class="token punctuation">\</span>
</span><span class="code-line"> apachepinot/pinot:0.11.0-arm64 AddTable <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-schemaFile</span> /config/schema.json <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-tableConfigFile</span> /config/table.json <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-controllerHost</span> <span class="token string">&quot;pinot-controller-dedup-blog&quot;</span> <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-exec</span>
</span></code></pre></div><p>Now we can navigate to <a target="_blank" rel="noopener noreferrer" href="http://localhost:9000/">http://localhost:9000</a> and run a query that will return a count of the number of each uuid:</p><div class="relative"><pre><code class="code-highlight language-sql"><span class="code-line"><span class="token keyword">select</span> uuid<span class="token punctuation">,</span> <span class="token function">count</span><span class="token punctuation">(</span><span class="token operator">*</span><span class="token punctuation">)</span>
</span><span class="code-line"><span class="token keyword">from</span> events
</span><span class="code-line"><span class="token keyword">group</span> <span class="token keyword">by</span> uuid
</span><span class="code-line"><span class="token keyword">order</span> <span class="token keyword">by</span> <span class="token function">count</span><span class="token punctuation">(</span><span class="token operator">*</span><span class="token punctuation">)</span>
</span><span class="code-line"><span class="token keyword">limit</span> <span class="token number">10</span>
</span></code></pre></div><p>The results of this query are shown below:</p><p><img alt="Sample Apache Pinot real-time query response stats including duplicates" src="https://www.datocms-assets.com/75153/1673273173-image4.png" title="Sample Apache Pinot real-time query response stats including duplicates"/></p><p>We can see loads of duplicates!</p><p>Now let’s add a table and schema that uses the de-duplicate feature, starting with the schema:</p><div class="relative"><pre><code class="code-highlight language-json"><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;schemaName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events_dedup&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;primaryKeyColumns&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span><span class="token string">&quot;uuid&quot;</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dimensionFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span><span class="token punctuation">{</span> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;uuid&quot;</span><span class="token punctuation">,</span> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;STRING&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;metricFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span><span class="token punctuation">{</span> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;count&quot;</span><span class="token punctuation">,</span> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;INT&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dateTimeFieldSpecs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span>
</span><span class="code-line"> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dataType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;TIMESTAMP&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;format&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1:MILLISECONDS:EPOCH&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;granularity&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1:MILLISECONDS&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">]</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span></code></pre></div><p>The main difference between this schema and the events schema is that we need to specify a primary key. This key can be any number of fields, but in this case, we’re only using the uuid field.</p><p>Next, the table config:</p><div class="relative"><pre><code class="code-highlight language-json"><span class="code-line"><span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;tableName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events_dedup&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tableType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;REALTIME&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;segmentsConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;timeColumnName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;schemaName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events_dedup&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;replication&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;replicasPerPartition&quot;</span><span class="token operator">:</span> <span class="token string">&quot;1&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tableIndexConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;loadMode&quot;</span><span class="token operator">:</span> <span class="token string">&quot;MMAP&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;streamConfigs&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;streamType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;kafka&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.topic.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;events&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.broker.list&quot;</span><span class="token operator">:</span> <span class="token string">&quot;kafka-dedup-blog:9093&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.type&quot;</span><span class="token operator">:</span> <span class="token string">&quot;lowlevel&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.prop.auto.offset.reset&quot;</span><span class="token operator">:</span> <span class="token string">&quot;smallest&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.consumer.factory.class.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;stream.kafka.decoder.class.name&quot;</span><span class="token operator">:</span> <span class="token string">&quot;org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;routing&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span> <span class="token property">&quot;instanceSelectorType&quot;</span><span class="token operator">:</span> <span class="token string">&quot;strictReplicaGroup&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;dedupConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span> <span class="token property">&quot;dedupEnabled&quot;</span><span class="token operator">:</span> <span class="token boolean">true</span><span class="token punctuation">,</span> <span class="token property">&quot;hashFunction&quot;</span><span class="token operator">:</span> <span class="token string">&quot;NONE&quot;</span> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;ingestionConfig&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;transformConfigs&quot;</span><span class="token operator">:</span> <span class="token punctuation">[</span>
</span><span class="code-line"> <span class="token punctuation">{</span>
</span><span class="code-line"> <span class="token property">&quot;columnName&quot;</span><span class="token operator">:</span> <span class="token string">&quot;ts&quot;</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;transformFunction&quot;</span><span class="token operator">:</span> <span class="token string">&quot;FromDateTime(tsString, &#x27;YYYY-MM-dd&#x27;&#x27;T&#x27;&#x27;HH:mm:ss.SSSSSS&#x27;&#x27;Z&#x27;&#x27;&#x27;)&quot;</span>
</span><span class="code-line"> <span class="token punctuation">}</span>
</span><span class="code-line"> <span class="token punctuation">]</span>
</span><span class="code-line"> <span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;tenants&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span><span class="token punctuation">}</span><span class="token punctuation">,</span>
</span><span class="code-line"> <span class="token property">&quot;metadata&quot;</span><span class="token operator">:</span> <span class="token punctuation">{</span><span class="token punctuation">}</span>
</span><span class="code-line"><span class="token punctuation">}</span>
</span></code></pre></div><p>The changes to notice here are:</p><ul><li><code>&quot;dedupConfig&quot;: {&quot;dedupEnabled&quot;: true, &quot;hashFunction&quot;: &quot;NONE&quot;}</code> - This enables the feature and indicates that we won’t use a hash function on our primary key.</li><li><code>&quot;routing&quot;: {&quot;instanceSelectorType&quot;: &quot;strictReplicaGroup&quot;}</code> - This makes sure that all segments of the same partition are served from the same server to ensure data consistency across the segments.</li></ul><div class="relative"><pre><code class="code-highlight language-bash"><span class="code-line"> <span class="token function">docker</span> run <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">--network</span> dedup_blog <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-v</span> <span class="token constant environment">$PWD</span>/pinot/config:/config <span class="token punctuation">\</span>
</span><span class="code-line"> apachepinot/pinot:0.11.0-arm64 AddTable <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-schemaFile</span> /config/schema-dedup.json <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-tableConfigFile</span> /config/table-dedup.json <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-controllerHost</span> <span class="token string">&quot;pinot-controller-dedup-blog&quot;</span> <span class="token punctuation">\</span>
</span><span class="code-line"> <span class="token parameter variable">-exec</span>
</span><span class="code-line">
</span><span class="code-line"> <span class="token keyword">select</span> uuid, count<span class="token punctuation">(</span>*<span class="token punctuation">)</span>
</span><span class="code-line"> from events_dedup
</span><span class="code-line"> group by uuid
</span><span class="code-line"> order by count<span class="token punctuation">(</span>*<span class="token punctuation">)</span>
</span><span class="code-line"> limit <span class="token number">10</span>
</span></code></pre></div><p><img alt="Sample Apache Pinot real-time query response stats deduplicated" src="https://www.datocms-assets.com/75153/1673273248-image3.png" title="Sample Apache Pinot real-time query response stats deduplicated"/></p><p>We have every combination of hex values (16^3=4096) and no duplicates! Pinot’s de-duplication feature has done its job.</p><h2 id="how-does-it-work"><a href="#how-does-it-work" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>How does it work? </h2><p>When we’re not using the deduplication feature, events are ingested from our streaming platform into Pinot, as shown in the diagram below:</p><p><img alt="Events ingested from a streaming platform into Apache Pinot without using the deduplication feature" src="https://www.datocms-assets.com/75153/1673273272-pinot_0-11-de-duplication-diagram_1-v2.png" title="Events ingested from a streaming platform into Apache Pinot without using the deduplication feature"/></p><p>When de-dup is enabled, we have to check whether records can be ingested, as shown in the diagram below:</p><p><img alt="Events ingested from a streaming platform into Apache Pinot using the deduplication feature" src="https://www.datocms-assets.com/75153/1673273289-pinot_0-11-de-duplication-diagram_2-v3.png" title="Events ingested from a streaming platform into Apache Pinot using the deduplication feature"/></p><p>De-dup works out whether a primary key has already been ingested by using an in memory map of (primary key -&gt; corresponding segment reference).</p><p>We need to take that into account when using this feature, otherwise, we’ll end up using all the available memory on the Pinot Server. Below are some tips for using this feature:</p><ul><li>Try to use a simple primary key type and avoid composite keys. If you don’t have a simple primary key, consider using one of the available hash functions to reduce the space taken up.</li><li>Create more partitions in the streaming platform than you might otherwise create. The number of partitions determines the partition numbers of the Pinot table. The more partitions you have in the streaming platform, the more Pinot servers you can distribute the Pinot table to, and the more horizontally scalable the table will be.</li></ul><h2 id="summary"><a href="#summary" aria-hidden="true" tabindex="-1"><span class="icon icon-link"></span></a>Summary</h2><p>This feature makes it easier to ensure that we don’t end up with duplicate data in our Apache Pinot estate.</p><p>So give it a try and let us know how you get on. If you have any questions about this feature, feel free to join us on <a target="_blank" rel="noopener noreferrer" href="https://stree.ai/slack">Slack</a>, where we’ll be happy to help you out.</p><p>And if you’re interested in how this feature was implemented, you can look at the <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/pinot/pull/8708">pull request on GitHub</a>.</p></div></main></div></div><aside class="mt-10 hidden border-l-2 pl-5 lg:sticky lg:top-1 lg:block lg:h-full"><section class="sticky top-0 mb-4 w-[15.375rem]"><div class="flex flex-col space-y-1.5 pb-3"><h3 class="text-sm font-semibold leading-snug text-neutral-500 dark:text-neutral-100">Table of Contents</h3></div><nav class="flex items-center self-start" aria-label="Table of Contents"><ol class="list-none space-y-3"><li class="text-sm leading-tight font-bold"><a href="#why-do-we-need-deduplication-on-real-time-tables">Why do we need deduplication on real-time tables?</a></li><li class="text-sm font-normal leading-tight"><a href="#how-does-dedup-differ-from-upserts">How does dedup differ from upserts?</a></li><li class="text-sm font-normal leading-tight"><a href="#setting-up-apache-kafka-and-apache-pinot">Setting up Apache Kafka and Apache Pinot</a></li><li class="text-sm font-normal leading-tight"><a href="#data-generation">Data Generation</a></li><li class="text-sm font-normal leading-tight"><a href="#pinot-schematable-config">Pinot Schema/Table Config</a></li><li class="text-sm font-normal leading-tight"><a href="#how-does-it-work">How does it work? </a></li><li class="text-sm font-normal leading-tight"><a href="#summary">Summary</a></li></ol></nav></section></aside></div></article></section></main><footer class="border-t bg-sky-100 px-5 py-10 md:px-[6.75rem] md:pb-10 md:pt-16"><div class="mx-auto flex max-w-7xl flex-wrap justify-between"><div class="flex-shrink-0"><svg xmlns="http://www.w3.org/2000/svg" width="120" height="48" fill="none"><g fill="#C7154A" clip-path="url(#logo_svg__a)"><path d="M42.99 18.448c1.032-.553 2.21-.831 3.535-.831 1.542 0 2.938.38 4.187 1.14 1.248.76 2.236 1.841 2.965 3.241.728 1.402 1.091 3.025 1.091 4.872s-.363 3.482-1.091 4.903c-.729 1.424-1.717 2.525-2.965 3.307-1.25.782-2.645 1.173-4.187 1.173-1.325 0-2.493-.271-3.503-.815-1.01-.543-1.83-1.226-2.46-2.053v14.612H36V17.912h4.562v2.606c.586-.825 1.395-1.515 2.426-2.068l.002-.002m6.452 5.605c-.445-.793-1.032-1.395-1.76-1.808a4.72 4.72 0 0 0-2.362-.618c-.847 0-1.602.211-2.33.635-.728.423-1.315 1.038-1.76 1.841-.445.804-.668 1.749-.668 2.835 0 1.087.221 2.032.668 2.835.445.804 1.032 1.417 1.76 1.842a4.557 4.557 0 0 0 2.33.635 4.57 4.57 0 0 0 2.362-.652c.728-.435 1.313-1.053 1.76-1.856.445-.804.668-1.76.668-2.867s-.223-2.025-.668-2.818v-.004M62.947 17.912v18.051h-4.562V17.912h4.562m.551-6.079a2.833 2.833 0 1 1-5.666 0 2.833 2.833 0 0 1 5.666 0M82.954 19.687c1.325 1.358 1.988 3.253 1.988 5.685v10.59H80.38v-9.97c0-1.434-.358-2.537-1.075-3.307-.717-.772-1.695-1.157-2.933-1.157-1.239 0-2.254.387-2.982 1.157-.728.772-1.091 1.873-1.091 3.307v9.97h-4.562V17.91h4.562v2.248a6.322 6.322 0 0 1 2.33-1.841c.944-.445 1.981-.669 3.111-.669 2.15 0 3.889.68 5.214 2.037v.002M92.892 35.098c-1.39-.77-2.482-1.861-3.275-3.275-.794-1.411-1.19-3.041-1.19-4.888s.406-3.475 1.221-4.888a8.502 8.502 0 0 1 3.34-3.275c1.412-.772 2.987-1.157 4.725-1.157 1.739 0 3.312.387 4.725 1.157a8.5 8.5 0 0 1 3.34 3.275c.815 1.411 1.222 3.041 1.222 4.888s-.418 3.475-1.255 4.888a8.708 8.708 0 0 1-3.388 3.275c-1.424.772-3.014 1.157-4.774 1.157-1.76 0-3.301-.385-4.691-1.157m7.021-3.421c.729-.402 1.309-1.005 1.744-1.809.435-.803.651-1.781.651-2.933 0-1.715-.451-3.035-1.351-3.958-.902-.924-2.004-1.385-3.307-1.385s-2.395.461-3.275 1.385c-.88.923-1.32 2.243-1.32 3.958 0 1.715.428 3.035 1.287 3.958.858.924 1.938 1.385 3.241 1.385.825 0 1.602-.2 2.33-.603v.002M115.96 21.658v8.734c0 .608.147 1.048.44 1.32.293.271.787.406 1.482.406H120v3.845h-2.867c-3.845 0-5.766-1.868-5.766-5.605v-8.7h-2.15v-3.746h2.15V13l4.595-1v5.912h4.04v3.746h-4.042M20.03 46.757l-5.538-1.385A1.97 1.97 0 0 1 13 43.46v-5.462c0-.841.349-1.601.907-2.146a12.212 12.212 0 0 0 6.975-3.644c2.602-2.731 3.627-6.578 2.882-10.251L21 9h-4V4a1 1 0 0 0-2 0v7a1 1 0 0 1-2 0v-1a1 1 0 0 0-2 0v6.758a4.489 4.489 0 0 1 2.694-.755c2.278.095 4.156 1.934 4.297 4.21a4.501 4.501 0 0 1-6.992 4.029V29a1 1 0 0 1-2 0V7a1 1 0 0 0-2 0v2h-4L.237 21.957c-.745 3.675.279 7.52 2.882 10.251a12.202 12.202 0 0 0 6.975 3.644c.558.545.907 1.305.907 2.146V43.4c0 .938-.639 1.757-1.55 1.985l-5.48 1.37c-.57.143-.97.655-.97 1.243h18c0-.588-.4-1.1-.97-1.243v.002"></path><path d="M13.5 23a2.5 2.5 0 1 0 0-5 2.5 2.5 0 0 0 0 5M8 5a1 1 0 1 0 0-2 1 1 0 0 0 0 2M12 8a1 1 0 1 0 0-2 1 1 0 0 0 0 2M16 2a1 1 0 1 0 0-2 1 1 0 0 0 0 2"></path></g><defs><clipPath id="logo_svg__a"><path fill="#fff" d="M0 0h120v48H0z"></path></clipPath></defs></svg></div><div class="flex flex-wrap gap-x-16 gap-y-5 py-8 md:pl-24 md:pr-[21.625rem]"> <div><h5 class="mb-4 text-lg font-semibold">Resources</h5><div class="flex justify-between gap-x-10"><div class="flex flex-col"><a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org/" class="block py-1 text-gray-600 hover:text-gray-900">Docs</a><a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org/getting-started" class="block py-1 text-gray-600 hover:text-gray-900">Getting Started</a><a target="_blank" rel="noopener noreferrer" href="https://docs.pinot.apache.org/integrations/thirdeye" class="block py-1 text-gray-600 hover:text-gray-900">ThirdEye</a></div><div class="flex flex-col"><a class="block py-1 text-gray-600 hover:text-gray-900" href="/powered-by">Company Stories</a><a class="block py-1 text-gray-600 hover:text-gray-900" href="/download">Download</a><a class="block py-1 text-gray-600 hover:text-gray-900" href="/blog">Blog</a></div></div></div><div><h5 class="mb-4 text-lg font-semibold">Apache</h5><div class="flex justify-between gap-x-10"><div class="flex flex-col"><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org" class="block py-1 text-gray-600 hover:text-gray-900">Foundation</a><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org/licenses" class="block py-1 text-gray-600 hover:text-gray-900">License</a><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org/security" class="block py-1 text-gray-600 hover:text-gray-900">Security</a></div><div class="flex flex-col"><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org/foundation/sponsorship.html" class="block py-1 text-gray-600 hover:text-gray-900">Sponsorship</a><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org/events/current-event" class="block py-1 text-gray-600 hover:text-gray-900">Events</a><a target="_blank" rel="noopener noreferrer" href="https://www.apache.org/foundation/thanks.html" class="block py-1 text-gray-600 hover:text-gray-900">Thanks</a></div></div></div></div><div class="mt-4 flex justify-center md:mt-0"><a target="_blank" rel="noopener noreferrer" href="https://join.slack.com/t/apache-pinot/shared_invite/zt-5z7pav2f-yYtjZdVA~EDmrGkho87Vzw" class="mr-4"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-slack fill-gray-900"><rect width="3" height="8" x="13" y="2" rx="1.5"></rect><path d="M19 8.5V10h1.5A1.5 1.5 0 1 0 19 8.5"></path><rect width="3" height="8" x="8" y="14" rx="1.5"></rect><path d="M5 15.5V14H3.5A1.5 1.5 0 1 0 5 15.5"></path><rect width="8" height="3" x="14" y="13" rx="1.5"></rect><path d="M15.5 19H14v1.5a1.5 1.5 0 1 0 1.5-1.5"></path><rect width="8" height="3" x="2" y="8" rx="1.5"></rect><path d="M8.5 5H10V3.5A1.5 1.5 0 1 0 8.5 5"></path></svg></a><a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/pinot"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="currentColor" size="24"><g clip-path="url(#github_svg__a)"><path fill-rule="evenodd" d="M12.01 0C5.369 0 0 5.5 0 12.304c0 5.44 3.44 10.043 8.212 11.673.597.122.815-.265.815-.59 0-.286-.02-1.264-.02-2.283-3.34.734-4.036-1.466-4.036-1.466-.537-1.426-1.332-1.793-1.332-1.793-1.094-.754.08-.754.08-.754 1.212.082 1.849 1.263 1.849 1.263 1.073 1.874 2.803 1.345 3.5 1.019.098-.795.417-1.345.755-1.65-2.665-.285-5.468-1.345-5.468-6.07 0-1.345.477-2.445 1.232-3.3-.119-.306-.537-1.57.12-3.26 0 0 1.014-.326 3.3 1.263.98-.27 1.989-.407 3.003-.408 1.014 0 2.048.143 3.002.408 2.287-1.59 3.301-1.263 3.301-1.263.657 1.69.239 2.954.12 3.26.775.855 1.232 1.955 1.232 3.3 0 4.725-2.803 5.764-5.488 6.07.438.387.815 1.12.815 2.281 0 1.65-.02 2.975-.02 3.382 0 .326.22.713.816.59C20.56 22.347 24 17.744 24 12.305 24.02 5.5 18.63 0 12.01 0" clip-rule="evenodd"></path></g><defs><clipPath id="github_svg__a"><path fill="#fff" d="M0 0h24v24H0z"></path></clipPath></defs></svg></a></div></div><div class="mt-8 border-t border-neutral-300 pt-4 text-left text-sm text-gray-600">Copyright © <!-- -->2024<!-- --> The Apache Software Foundation. Apache Pinot, Pinot, Apache, the Apache feather logo, and the Apache Pinot project logo are registered trademarks of The Apache Software Foundation. This page has references to third party software - Presto, PrestoDB, ThirdEye, Trino, TrinoDB, that are not part of the Apache Software Foundation and are not covered under the Apache License.</div></footer></div><script src="/_next/static/chunks/webpack-dde39ac7c1b4eb4b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/_next/static/media/6905431624c34d00-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/_next/static/css/9e925a33b1acdac1.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:HL[\"/_next/static/css/c130d1629644f070.css\",\"style\",{\"crossOrigin\":\"\"}]\n"])</script><script>self.__next_f.push([1,"5:I[3728,[],\"\"]\n7:I[9928,[],\"\"]\n8:I[7821,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"702\",\"static/chunks/702-a2bf9fe707814b79.js\",\"185\",\"static/chunks/app/layout-776a485845c720ef.js\"],\"ThemeProviders\"]\n9:I[3994,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"702\",\"static/chunks/702-a2bf9fe707814b79.js\",\"185\",\"static/chunks/app/layout-776a485845c720ef.js\"],\"\"]\na:I[9640,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js"])</script><script>self.__next_f.push([1,"\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"702\",\"static/chunks/702-a2bf9fe707814b79.js\",\"185\",\"static/chunks/app/layout-776a485845c720ef.js\"],\"AlgoliaSearchProvider\"]\nb:I[7975,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"702\",\"static/chunks/702-a2bf9fe707814b79.js\",\"185\",\"static/chunks/app/layout-776a485845c720ef.js\"],\"\"]\nc:I[6954,[],\"\"]\nd:I[7264,[],\"\"]\ne:I[8326,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"413\",\"static/chunks/413-f9f40b83f7bb3f22.js\""])</script><script>self.__next_f.push([1,",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"797\",\"static/chunks/app/blog/%5B...slug%5D/page-502e08b6677b55da.js\"],\"\"]\n11:T9fe,"])</script><script>self.__next_f.push([1,"M42.99 18.448c1.032-.553 2.21-.831 3.535-.831 1.542 0 2.938.38 4.187 1.14 1.248.76 2.236 1.841 2.965 3.241.728 1.402 1.091 3.025 1.091 4.872s-.363 3.482-1.091 4.903c-.729 1.424-1.717 2.525-2.965 3.307-1.25.782-2.645 1.173-4.187 1.173-1.325 0-2.493-.271-3.503-.815-1.01-.543-1.83-1.226-2.46-2.053v14.612H36V17.912h4.562v2.606c.586-.825 1.395-1.515 2.426-2.068l.002-.002m6.452 5.605c-.445-.793-1.032-1.395-1.76-1.808a4.72 4.72 0 0 0-2.362-.618c-.847 0-1.602.211-2.33.635-.728.423-1.315 1.038-1.76 1.841-.445.804-.668 1.749-.668 2.835 0 1.087.221 2.032.668 2.835.445.804 1.032 1.417 1.76 1.842a4.557 4.557 0 0 0 2.33.635 4.57 4.57 0 0 0 2.362-.652c.728-.435 1.313-1.053 1.76-1.856.445-.804.668-1.76.668-2.867s-.223-2.025-.668-2.818v-.004M62.947 17.912v18.051h-4.562V17.912h4.562m.551-6.079a2.833 2.833 0 1 1-5.666 0 2.833 2.833 0 0 1 5.666 0M82.954 19.687c1.325 1.358 1.988 3.253 1.988 5.685v10.59H80.38v-9.97c0-1.434-.358-2.537-1.075-3.307-.717-.772-1.695-1.157-2.933-1.157-1.239 0-2.254.387-2.982 1.157-.728.772-1.091 1.873-1.091 3.307v9.97h-4.562V17.91h4.562v2.248a6.322 6.322 0 0 1 2.33-1.841c.944-.445 1.981-.669 3.111-.669 2.15 0 3.889.68 5.214 2.037v.002M92.892 35.098c-1.39-.77-2.482-1.861-3.275-3.275-.794-1.411-1.19-3.041-1.19-4.888s.406-3.475 1.221-4.888a8.502 8.502 0 0 1 3.34-3.275c1.412-.772 2.987-1.157 4.725-1.157 1.739 0 3.312.387 4.725 1.157a8.5 8.5 0 0 1 3.34 3.275c.815 1.411 1.222 3.041 1.222 4.888s-.418 3.475-1.255 4.888a8.708 8.708 0 0 1-3.388 3.275c-1.424.772-3.014 1.157-4.774 1.157-1.76 0-3.301-.385-4.691-1.157m7.021-3.421c.729-.402 1.309-1.005 1.744-1.809.435-.803.651-1.781.651-2.933 0-1.715-.451-3.035-1.351-3.958-.902-.924-2.004-1.385-3.307-1.385s-2.395.461-3.275 1.385c-.88.923-1.32 2.243-1.32 3.958 0 1.715.428 3.035 1.287 3.958.858.924 1.938 1.385 3.241 1.385.825 0 1.602-.2 2.33-.603v.002M115.96 21.658v8.734c0 .608.147 1.048.44 1.32.293.271.787.406 1.482.406H120v3.845h-2.867c-3.845 0-5.766-1.868-5.766-5.605v-8.7h-2.15v-3.746h2.15V13l4.595-1v5.912h4.04v3.746h-4.042M20.03 46.757l-5.538-1.385A1.97 1.97 0 0 1 13 43.46v-5.462c0-.841.349-1.601.907-2.146a12.212 12.212 0 0 0 6.975-3.644c2.602-2.731 3.627-6.578 2.882-10.251L21 9h-4V4a1 1 0 0 0-2 0v7a1 1 0 0 1-2 0v-1a1 1 0 0 0-2 0v6.758a4.489 4.489 0 0 1 2.694-.755c2.278.095 4.156 1.934 4.297 4.21a4.501 4.501 0 0 1-6.992 4.029V29a1 1 0 0 1-2 0V7a1 1 0 0 0-2 0v2h-4L.237 21.957c-.745 3.675.279 7.52 2.882 10.251a12.202 12.202 0 0 0 6.975 3.644c.558.545.907 1.305.907 2.146V43.4c0 .938-.639 1.757-1.55 1.985l-5.48 1.37c-.57.143-.97.655-.97 1.243h18c0-.588-.4-1.1-.97-1.243v.002"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/_next/static/css/9e925a33b1acdac1.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L5\",null,{\"buildId\":\"rmcKjFZ3e9kKdH1iJwCIQ\",\"assetPrefix\":\"\",\"initialCanonicalUrl\":\"/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\",\"initialTree\":[\"\",{\"children\":[\"blog\",{\"children\":[[\"slug\",\"2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\",\"c\"],{\"children\":[\"__PAGE__?{\\\"slug\\\":[\\\"2023\\\",\\\"01\\\",\\\"29\\\",\\\"Apache-Pinot-Deduplication-on-Real-Time-Tables\\\"]}\",{}]}]}]},\"$undefined\",\"$undefined\",true],\"initialHead\":[false,\"$L6\"],\"globalErrorComponent\":\"$7\",\"children\":[null,[\"$\",\"html\",null,{\"lang\":\"en-us\",\"className\":\"__variable_1fc36d scroll-smooth\",\"suppressHydrationWarning\":true,\"children\":[[\"$\",\"head\",null,{\"children\":[[\"$\",\"meta\",null,{\"httpEquiv\":\"Content-Security-Policy\",\"content\":\"default-src 'self';script-src 'self' 'unsafe-eval' 'unsafe-inline' giscus.app analytics.umami.is www.youtube.com www.googletagmanager.com www.google-analytics.com;style-src 'self' 'unsafe-inline';img-src * blob: data:;media-src *.s3.amazonaws.com;connect-src *;font-src 'self';frame-src www.youtube.com youtube.com giscus.app youtu.be https://www.youtube.com https://youtube.com;\"}],[\"$\",\"link\",null,{\"rel\":\"apple-touch-icon\",\"sizes\":\"76x76\",\"href\":\"/static/favicons/apple-touch-icon.png\"}],[\"$\",\"link\",null,{\"rel\":\"icon\",\"type\":\"image/png\",\"sizes\":\"32x32\",\"href\":\"/static/favicons/favicon-32x32.png\"}],[\"$\",\"link\",null,{\"rel\":\"icon\",\"type\":\"image/png\",\"sizes\":\"16x16\",\"href\":\"/static/favicons/favicon-16x16.png\"}],[\"$\",\"link\",null,{\"rel\":\"manifest\",\"href\":\"/static/favicons/site.webmanifest\"}],[\"$\",\"link\",null,{\"rel\":\"mask-icon\",\"href\":\"/static/favicons/safari-pinned-tab.svg\",\"color\":\"#5bbad5\"}],[\"$\",\"meta\",null,{\"name\":\"msapplication-TileColor\",\"content\":\"#000000\"}],[\"$\",\"meta\",null,{\"name\":\"theme-color\",\"media\":\"(prefers-color-scheme: light)\",\"content\":\"#fff\"}],[\"$\",\"meta\",null,{\"name\":\"theme-color\",\"media\":\"(prefers-color-scheme: dark)\",\"content\":\"#000\"}],[\"$\",\"link\",null,{\"rel\":\"alternate\",\"type\":\"application/rss+xml\",\"href\":\"/feed.xml\"}]]}],[\"$\",\"body\",null,{\"className\":\"bg-white text-black antialiased dark:bg-gray-950 dark:text-white\",\"children\":[\"$\",\"$L8\",null,{\"children\":[[\"$undefined\",\"$undefined\",\"$undefined\",\"$undefined\",[[\"$\",\"$L9\",null,{\"strategy\":\"afterInteractive\",\"src\":\"https://www.googletagmanager.com/gtag/js?id=G-ZXG79NJEBY\"}],[\"$\",\"$L9\",null,{\"strategy\":\"afterInteractive\",\"id\":\"ga-script\",\"children\":\"\\n window.dataLayer = window.dataLayer || [];\\n function gtag(){dataLayer.push(arguments);}\\n gtag('js', new Date());\\n gtag('config', 'G-ZXG79NJEBY');\\n \"}]]],[\"$\",\"div\",null,{\"className\":\"mx-auto flex max-w-screen-customDesktop flex-col justify-between font-sans\",\"children\":[\"$\",\"$La\",null,{\"algoliaConfig\":{\"appId\":\"CKRA00L2X9\",\"apiKey\":\"6531f8f7783a88d76629190843f1801e\",\"indexName\":\"prod_apache_pinot_docs\"},\"children\":[[\"$\",\"$Lb\",null,{}],[\"$\",\"main\",null,{\"children\":[\"$\",\"$Lc\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$Ld\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[\"$\",\"div\",null,{\"className\":\"flex flex-col items-start justify-start md:mt-24 md:flex-row md:items-center md:justify-center md:space-x-6\",\"children\":[[\"$\",\"div\",null,{\"className\":\"space-x-2 pb-8 pt-6 md:space-y-5\",\"children\":[\"$\",\"h1\",null,{\"className\":\"text-6xl font-extrabold leading-9 tracking-tight text-gray-900 dark:text-gray-100 md:border-r-2 md:px-6 md:text-8xl md:leading-14\",\"children\":\"404\"}]}],[\"$\",\"div\",null,{\"className\":\"max-w-md\",\"children\":[[\"$\",\"p\",null,{\"className\":\"mb-4 text-xl font-bold leading-normal md:text-2xl\",\"children\":\"Sorry we couldn't find this page.\"}],[\"$\",\"p\",null,{\"className\":\"mb-8\",\"children\":\"But dont worry, you can find plenty of other things on our homepage.\"}],[\"$\",\"$Le\",null,{\"href\":\"/\",\"className\":\"focus:shadow-outline-blue inline rounded-lg border border-transparent bg-blue-600 px-4 py-2 text-sm font-medium leading-5 text-white shadow transition-colors duration-150 hover:bg-blue-700 focus:outline-none dark:hover:bg-blue-500\",\"children\":\"Back to homepage\"}]]}]]}],\"notFoundStyles\":[],\"initialChildNode\":[\"$\",\"$Lc\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\",\"blog\",\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$Ld\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":\"$undefined\",\"notFoundStyles\":\"$undefined\",\"initialChildNode\":[\"$\",\"$Lc\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\",\"blog\",\"children\",[\"slug\",\"2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\",\"c\"],\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$Ld\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":\"$undefined\",\"notFoundStyles\":\"$undefined\",\"initialChildNode\":[\"$Lf\",\"$L10\",null],\"childPropSegment\":\"__PAGE__?{\\\"slug\\\":[\\\"2023\\\",\\\"01\\\",\\\"29\\\",\\\"Apache-Pinot-Deduplication-on-Real-Time-Tables\\\"]}\",\"styles\":[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/_next/static/css/c130d1629644f070.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]]}],\"childPropSegment\":[\"slug\",\"2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\",\"c\"],\"styles\":null}],\"childPropSegment\":\"blog\",\"styles\":null}]}],[\"$\",\"footer\",null,{\"className\":\"border-t bg-sky-100 px-5 py-10 md:px-[6.75rem] md:pb-10 md:pt-16\",\"children\":[[\"$\",\"div\",null,{\"className\":\"mx-auto flex max-w-7xl flex-wrap justify-between\",\"children\":[[\"$\",\"div\",null,{\"className\":\"flex-shrink-0\",\"children\":[\"$\",\"svg\",null,{\"xmlns\":\"http://www.w3.org/2000/svg\",\"width\":120,\"height\":48,\"fill\":\"none\",\"children\":[[\"$\",\"g\",null,{\"fill\":\"#C7154A\",\"clipPath\":\"url(#logo_svg__a)\",\"children\":[[\"$\",\"path\",null,{\"d\":\"$11\"}],[\"$\",\"path\",null,{\"d\":\"M13.5 23a2.5 2.5 0 1 0 0-5 2.5 2.5 0 0 0 0 5M8 5a1 1 0 1 0 0-2 1 1 0 0 0 0 2M12 8a1 1 0 1 0 0-2 1 1 0 0 0 0 2M16 2a1 1 0 1 0 0-2 1 1 0 0 0 0 2\"}]]}],[\"$\",\"defs\",null,{\"children\":[\"$\",\"clipPath\",null,{\"id\":\"logo_svg__a\",\"children\":[\"$\",\"path\",null,{\"fill\":\"#fff\",\"d\":\"M0 0h120v48H0z\"}]}]}]]}]}],[\"$\",\"div\",null,{\"className\":\"flex flex-wrap gap-x-16 gap-y-5 py-8 md:pl-24 md:pr-[21.625rem]\",\"children\":[\" \",[[\"$\",\"div\",\"Resources\",{\"children\":[[\"$\",\"h5\",null,{\"className\":\"mb-4 text-lg font-semibold\",\"children\":\"Resources\"}],[\"$\",\"div\",null,{\"className\":\"flex justify-between gap-x-10\",\"children\":[[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://docs.pinot.apache.org/\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Docs\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://docs.pinot.apache.org/getting-started\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Getting Started\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://docs.pinot.apache.org/integrations/thirdeye\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"ThirdEye\"}]]}],[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[[\"$\",\"$Le\",null,{\"href\":\"/powered-by\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Company Stories\"}],[\"$\",\"$Le\",null,{\"href\":\"/download\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Download\"}],[\"$\",\"$Le\",null,{\"href\":\"/blog\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Blog\"}]]}]]}]]}],[\"$\",\"div\",\"Apache\",{\"children\":[[\"$\",\"h5\",null,{\"className\":\"mb-4 text-lg font-semibold\",\"children\":\"Apache\"}],[\"$\",\"div\",null,{\"className\":\"flex justify-between gap-x-10\",\"children\":[[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Foundation\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org/licenses\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"License\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org/security\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Security\"}]]}],[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org/foundation/sponsorship.html\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Sponsorship\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org/events/current-event\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Events\"}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://www.apache.org/foundation/thanks.html\",\"className\":\"block py-1 text-gray-600 hover:text-gray-900\",\"children\":\"Thanks\"}]]}]]}]]}]]]}],[\"$\",\"div\",null,{\"className\":\"mt-4 flex justify-center md:mt-0\",\"children\":[[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://join.slack.com/t/apache-pinot/shared_invite/zt-5z7pav2f-yYtjZdVA~EDmrGkho87Vzw\",\"className\":\"mr-4\",\"children\":[\"$\",\"svg\",null,{\"xmlns\":\"http://www.w3.org/2000/svg\",\"width\":24,\"height\":24,\"viewBox\":\"0 0 24 24\",\"fill\":\"none\",\"stroke\":\"currentColor\",\"strokeWidth\":2,\"strokeLinecap\":\"round\",\"strokeLinejoin\":\"round\",\"className\":\"lucide lucide-slack fill-gray-900\",\"children\":[[\"$\",\"rect\",\"diqz80\",{\"width\":\"3\",\"height\":\"8\",\"x\":\"13\",\"y\":\"2\",\"rx\":\"1.5\"}],[\"$\",\"path\",\"183iwg\",{\"d\":\"M19 8.5V10h1.5A1.5 1.5 0 1 0 19 8.5\"}],[\"$\",\"rect\",\"hqg7r1\",{\"width\":\"3\",\"height\":\"8\",\"x\":\"8\",\"y\":\"14\",\"rx\":\"1.5\"}],[\"$\",\"path\",\"76g71w\",{\"d\":\"M5 15.5V14H3.5A1.5 1.5 0 1 0 5 15.5\"}],[\"$\",\"rect\",\"1kmz0a\",{\"width\":\"8\",\"height\":\"3\",\"x\":\"14\",\"y\":\"13\",\"rx\":\"1.5\"}],[\"$\",\"path\",\"jc4sz0\",{\"d\":\"M15.5 19H14v1.5a1.5 1.5 0 1 0 1.5-1.5\"}],[\"$\",\"rect\",\"1omvl4\",{\"width\":\"8\",\"height\":\"3\",\"x\":\"2\",\"y\":\"8\",\"rx\":\"1.5\"}],[\"$\",\"path\",\"16f3cl\",{\"d\":\"M8.5 5H10V3.5A1.5 1.5 0 1 0 8.5 5\"}],\"$undefined\"]}]}],[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://github.com/apache/pinot\",\"children\":[\"$\",\"svg\",null,{\"xmlns\":\"http://www.w3.org/2000/svg\",\"width\":24,\"height\":24,\"fill\":\"currentColor\",\"size\":24,\"children\":[[\"$\",\"g\",null,{\"clipPath\":\"url(#github_svg__a)\",\"children\":[\"$\",\"path\",null,{\"fillRule\":\"evenodd\",\"d\":\"M12.01 0C5.369 0 0 5.5 0 12.304c0 5.44 3.44 10.043 8.212 11.673.597.122.815-.265.815-.59 0-.286-.02-1.264-.02-2.283-3.34.734-4.036-1.466-4.036-1.466-.537-1.426-1.332-1.793-1.332-1.793-1.094-.754.08-.754.08-.754 1.212.082 1.849 1.263 1.849 1.263 1.073 1.874 2.803 1.345 3.5 1.019.098-.795.417-1.345.755-1.65-2.665-.285-5.468-1.345-5.468-6.07 0-1.345.477-2.445 1.232-3.3-.119-.306-.537-1.57.12-3.26 0 0 1.014-.326 3.3 1.263.98-.27 1.989-.407 3.003-.408 1.014 0 2.048.143 3.002.408 2.287-1.59 3.301-1.263 3.301-1.263.657 1.69.239 2.954.12 3.26.775.855 1.232 1.955 1.232 3.3 0 4.725-2.803 5.764-5.488 6.07.438.387.815 1.12.815 2.281 0 1.65-.02 2.975-.02 3.382 0 .326.22.713.816.59C20.56 22.347 24 17.744 24 12.305 24.02 5.5 18.63 0 12.01 0\",\"clipRule\":\"evenodd\"}]}],[\"$\",\"defs\",null,{\"children\":[\"$\",\"clipPath\",null,{\"id\":\"github_svg__a\",\"children\":[\"$\",\"path\",null,{\"fill\":\"#fff\",\"d\":\"M0 0h24v24H0z\"}]}]}]]}]}]]}]]}],[\"$\",\"div\",null,{\"className\":\"mt-8 border-t border-neutral-300 pt-4 text-left text-sm text-gray-600\",\"children\":[\"Copyright © \",2024,\" The Apache Software Foundation. Apache Pinot, Pinot, Apache, the Apache feather logo, and the Apache Pinot project logo are registered trademarks of The Apache Software Foundation. This page has references to third party software - Presto, PrestoDB, ThirdEye, Trino, TrinoDB, that are not part of the Apache Software Foundation and are not covered under the Apache License.\"]}]]}]]}]}]]}]}]]}],null]}]]\n"])</script><script>self.__next_f.push([1,"12:I[1514,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"413\",\"static/chunks/413-f9f40b83f7bb3f22.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"797\",\"static/chunks/app/blog/%5B...slug%5D/page-502e08b6677b55da.js\"],\"\"]\n13:I[2529,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"413\",\"static/chunks/413-f9f40b83f7bb3f22.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"797\",\"static/chunks/app/blog/%5B...slug%5D/page-502e08b6677b55da.js\"],\"\"]\n14:I[5185,[\"326\",\"static/chunks/326-3a90a6443b9c824c.js\",\"413\",\""])</script><script>self.__next_f.push([1,"static/chunks/413-f9f40b83f7bb3f22.js\",\"980\",\"static/chunks/980-6e243f9cd384c7d2.js\",\"797\",\"static/chunks/app/blog/%5B...slug%5D/page-502e08b6677b55da.js\"],\"\"]\n"])</script><script>self.__next_f.push([1,"10:[[\"$\",\"script\",null,{\"type\":\"application/ld+json\",\"dangerouslySetInnerHTML\":{\"__html\":\"{\\\"@context\\\":\\\"https://schema.org\\\",\\\"@type\\\":\\\"BlogPosting\\\",\\\"headline\\\":\\\"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables\\\",\\\"datePublished\\\":\\\"2023-01-29T00:00:00.000Z\\\",\\\"dateModified\\\":\\\"2023-01-29T00:00:00.000Z\\\",\\\"description\\\":\\\"Learn about the deduplication for the real-time tables feature in Apache Pinot\\\",\\\"image\\\":\\\"/static/images/twitter-card.png\\\",\\\"url\\\":\\\"https://pinot.apache.org/blog/2023-01-29-Apache-Pinot-Deduplication-on-Real-Time-Tables\\\",\\\"author\\\":[{\\\"@type\\\":\\\"Person\\\",\\\"name\\\":\\\"Mark Needham\\\"}]}\"}}],[\"$\",\"section\",null,{\"className\":\" px-5 pt-10 md:px-[13.313rem] md:py-16\",\"children\":[[\"$\",\"$L12\",null,{}],[\"$\",\"article\",null,{\"className\":\"\",\"children\":[\"$\",\"div\",null,{\"className\":\"mx-auto lg:flex\",\"children\":[[\"$\",\"div\",null,{\"className\":\"lg:pr-12\",\"children\":[[\"$\",\"header\",null,{\"className\":\"pt-6 md:pr-10\",\"children\":[[\"$\",\"h1\",null,{\"className\":\"text-4xl font-semibold\",\"children\":\"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables\"}],[\"$\",\"p\",null,{\"className\":\"pt-2 text-lg\",\"children\":[\"By: \",\"Mark Needham\"]}],[\"$\",\"p\",null,{\"className\":\"py-2 text-sm\",\"children\":[\"January 29th, 2023\",\" • \",\"8 min read\"]}]]}],[\"$\",\"div\",null,{\"className\":\"flex flex-col lg:flex-row\",\"children\":[\"$\",\"main\",null,{\"className\":\"\",\"children\":[\"$\",\"div\",null,{\"className\":\"prose max-w-[45rem] pb-8 pt-10 dark:prose-invert\",\"children\":[[\"$\",\"p\",null,{\"children\":[\"Last fall, the Apache Pinot community released version \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://medium.com/apache-pinot-developer-blog/apache-pinot-0-11-released-d564684df5d4\",\"children\":\"0.11.0\"}],\", which has lots of goodies for you to play with.\"]}],[\"$\",\"p\",null,{\"children\":[\"In this post, we’re going to learn about the \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://docs.pinot.apache.org/basics/data-import/dedup\",\"children\":\"deduplication for the real-time tables feature\"}],\".\"]}],[\"$\",\"h2\",null,{\"id\":\"why-do-we-need-deduplication-on-real-time-tables\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#why-do-we-need-deduplication-on-real-time-tables\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"Why do we need deduplication on real-time tables?\"]}],[\"$\",\"p\",null,{\"children\":\"This feature was built to deal with duplicate data in the streaming platform.\"}],[\"$\",\"p\",null,{\"children\":\"Users have previously used the upsert feature to de-duplicate data, but this has the following limitations:\"}],[\"$\",\"ul\",null,{\"children\":[[\"$\",\"li\",null,{\"children\":\"It forces us to keep redundant records that we don’t want to keep, which increases overall storage costs.\"}],[\"$\",\"li\",null,{\"children\":\"We can’t yet use the StarTree index with upserts, so the speed benefits we get from using that indexing technique are lost.\"}]]}],[\"$\",\"h2\",null,{\"id\":\"how-does-dedup-differ-from-upserts\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#how-does-dedup-differ-from-upserts\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"How does dedup differ from upserts?\"]}],[\"$\",\"p\",null,{\"children\":\"Both upserts and dedup keep track of multiple documents that have the same primary key. They differ as follows:\"}],[\"$\",\"ul\",null,{\"children\":[[\"$\",\"li\",null,{\"children\":\"Upserts are used when we want to get the latest copy of a document for a given primary key. It’s likely that some or all of the other fields will be different. Pinot stores all documents it receives, but when querying it will only return the latest document for each primary key.\"}],[\"$\",\"li\",null,{\"children\":\"Dedup is used when we know that multiple documents with the same primary key are identical. Only the first event received for a given primary key is stored in Pinot—any future events with the same primary key are thrown away.\"}]]}],[\"$\",\"p\",null,{\"children\":\"Let’s see how to use this functionality with help from a worked example.\"}],[\"$\",\"h2\",null,{\"id\":\"setting-up-apache-kafka-and-apache-pinot\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#setting-up-apache-kafka-and-apache-pinot\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"Setting up Apache Kafka and Apache Pinot\"]}],[\"$\",\"p\",null,{\"children\":\"We’re going to spin up Kafka and Pinot using the following Docker Compose config:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-yaml\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-yaml\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"version\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'3'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"services\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"zookeeper\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"image\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" zookeeper\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"3.8.0\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"hostname\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" zookeeper\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"container_name\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" zookeeper\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"dedup\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"blog\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"ports\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'2181:2181'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"environment\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"ZOOKEEPER_CLIENT_PORT\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"2181\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"ZOOKEEPER_TICK_TIME\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"2000\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"networks\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" dedup_blog\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"kafka\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"image\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" wurstmeister/kafka\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"latest\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"restart\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" unless\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"stopped\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"container_name\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'kafka-dedup-blog'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"ports\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'9092:9092'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"expose\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'9093'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"depends_on\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" zookeeper\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"environment\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_ZOOKEEPER_CONNECT\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" zookeeper\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"dedup\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"blog\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"2181/kafka\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_BROKER_ID\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"0\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_ADVERTISED_HOST_NAME\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" kafka\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"dedup\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"blog\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_ADVERTISED_LISTENERS\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" PLAINTEXT\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"//kafka\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"dedup\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"blog\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"9093\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"OUTSIDE\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"//localhost\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"9092\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_LISTENERS\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" PLAINTEXT\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"//0.0.0.0\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"9093\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"OUTSIDE\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"//0.0.0.0\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"9092\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"KAFKA_LISTENER_SECURITY_PROTOCOL_MAP\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" PLAINTEXT\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"PLAINTEXT\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"OUTSIDE\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"PLAINTEXT\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"networks\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" dedup_blog\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"pinot-controller\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"image\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" apachepinot/pinot\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"0.11.0\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"arm64\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"command\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'QuickStart -type EMPTY'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"container_name\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'pinot-controller-dedup-blog'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"volumes\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" ./config\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"/config\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"restart\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" unless\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\"stopped\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"ports\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'9000:9000'\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"networks\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"-\"}],\" dedup_blog\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"networks\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"dedup_blog\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token atrule key\",\"children\":\"name\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" dedup_blog\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"We can spin up our infrastructure using the following command:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"docker-compose\"}],\" up\\n\"]}]}]}],[\"$\",\"h2\",null,{\"id\":\"data-generation\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#data-generation\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"Data Generation\"]}],[\"$\",\"p\",null,{\"children\":\"Let’s imagine that we want to ingest events generated by the following Python script:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-python\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-python\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"import\"}],\" datetime\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"import\"}],\" uuid\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"import\"}],\" random\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"import\"}],\" json\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"while\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token boolean\",\"children\":\"True\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" ts \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"=\"}],\" datetime\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"datetime\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"now\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"strftime\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"%Y-%m-%dT%H:%M:%S.%fZ\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token builtin\",\"children\":\"id\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"=\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token builtin\",\"children\":\"str\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],\"uuid\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"uuid4\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" count \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"=\"}],\" random\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"randint\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"0\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"1000\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"print\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" json\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\".\"}],\"dumps\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"tsString\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" ts\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token builtin\",\"children\":\"id\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"3\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\":\"}],\" count\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"We can view the data generated by this script by pasting the above code into a file called datagen.py and then running the following command:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"python datagen.py \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token file-descriptor important\",\"children\":\"2\"}],\"\u003e\"]}],\"/dev/null \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"head\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-n3\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\" jq\\n\"]}]}]}],[\"$\",\"p\",null,{\"children\":\"We’ll see the following output:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-json\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-json\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tsString\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"2023-01-03T10:59:17.355081Z\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"f94\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"541\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tsString\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"2023-01-03T10:59:17.355125Z\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"057\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"96\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tsString\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"2023-01-03T10:59:17.355141Z\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"d7b\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"288\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"If we generate only 25,000 events, we’ll get some duplicates, which we can see by running the following command:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"python datagen.py \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token file-descriptor important\",\"children\":\"2\"}],\"\u003e\"]}],\"/dev/null \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"jq \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-r\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'.uuid'\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"head\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-n25000\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"uniq\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-cd\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"The results of running that command are shown below:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-text\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-text\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"2 3a2\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"2 a04\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"2 433\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"2 291\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"2 d73\\n\"}]]}]}],[\"$\",\"p\",null,{\"children\":\"We’re going to pipe this data into a Kafka stream called events, like this:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"python datagen.py \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token file-descriptor important\",\"children\":\"2\"}],\"\u003e\"]}],\"/dev/null \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"jq \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-cr\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"--arg\"}],\" sep 😊 \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"'[.uuid, tostring] | join($sep)'\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"|\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\"kcat \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-P\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-b\"}],\" localhost:9092 \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-t\"}],\" events -K😊\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":[\"The construction of the key/value structure comes from Robin Moffatt’s \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://rmoff.net/2020/09/30/setting-key-value-when-piping-from-jq-to-kafkacat/\",\"children\":\"excellent blog post\"}],\". Since that blog post was written, kcat has started supporting multi byte separators, which is why we can use a smiley face to separate our key and value.\"]}],[\"$\",\"h2\",null,{\"id\":\"pinot-schematable-config\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#pinot-schematable-config\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"Pinot Schema/Table Config\"]}],[\"$\",\"p\",null,{\"children\":\"Next, we’re going to create a Pinot table and schema with the same name. Let’s first define a schema:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-json\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-json\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"schemaName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dimensionFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"STRING\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"metricFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"INT\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dateTimeFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"TIMESTAMP\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"format\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1:MILLISECONDS:EPOCH\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"granularity\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1:MILLISECONDS\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"Note that the timestamp field is called ts and not tsString, as it is in the Kafka stream. We’re going to transform the DateTime string value held in that field into a proper timestamp using a transformation function.\"}],[\"$\",\"p\",null,{\"children\":\"Our table config is described below:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-json\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-json\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"REALTIME\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"segmentsConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"timeColumnName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"schemaName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"replication\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"replicasPerPartition\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableIndexConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"loadMode\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"MMAP\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"streamConfigs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"streamType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"kafka\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.topic.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.broker.list\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"kafka-dedup-blog:9093\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.type\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"lowlevel\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.prop.auto.offset.reset\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"smallest\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.factory.class.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.decoder.class.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"ingestionConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"transformConfigs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"columnName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"transformFunction\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tenants\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"metadata\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"Let’s create the table using the following command:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"docker\"}],\" run \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"--network\"}],\" dedup_blog \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-v\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token constant environment\",\"children\":\"$$PWD\"}],\"/pinot/config:/config \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" apachepinot/pinot:0.11.0-arm64 AddTable \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-schemaFile\"}],\" /config/schema.json \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-tableConfigFile\"}],\" /config/table.json \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-controllerHost\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"pinot-controller-dedup-blog\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-exec\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":[\"Now we can navigate to \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"http://localhost:9000/\",\"children\":\"http://localhost:9000\"}],\" and run a query that will return a count of the number of each uuid:\"]}],[\"$\",\"$L13\",null,{\"className\":\"language-sql\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-sql\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"select\"}],\" uuid\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"count\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"*\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"from\"}],\" events\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"group\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"by\"}],\" uuid\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"order\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"by\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"count\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\"*\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"limit\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"10\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"The results of this query are shown below:\"}],[\"$\",\"p\",null,{\"children\":[\"$\",\"img\",null,{\"alt\":\"Sample Apache Pinot real-time query response stats including duplicates\",\"src\":\"https://www.datocms-assets.com/75153/1673273173-image4.png\",\"title\":\"Sample Apache Pinot real-time query response stats including duplicates\"}]}],[\"$\",\"p\",null,{\"children\":\"We can see loads of duplicates!\"}],[\"$\",\"p\",null,{\"children\":\"Now let’s add a table and schema that uses the de-duplicate feature, starting with the schema:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-json\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-json\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"schemaName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events_dedup\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"primaryKeyColumns\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dimensionFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"uuid\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"STRING\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"metricFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"count\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"INT\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dateTimeFieldSpecs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dataType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"TIMESTAMP\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"format\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1:MILLISECONDS:EPOCH\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"granularity\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1:MILLISECONDS\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"The main difference between this schema and the events schema is that we need to specify a primary key. This key can be any number of fields, but in this case, we’re only using the uuid field.\"}],[\"$\",\"p\",null,{\"children\":\"Next, the table config:\"}],[\"$\",\"$L13\",null,{\"className\":\"language-json\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-json\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events_dedup\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"REALTIME\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"segmentsConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"timeColumnName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"schemaName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events_dedup\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"replication\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"replicasPerPartition\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"1\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tableIndexConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"loadMode\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"MMAP\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"streamConfigs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"streamType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"kafka\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.topic.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"events\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.broker.list\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"kafka-dedup-blog:9093\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.type\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"lowlevel\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.prop.auto.offset.reset\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"smallest\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.consumer.factory.class.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"stream.kafka.decoder.class.name\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"routing\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"instanceSelectorType\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"strictReplicaGroup\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dedupConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"dedupEnabled\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token boolean\",\"children\":\"true\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"hashFunction\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"NONE\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"ingestionConfig\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"transformConfigs\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"[\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"columnName\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"ts\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"transformFunction\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"FromDateTime(tsString, 'YYYY-MM-dd''T''HH:mm:ss.SSSSSS''Z''')\\\"\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"]\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"tenants\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\",\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token property\",\"children\":\"\\\"metadata\\\"\"}],[\"$\",\"span\",null,{\"className\":\"token operator\",\"children\":\":\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"{\"}],[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"}\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":\"The changes to notice here are:\"}],[\"$\",\"ul\",null,{\"children\":[[\"$\",\"li\",null,{\"children\":[[\"$\",\"code\",null,{\"children\":\"\\\"dedupConfig\\\": {\\\"dedupEnabled\\\": true, \\\"hashFunction\\\": \\\"NONE\\\"}\"}],\" - This enables the feature and indicates that we won’t use a hash function on our primary key.\"]}],[\"$\",\"li\",null,{\"children\":[[\"$\",\"code\",null,{\"children\":\"\\\"routing\\\": {\\\"instanceSelectorType\\\": \\\"strictReplicaGroup\\\"}\"}],\" - This makes sure that all segments of the same partition are served from the same server to ensure data consistency across the segments.\"]}]]}],[\"$\",\"$L13\",null,{\"className\":\"language-bash\",\"children\":[\"$\",\"code\",null,{\"className\":\"code-highlight language-bash\",\"children\":[[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token function\",\"children\":\"docker\"}],\" run \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"--network\"}],\" dedup_blog \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-v\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token constant environment\",\"children\":\"$$PWD\"}],\"/pinot/config:/config \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" apachepinot/pinot:0.11.0-arm64 AddTable \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-schemaFile\"}],\" /config/schema-dedup.json \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-tableConfigFile\"}],\" /config/table-dedup.json \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-controllerHost\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token string\",\"children\":\"\\\"pinot-controller-dedup-blog\\\"\"}],\" \",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"\\\\\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token parameter variable\",\"children\":\"-exec\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\"\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" \",[\"$\",\"span\",null,{\"className\":\"token keyword\",\"children\":\"select\"}],\" uuid, count\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],\"*\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\" from events_dedup\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":\" group by uuid\\n\"}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" order by count\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\"(\"}],\"*\",[\"$\",\"span\",null,{\"className\":\"token punctuation\",\"children\":\")\"}],\"\\n\"]}],[\"$\",\"span\",null,{\"className\":\"code-line\",\"children\":[\" limit \",[\"$\",\"span\",null,{\"className\":\"token number\",\"children\":\"10\"}],\"\\n\"]}]]}]}],[\"$\",\"p\",null,{\"children\":[\"$\",\"img\",null,{\"alt\":\"Sample Apache Pinot real-time query response stats deduplicated\",\"src\":\"https://www.datocms-assets.com/75153/1673273248-image3.png\",\"title\":\"Sample Apache Pinot real-time query response stats deduplicated\"}]}],[\"$\",\"p\",null,{\"children\":\"We have every combination of hex values (16^3=4096) and no duplicates! Pinot’s de-duplication feature has done its job.\"}],[\"$\",\"h2\",null,{\"id\":\"how-does-it-work\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#how-does-it-work\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"How does it work? \"]}],[\"$\",\"p\",null,{\"children\":\"When we’re not using the deduplication feature, events are ingested from our streaming platform into Pinot, as shown in the diagram below:\"}],[\"$\",\"p\",null,{\"children\":[\"$\",\"img\",null,{\"alt\":\"Events ingested from a streaming platform into Apache Pinot without using the deduplication feature\",\"src\":\"https://www.datocms-assets.com/75153/1673273272-pinot_0-11-de-duplication-diagram_1-v2.png\",\"title\":\"Events ingested from a streaming platform into Apache Pinot without using the deduplication feature\"}]}],[\"$\",\"p\",null,{\"children\":\"When de-dup is enabled, we have to check whether records can be ingested, as shown in the diagram below:\"}],[\"$\",\"p\",null,{\"children\":[\"$\",\"img\",null,{\"alt\":\"Events ingested from a streaming platform into Apache Pinot using the deduplication feature\",\"src\":\"https://www.datocms-assets.com/75153/1673273289-pinot_0-11-de-duplication-diagram_2-v3.png\",\"title\":\"Events ingested from a streaming platform into Apache Pinot using the deduplication feature\"}]}],[\"$\",\"p\",null,{\"children\":\"De-dup works out whether a primary key has already been ingested by using an in memory map of (primary key -\u003e corresponding segment reference).\"}],[\"$\",\"p\",null,{\"children\":\"We need to take that into account when using this feature, otherwise, we’ll end up using all the available memory on the Pinot Server. Below are some tips for using this feature:\"}],[\"$\",\"ul\",null,{\"children\":[[\"$\",\"li\",null,{\"children\":\"Try to use a simple primary key type and avoid composite keys. If you don’t have a simple primary key, consider using one of the available hash functions to reduce the space taken up.\"}],[\"$\",\"li\",null,{\"children\":\"Create more partitions in the streaming platform than you might otherwise create. The number of partitions determines the partition numbers of the Pinot table. The more partitions you have in the streaming platform, the more Pinot servers you can distribute the Pinot table to, and the more horizontally scalable the table will be.\"}]]}],[\"$\",\"h2\",null,{\"id\":\"summary\",\"children\":[[\"$\",\"a\",null,{\"href\":\"#summary\",\"aria-hidden\":\"true\",\"tabIndex\":\"-1\",\"children\":[\"$\",\"span\",null,{\"className\":\"icon icon-link\"}]}],\"Summary\"]}],[\"$\",\"p\",null,{\"children\":\"This feature makes it easier to ensure that we don’t end up with duplicate data in our Apache Pinot estate.\"}],[\"$\",\"p\",null,{\"children\":[\"So give it a try and let us know how you get on. If you have any questions about this feature, feel free to join us on \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://stree.ai/slack\",\"children\":\"Slack\"}],\", where we’ll be happy to help you out.\"]}],[\"$\",\"p\",null,{\"children\":[\"And if you’re interested in how this feature was implemented, you can look at the \",[\"$\",\"a\",null,{\"target\":\"_blank\",\"rel\":\"noopener noreferrer\",\"href\":\"https://github.com/apache/pinot/pull/8708\",\"children\":\"pull request on GitHub\"}],\".\"]}]]}]}]}]]}],[\"$\",\"aside\",null,{\"className\":\"mt-10 hidden border-l-2 pl-5 lg:sticky lg:top-1 lg:block lg:h-full\",\"children\":[\"$\",\"section\",null,{\"className\":\"sticky top-0 mb-4 w-[15.375rem]\",\"children\":[[\"$\",\"div\",null,{\"className\":\"flex flex-col space-y-1.5 pb-3\",\"children\":[\"$\",\"h3\",null,{\"className\":\"text-sm font-semibold leading-snug text-neutral-500 dark:text-neutral-100\",\"children\":\"Table of Contents\"}]}],[\"$\",\"$L14\",null,{\"chapters\":[{\"value\":\"Why do we need deduplication on real-time tables?\",\"url\":\"#why-do-we-need-deduplication-on-real-time-tables\",\"depth\":2},{\"value\":\"How does dedup differ from upserts?\",\"url\":\"#how-does-dedup-differ-from-upserts\",\"depth\":2},{\"value\":\"Setting up Apache Kafka and Apache Pinot\",\"url\":\"#setting-up-apache-kafka-and-apache-pinot\",\"depth\":2},{\"value\":\"Data Generation\",\"url\":\"#data-generation\",\"depth\":2},{\"value\":\"Pinot Schema/Table Config\",\"url\":\"#pinot-schematable-config\",\"depth\":2},{\"value\":\"How does it work? \",\"url\":\"#how-does-it-work\",\"depth\":2},{\"value\":\"Summary\",\"url\":\"#summary\",\"depth\":2}]}]]}]}]]}]}]]}]]\n"])</script><script>self.__next_f.push([1,"6:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables | Apache Pinot™\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"Learn about the deduplication for the real-time tables feature in Apache Pinot\"}],[\"$\",\"meta\",\"4\",{\"name\":\"robots\",\"content\":\"index, follow\"}],[\"$\",\"meta\",\"5\",{\"name\":\"googlebot\",\"content\":\"index, follow, max-video-preview:-1, max-image-preview:large, max-snippet:-1\"}],[\"$\",\"link\",\"6\",{\"rel\":\"canonical\",\"href\":\"https://pinot.apache.org/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\"}],[\"$\",\"link\",\"7\",{\"rel\":\"alternate\",\"type\":\"application/rss+xml\",\"href\":\"https://pinot.apache.org/feed.xml\"}],[\"$\",\"meta\",\"8\",{\"property\":\"og:title\",\"content\":\"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables\"}],[\"$\",\"meta\",\"9\",{\"property\":\"og:description\",\"content\":\"Learn about the deduplication for the real-time tables feature in Apache Pinot\"}],[\"$\",\"meta\",\"10\",{\"property\":\"og:url\",\"content\":\"https://pinot.apache.org/blog/2023/01/29/Apache-Pinot-Deduplication-on-Real-Time-Tables\"}],[\"$\",\"meta\",\"11\",{\"property\":\"og:site_name\",\"content\":\"Apache Pinot™\"}],[\"$\",\"meta\",\"12\",{\"property\":\"og:locale\",\"content\":\"en_US\"}],[\"$\",\"meta\",\"13\",{\"property\":\"og:image\",\"content\":\"https://pinot.apache.org/static/images/twitter-card.png\"}],[\"$\",\"meta\",\"14\",{\"property\":\"og:type\",\"content\":\"article\"}],[\"$\",\"meta\",\"15\",{\"property\":\"article:published_time\",\"content\":\"2023-01-29T00:00:00.000Z\"}],[\"$\",\"meta\",\"16\",{\"property\":\"article:modified_time\",\"content\":\"2023-01-29T00:00:00.000Z\"}],[\"$\",\"meta\",\"17\",{\"property\":\"article:author\",\"content\":\"Mark Needham\"}],[\"$\",\"meta\",\"18\",{\"name\":\"twitter:card\",\"content\":\"summary_large_image\"}],[\"$\",\"meta\",\"19\",{\"name\":\"twitter:title\",\"content\":\"Apache Pinot™ 0.11 - Deduplication on Real-Time Tables\"}],[\"$\",\"meta\",\"20\",{\"name\":\"twitter:description\",\"content\":\"Learn about the deduplication for the real-time tables feature in Apache Pinot\"}],[\"$\",\"meta\",\"21\",{\"name\":\"twitter:image\",\"content\":\"https://pinot.apache.org/static/images/twitter-card.png\"}],[\"$\",\"meta\",\"22\",{\"name\":\"next-size-adjust\"}]]\n"])</script><script>self.__next_f.push([1,"f:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>