blob: c6fb4a75d74c5f4c5775bebf02feb21868efc256 [file] [log] [blame]
<!doctype html>
<html lang="en-US" dir="ltr" class="blog-wrapper blog-tags-post-list-page plugin-blog plugin-id-default" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<!-- maximum-scale=1 / user-scalable=no removed: they disable pinch-zoom, an accessibility failure (WCAG 1.4.4 Resize Text). -->
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<!-- Build-tool fingerprint emitted by the static-site generator. -->
<meta name="generator" content="Docusaurus v2.4.3">
<!-- Blog syndication feeds (RSS + Atom). -->
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Doris RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Doris Atom Feed">
<!-- Early TCP/TLS handshakes for the Google Analytics origins used by the async loader below. -->
<link rel="preconnect" href="https://www.google-analytics.com">
<link rel="preconnect" href="https://www.googletagmanager.com">
<script async src="https://www.googletagmanager.com/gtag/js?id=G-DT7W9E9722"></script>
<script>
  // Google Analytics 4 command-queue bootstrap: commands accumulate in
  // window.dataLayer until the async gtag.js loader (above) processes them.
  window.dataLayer = window.dataLayer || [];
  function gtag() { dataLayer.push(arguments); }
  gtag("js", new Date());
  gtag("config", "G-DT7W9E9722", { anonymize_ip: true });
</script>
<!-- Early handshake for the Matomo origin used by the tracker below. -->
<link rel="preconnect" href="https://analytics.apache.org/">
<!-- Matomo analytics (analytics.apache.org, site id 43). Commands queue in
     window._paq and are replayed once the async matomo.js loader runs. -->
<script>
  var _paq = window._paq = window._paq || [];
  _paq.push(["setRequestMethod", "POST"]);
  _paq.push(["trackPageView"]);
  _paq.push(["enableLinkTracking"]);
  _paq.push(["enableHeartBeatTimer"]);
  (function () {
    var baseUrl = "https://analytics.apache.org/";
    // Fix: the original snippet pushed ["setRequestMethod", "POST"] a second
    // time here; the duplicate (same value as above) has been removed.
    _paq.push(["setTrackerUrl", baseUrl + "matomo.php"]);
    _paq.push(["setSiteId", "43"]);
    var doc = document;
    var loader = doc.createElement("script");
    var firstScript = doc.getElementsByTagName("script")[0];
    loader.type = "text/javascript";
    loader.async = true;
    loader.src = baseUrl + "matomo.js";
    firstScript.parentNode.insertBefore(loader, firstScript);
  })();
</script>
<!-- Site favicon and PWA manifest. -->
<link rel="icon" href="/images/logo-only.png">
<link rel="manifest" href="/manifest.json">
<!-- Browser UI chrome color (light theme). -->
<meta name="theme-color" content="#FFFFFF">
<!-- Run as a standalone web app when added to an iOS home screen. -->
<meta name="apple-mobile-web-app-capable" content="yes">
<!-- Fix: this meta takes the tokens "default" | "black" | "black-translucent",
     not a CSS color; "#000" was invalid and fell back to the default. -->
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<!-- NOTE(review): /img/docusaurus.png and /img/docusaurus.svg are the Docusaurus
     scaffold's placeholder icons — presumably these should point at Doris-branded
     assets; confirm the files exist at these paths. -->
<link rel="apple-touch-icon" href="/img/docusaurus.png">
<link rel="mask-icon" href="/img/docusaurus.svg" color="rgb(37, 194, 160)">
<meta name="msapplication-TileImage" content="/img/docusaurus.png">
<meta name="msapplication-TileColor" content="#000">
<!-- MiSans webfont CSS from Xiaomi's CDN (Chinese Simplified + Latin subsets). -->
<link rel="stylesheet" href="https://cdn-font.hyperos.mi.com/font/css?family=MiSans:100,200,300,400,450,500,600,650,700,900:Chinese_Simplify,Latin&display=swap">
<link rel="stylesheet" href="https://cdn-font.hyperos.mi.com/font/css?family=MiSans_Latin:100,200,300,400,450,500,600,650,700,900:Latin&display=swap">
<script src="/js/custom-script.js"></script><title data-rh="true">37 posts tagged with &quot;Best Practice&quot; - Apache Doris</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://doris.apache.org/blog/tags/best-practice"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" property="og:title" content="37 posts tagged with &quot;Best Practice&quot; - Apache Doris"><meta data-rh="true" name="docusaurus_tag" content="blog_tags_posts"><meta data-rh="true" name="docsearch:docusaurus_tag" content="blog_tags_posts"><link data-rh="true" rel="icon" href="/images/favicon.ico"><link data-rh="true" rel="canonical" href="https://doris.apache.org/blog/tags/best-practice"><link data-rh="true" rel="alternate" href="https://doris.apache.org/blog/tags/best-practice" hreflang="en-US"><link data-rh="true" rel="alternate" href="https://doris.apache.org/zh-CN/blog/tags/best-practice" hreflang="zh-Hans-CN"><link data-rh="true" rel="alternate" href="https://doris.apache.org/blog/tags/best-practice" hreflang="x-default"><link rel="stylesheet" href="https://cdnd.selectdb.com/assets/css/styles.2bd95c40.css">
<!-- Preload the main JS bundles (referenced late in the body) so fetching starts early. -->
<link rel="preload" href="https://cdnd.selectdb.com/assets/js/runtime~main.eb208fba.js" as="script">
<link rel="preload" href="https://cdnd.selectdb.com/assets/js/main.fa7fcb85.js" as="script">
</head>
<body class="navigation-with-keyboard">
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_s0pr" style="background-color:#3C2FD4;color:#FFFFFF" role="banner"><div class="announcementBarPlaceholder_qxfj"></div><div class="announcementBarContent_dpRF"><a href="https://github.com/apache/doris" target="_blank" style="display: flex; width: 100%; align-items: center; justify-content: center; margin-left: 4px; text-decoration: none; color: white">Do you ❤️ Doris? Give us a 🌟 on GitHub
<img style="width: 1.2rem; height: 1.2rem; margin-left: 0.4rem;" src="/images/github-white-icon.svg">
</a></div><button type="button" class="clean-btn close announcementBarClose_iXyO" aria-label="Close"><svg viewBox="0 0 15 15" width="14" height="14" style="color:white"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner" style="padding:"><div class="navbar__items"><div class="navbar-left"><div class="navbar-logo-wrapper flex items-center"><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="https://cdnd.selectdb.com/images/logo.svg" alt="Apache Doris" class="themedImage_ToTc themedImage--light_HNdA"><img src="https://cdnd.selectdb.com/images/logo.svg" alt="Apache Doris" class="themedImage_ToTc themedImage--dark_i4oU"></div><b class="navbar__title text--truncate"></b></a></div><a class="navbar__item navbar__link" style="text-align:center" href="/docs/get-starting/quick-start">Docs</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" style="text-align:center" href="/blog">Blog</a><a class="navbar__item navbar__link" style="text-align:center" href="/users">Users</a><a href="https://github.com/apache/doris/discussions" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link" style="text-align:center">Discussions</a><a class="navbar__item navbar__link" style="text-align:center" href="/ecosystem/cluster-management">Ecosystem</a><a class="navbar__item navbar__link" style="text-align:center" href="/community/join-community">Community</a></div></div><div class="navbar__items navbar__items--right"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><div class="docs-search searchBox_H2mL"><div 
class="navbar__search searchBarContainer_PzyC"><input placeholder="Search" aria-label="Search" class="navbar__search-input navbarSearchInput_tb6T"><div class="loadingRing__K5d searchBarLoadingRing_e2f0"><div></div><div></div><div></div><div></div></div><div class="searchHintContainer_m7ml"><kbd class="searchHint_zuPL">ctrl</kbd><kbd class="searchHint_zuPL">K</kbd></div></div></div><div class="custom-navbar-item navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link" aria-haspopup="true" aria-expanded="false" role="button" href="/docs/get-starting/what-is-apache-doris"><span class="text-sm">Versions: </span></a><ul class="dropdown__menu"><li><a class="dropdown__link" style="text-align:center" href="/docs/dev/get-starting/what-is-apache-doris">Dev</a></li><li><a class="dropdown__link" style="text-align:center" href="/docs/get-starting/what-is-apache-doris">2.1</a></li><li><a class="dropdown__link" style="text-align:center" href="/docs/2.0/get-starting/what-is-apache-doris">2.0</a></li><li><a class="dropdown__link" style="text-align:center" href="/docs/1.2/get-starting/">1.2</a></li></ul></div><a class="navbar__item navbar__link header-right-button-primary navbar-download-mobile" style="text-align:center" href="/download">Download</a><a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer" class="github-btn desktop header-right-button-github"></a><a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer" class="slack-btn desktop header-right-button-slack"></a><a class="header-right-button-primary navbar-download-desktop" href="/download">Download</a></div></div><div class="navbar__bottom"></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" 
href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="main-wrapper"><div class="mb-[4.875rem] container"><div class="lg:row lg:flex"><main class="col col--9 col--offset-1" itemscope="" itemtype="http://schema.org/Blog"><header class="margin-bottom--xl"><h1>37 posts tagged with &quot;Best Practice&quot;</h1><a href="/blog/tags">View All Tags</a></header><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/apache-doris-for-log-and-time-series-data-analysis-in-netease">Apache Doris for log and time series data analysis in NetEase, why not Elasticsearch and InfluxDB?</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2024-05-23T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">May 23, 2024</time></div></header><div class="markdown" itemprop="articleBody"><p>For most people looking for a log management and analytics solution, Elasticsearch is the go-to choice. The same applies to InfluxDB for time series data analysis. These were exactly the choices of <a href="https://finance.yahoo.com/quote/NTES/" target="_blank" rel="noopener noreferrer">NetEase,Inc. <em>(NASDAQ: NTES)</em></a>, one of the world&#x27;s highest-yielding game companies but more than that. As NetEase expands its business horizons, the logs and time series data it receives explode, and problems like surging storage costs and declining stability come. 
As NetEase&#x27;s pick among all big data components for platform upgrades, <a href="https://doris.apache.org" target="_blank" rel="noopener noreferrer">Apache Doris</a> fits into both scenarios and brings much faster query performance. </p><p>We list the gains of NetEase after adopting Apache Doris in their monitoring platform and time series data platform, and share their best practice with users who have similar needs.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="monitoring-platform-elasticsearch---apache-doris">Monitoring platform: Elasticsearch -&gt; Apache Doris<a href="#monitoring-platform-elasticsearch---apache-doris" class="hash-link" aria-label="Direct link to Monitoring platform: Elasticsearch -&gt; Apache Doris" title="Direct link to Monitoring platform: Elasticsearch -&gt; Apache Doris"></a></h2><p>NetEase provides a collaborative workspace platform that combines email, calendar, cloud-based documents, instant messaging, and customer management, etc. To oversee its performance and availability, NetEase builds the Eagle monitoring platform, which collects logs for analysis. Eagle was supported by Elasticsearch and Logstash. The data pipeline was simple: Logstash gathers log data, cleans and transforms it, and then outputs it to Elasticsearch, which handles real-time log retrieval and analysis requests from users.</p><p><img loading="lazy" alt="Monitoring platform: Elasticsearch -&amp;gt; Apache Doris" src="https://cdnd.selectdb.com/assets/images/monitoring-platform-elasticsearch-5926a8f4794acda07e50b877ffc85c92.PNG" width="1280" height="158" class="img_ev3q"></p><p>Due to NetEase&#x27;s increasingly sizable log dataset, Elastisearch&#x27;s index design, and limited hardware resources, the monitoring platform exhibits <strong>high latency</strong> in daily queries. Additionally, Elasticsearch maintains high data redundancy for forward indexes, inverted indexes, and columnar storage. 
This adds to cost pressure.</p><p>After migration to Apache Doris, NetEase achieves a 70% reduction in storage costs and an 11-fold increase in query speed. </p><p><img loading="lazy" alt="Monitoring platform: Elasticsearch -&amp;gt; Apache Doris" src="https://cdnd.selectdb.com/assets/images/monitoring-platform-apache-doris-23c3a1008f0d3e6e59d53047ace7e185.PNG" width="1280" height="160" class="img_ev3q"></p><ul><li><p><strong>70% reduction in storage costs</strong>: This means a dataset that takes up 100TB in Elasticsearch only requires 30TB in Apache Doris. Moreover, thanks to the much-reduced storage space usage, they can replace their HDDs with more expensive SSDs for hot data storage to achieve higher query performance while staying within the same budget.</p></li><li><p><strong>11-fold increase in query speed</strong>: Apache Doris can deliver faster queries while consuming less CPU resources than Elasticsearch. As shown below, Doris has reliably low latency in queries of various sizes, while Elasticsearch demonstrates longer latency and greater fluctuations, and the smallest speed difference is 11-fold. </p></li></ul><p><img loading="lazy" alt="Apache Doris vs Elasticsearch" src="https://cdnd.selectdb.com/assets/images/doris-vs-elasticsearch-query-latency-542660f4457f559a4e594993e28aef4c.PNG" width="1280" height="720" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="time-series-data-platform-influxdb---apache-doris">Time series data platform: InfluxDB -&gt; Apache Doris<a href="#time-series-data-platform-influxdb---apache-doris" class="hash-link" aria-label="Direct link to Time series data platform: InfluxDB -&gt; Apache Doris" title="Direct link to Time series data platform: InfluxDB -&gt; Apache Doris"></a></h2><p>NetEase is also an instant messaging (IM) PaaS provider. To support this, it builds a data platform to analyze time series data from their IM services. The platform was built on InfluxDB, a time series database. 
Data flowed into a Kafka message queue. After the fields were parsed and cleaned, they arrived in InfluxDB, ready to be queried. InfluxDB responded to both online and offline queries. The former was to generate metric monitoring reports and bills in real time, and the latter was to batch analyze data from a day ago. </p><p><img loading="lazy" alt="Time series data platform: InfluxDB -&amp;gt; Apache Doris " src="https://cdnd.selectdb.com/assets/images/time-series-data-platform-from-influxdb-to-apache-doris-480aab1f5537e6bd0fba6f1c6801f9c3.PNG" width="1280" height="588" class="img_ev3q"></p><p>This platform was also challenged by the increasing data size and diversifying data sources.</p><ul><li><p><strong>OOM</strong>: Offline data analysis across multiple data sources was putting InfluxDB under huge pressure and causing OOM errors.</p></li><li><p><strong>High storage costs</strong>: Cold data took up a large portion but it was stored the same way as hot data. That added up to huge expenditures.</p></li></ul><p><img loading="lazy" alt="Time series data platform: InfluxDB -&amp;gt; Apache Doris " src="https://cdnd.selectdb.com/assets/images/time-series-data-platform-influxdb-to-apache-doris-2-def95b716954bcd09bdffa13fef7ed1f.PNG" width="1280" height="588" class="img_ev3q"></p><p>Replacing InfluxDB with Apache Doris has brought higher cost efficiency to the data platform:</p><ul><li><p><strong>Higher throughput</strong>: Apache Doris maintains a writing throughput of 500MB/s and achieves a peak writing throughput of 1GB/s. With InfluxDB, they used to require 22 servers for a CPU utilization rate of 50%. Now, with Doris, it only takes them 11 servers at the same CPU utilization rate. That means Doris helps cut down resource consumption by half.</p></li><li><p><strong>67% less storage usage</strong>: The same dataset used 150TB of storage space with InfluxDB but only took up 50TB with Doris. 
Thus, Doris helps reduce storage costs by 67%.</p></li><li><p><strong>Faster and more stable query performance</strong>: The performance test was to select a random online query SQL and run it 99 consecutive times. As is shown below, Doris delivers generally faster response time and maintains stability throughout the 99 queries.</p></li></ul><p><img loading="lazy" alt="Doris vs InfluxDB" src="https://cdnd.selectdb.com/assets/images/doris-vs-influxdb-cost-effectivity-1026ec10820805c8bffc1f024a8ab2cb.png" width="1280" height="692" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="best-practice">Best practice<a href="#best-practice" class="hash-link" aria-label="Direct link to Best practice" title="Direct link to Best practice"></a></h2><p>Adopting a new product and putting it into a production environment is, after all, a big project. The NetEase engineers came across a few hiccups during the journey, and they are kind enough to share about how they solved these problems and save other users some detours.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="table-creation">Table creation<a href="#table-creation" class="hash-link" aria-label="Direct link to Table creation" title="Direct link to Table creation"></a></h3><p>Table schema design has a significant impact on database performance, and this holds for log and time series data processing as well. Apache Doris provides optimization options for these scenarios. These are some recommendations provided by NetEase.</p><ol><li><p><strong>Retrieval of the latest N logs</strong>: Using a <code>DATETIME</code> type time field as the primary key can largely speed queries up.</p></li><li><p><strong>Partitioning strategy</strong>: Use <code>PARTITION BY RANGE</code> based on a time field and enable <a href="https://doris.apache.org/docs/2.0/table-design/data-partition#dynamic-partition" target="_blank" rel="noopener noreferrer">dynamic partition</a>. 
This allows for auto-management of data partitions.</p></li><li><p><strong>Bucketing strategy</strong>: Adopt random bucketing and set the number of buckets to roughly three times the total number of disks in the cluster. (Apache Doris also provides an <a href="https://doris.apache.org/docs/2.0/table-design/data-partition/#auto-bucket" target="_blank" rel="noopener noreferrer">auto bucket</a> feature to avoid performance loss caused by improper data sharding.)</p></li><li><p><strong>Indexing</strong>: Create indexes for frequently searched fields to improve query efficiency. Pay attention to the parser for the fields that require full-text searching, because it determines query accuracy.</p></li><li><p><strong>Compaction</strong>: Optimize the compaction strategies based on your own business needs.</p></li><li><p><strong>Data compression</strong>: Enable <code>ZSTD</code> for better a higher compression ratio.</p></li></ol><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">CREATE</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">TABLE</span><span class="token plain"> log</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ts </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DATETIME</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token 
plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> host </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">VARCHAR</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token number">20</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> msg </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">TEXT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> size </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INT</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INDEX</span><span class="token plain"> idx_size </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">size</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 
249);font-style:italic">USING</span><span class="token plain"> INVERTED</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INDEX</span><span class="token plain"> idx_status </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">status</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">USING</span><span class="token plain"> INVERTED</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INDEX</span><span class="token plain"> idx_host </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">host</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">USING</span><span class="token plain"> INVERTED</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INDEX</span><span class="token plain"> idx_msg </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">msg</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">USING</span><span class="token plain"> INVERTED PROPERTIES</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">&quot;parser&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;unicode&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">ENGINE</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> OLAP</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DUPLICATE</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">KEY</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">PARTITION</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> 
RANGE</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">ts</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">DISTRIBUTED</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">BY</span><span class="token plain"> RANDOM BUCKETS </span><span class="token number">250</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">PROPERTIES </span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;compression&quot;</span><span class="token operator">=</span><span class="token string" style="color:rgb(255, 121, 198)">&quot;zstd&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;compaction_policy&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;time_series&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.enable&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;true&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.create_history_partition&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;true&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.time_unit&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;DAY&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.start&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;-7&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.end&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;3&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.prefix&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;p&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;dynamic_partition.buckets&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;250&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 
2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="cluster-configuration">Cluster configuration<a href="#cluster-configuration" class="hash-link" aria-label="Direct link to Cluster configuration" title="Direct link to Cluster configuration"></a></h3><p><strong>Frontend (FE) configuration</strong></p><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token comment" style="color:rgb(98, 114, 164)"># For higher data ingestion performance:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">enable_single_replica_load </span><span class="token operator">=</span><span class="token plain"> </span><span class="token boolean">true</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token comment" style="color:rgb(98, 114, 164)"># For more balanced tablet distribution:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">enable_round_robin_create_tablet </span><span class="token operator">=</span><span class="token plain"> </span><span class="token boolean">true</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">tablet_rebalancer_type </span><span 
class="token operator">=</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">partition</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token comment" style="color:rgb(98, 114, 164)"># Memory optimization for frequent imports:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">max_running_txn_num_per_db </span><span class="token operator">=</span><span class="token plain"> </span><span class="token number">10000</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">streaming_label_keep_max_second </span><span class="token operator">=</span><span class="token plain"> </span><span class="token number">300</span><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">label_clean_interval_second </span><span class="token operator">=</span><span class="token plain"> </span><span class="token number">300</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Backend (BE) configuration</strong></p><div class="language-SQL 
codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">write_buffer_size=1073741824</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">max_tablet_version_num = 20000</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">max_cumu_compaction_threads = 10 (Half of the total number of CPUs)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">enable_write_index_searcher_cache = false</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">disable_storage_page_cache = true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">enable_single_replica_load = true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">streaming_load_json_max_mb=250</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="stream-load-optimization">Stream Load optimization<a href="#stream-load-optimization" class="hash-link" aria-label="Direct link to Stream Load optimization" title="Direct link to Stream Load 
optimization"></a></h3><p>During peak times, the data platform is undertaking up to 1 million TPS and a writing throughput of 1GB/s. This is demanding for the system. Meanwhile, at peak time, a large number of concurrent write operations are loading data into lots of tables, but each individual write operation only involves a small amount of data. Thus, it takes a long time to accumulate a batch, which is contradictory to the data freshness requirement from the query side.</p><p>As a result, the data platform was bottlenecked by data backlogs in Apache Kafka. NetEase adopts the <a href="https://doris.apache.org/docs/2.0/data-operate/import/stream-load-manual" target="_blank" rel="noopener noreferrer">Stream Load</a> method to ingest data from Kafka to Doris. So the key was to accelerate Stream Load. After talking to the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris developers</a>, NetEase adopted two optimizations for their log and time series data analysis:</p><ul><li><p><strong>Single replica data loading</strong>: Load one data replica and pull data from it to generate more replicas. This avoids the overhead of ranking and creating indexes for multiple replicas.</p></li><li><p><strong>Single tablet data loading</strong> (<code>load_to_single_tablet=true</code>): Compared to writing data to multiple tablets, this reduces the I/O overhead and the number of small files generated during data loading. 
</p></li></ul><p>The above measures are effective in improving data loading performance:</p><ul><li><strong>2X data consumption speed from Kafka</strong></li></ul><p><img loading="lazy" alt="2X data consumption speed from Kafka" src="https://cdnd.selectdb.com/assets/images/doris-data-loading-performance-1-ee9e3f0841cd78fa0171bc08c18d6fbb.png" width="1280" height="456" class="img_ev3q"></p><ul><li><strong>75% lower data latency</strong></li></ul><p><img loading="lazy" alt="75% lower data latency" src="https://cdnd.selectdb.com/assets/images/doris-data-loading-performance-2-ad5092021a47b02cb0a874cd5511ea0f.png" width="1280" height="574" class="img_ev3q"></p><ul><li><strong>70% faster response of Stream Load</strong></li></ul><p><img loading="lazy" alt="70% faster response of Stream Load" src="https://cdnd.selectdb.com/assets/images/doris-data-loading-performance-3-32c579174b74d58a922ad4b29e03acd7.png" width="1280" height="459" class="img_ev3q"></p><p>Before putting the upgraded data platform in their production environment, NetEase has conducted extensive stress testing and grayscale testing. This is their experience in tackling errors along the way.</p><p><strong>1. Stream Load timeout:</strong></p><p> The early stage of stress testing often reported frequent timeout errors during data import. Additionally, despite the processes and cluster status being normal, the monitoring system couldn&#x27;t collect the correct BE metrics. The engineers obtained the Doris BE stack using Pstack and analyzed it with PT-PMT. They discovered that the root cause was the lack of HTTP chunked encoding or content-length settings when initiating requests. This led Doris to mistakenly consider the data transfer as incomplete, causing it to remain in a waiting state. The solution was to simply add a chunked encoding setting on the client side.</p><p><strong>2. Data size in a single Stream Load exceeding threshold:</strong> </p><p> The default limit is 100 MB. 
The solution was to increase <code>streaming_load_json_max_mb</code> to 250 MB.</p><p><strong>3. Error:</strong> <code>alive replica num 0 &lt; quorum replica num 1</code></p><p> By the <code>show backends</code> command, it was discovered that one BE node was in OFFLINE state. A lookup in the <code>be_custom</code> configuration file revealed a <code>broken_storage_path</code>. Further inspection of the BE logs located an error message &quot;too many open files,&quot; meaning the number of file handles opened by the BE process had exceeded the system&#x27;s limit, and this caused I/O operations to fail. When Doris detected such an abnormality, it marked the disk as unavailable. Because the table was configured with one single replica, when the disk holding the only replica was unavailable, data writing failed.</p><p> The solution was to increase the maximum open file descriptor limit for the process to 1 million, delete the <code>be_custom.conf</code> file, and restart the BE node.</p><p><strong>4. FE memory jitter</strong></p><p> During grayscale testing, the FE could not be connected. The monitoring data showed that the JVM&#x27;s 32 GB was exhausted, and the <code>bdb</code> directory under the FE&#x27;s meta directory had ballooned to 50 GB. Memory jitter occurred every hour, with peak memory usage reaching 80%.</p><p> The root cause was improper parameter configuration. During high-concurrency Stream Load operations, the FE records the related Load information. Each import adds about 200 KB of information to the memory. The cleanup time for such information is controlled by the <code>streaming_label_keep_max_second</code> parameter, which by default is 12 hours. Reducing this to 5 minutes can prevent the FE memory from being exhausted. However, they didn&#x27;t modify the <code>label_clean_interval_second</code> parameter, which controls the interval of the label cleanup thread. 
The default value of this parameter is 1 hour, which explains the hourly memory jitter. </p><p> The solution was to dial down <code>label_clean_interval_second</code> to 5 minutes.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="query">Query<a href="#query" class="hash-link" aria-label="Direct link to Query" title="Direct link to Query"></a></h3><p>The engineers found results that did not match the filtering conditions in a query on the Eagle monitoring platform. </p><p><img loading="lazy" alt="Doris Query Optimization" src="https://cdnd.selectdb.com/assets/images/doris-query-optimization-9a78bd121d00c488676981931cf1e981.png" width="1280" height="936" class="img_ev3q"></p><p>This was due to the engineers&#x27; misconception of <code>match_all</code> in Apache Doris. <code>match_all</code> identifies data records that include all the specified tokens while tokenization is based on space and punctuation marks. In the unqualified result, although the timestamp did not match, the message included &quot;29&quot;, which compensated for the unmatched part in the timestamp. That&#x27;s why this data record was included as a query result.</p><p><img loading="lazy" alt="Doris Query Optimization" src="https://cdnd.selectdb.com/assets/images/doris-query-optimization-2-778c89a665a9de4e41aacc256b099954.png" width="1144" height="825" class="img_ev3q"></p><p>For Doris to produce what the engineers wanted in this query, <code>MATCH_PHRASE</code> should be used instead, because it also identifies the sequence of texts. 
</p><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">SELECT</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">FROM</span><span class="token plain"> table_name </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">WHERE</span><span class="token plain"> logmsg MATCH_PHRASE </span><span class="token string" style="color:rgb(255, 121, 198)">&#x27;keyword1 keyword2&#x27;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Note that when using <code>MATCH_PHRASE</code>, you should enable <code>support_phrase</code> during index creation. 
Otherwise, the system will perform a full table scan and a hard match, resulting in poor query efficiency.</p><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">INDEX</span><span class="token plain"> idx_name4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token plain">column_name4</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">USING</span><span class="token plain"> INVERTED PROPERTIES</span><span class="token punctuation" style="color:rgb(248, 248, 242)">(</span><span class="token string" style="color:rgb(255, 121, 198)">&quot;parser&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;english&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;support_phrase&quot;</span><span class="token plain"> </span><span class="token operator">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(255, 121, 198)">&quot;true&quot;</span><span class="token punctuation" style="color:rgb(248, 248, 242)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" 
class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>If you want to enable <code>support_phrase</code> for existing tables that have already been populated with data, you can execute <code>DROP INDEX</code> and then <code>ADD INDEX</code> to replace the old index with a new one. This process is incremental and does not require rewriting the entire table.</p><p><strong>This is another advantage of Doris compared to Elasticsearch: It supports more flexible index management and allows easy addition and removal of indexes.</strong></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>Apache Doris supports the log and time series data analytic workloads of NetEase with higher query performance and less storage consumption. Beyond these, Apache Doris has other capabilities such as data lake analysis since it is designed as an all-in-one big data analytic platform. 
If you want a quick evaluation of whether Doris is right for your use case, come talk to the Doris makers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/cross-cluster-replication-for-read-write">Cross-cluster replication for read-write separation: story of a grocery store brand</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2024-04-25T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">April 25, 2024</time></div></header><div class="markdown" itemprop="articleBody"><p>This is about how a grocery store brand leverages the <a href="https://doris.apache.org/docs/2.0/admin-manual/data-admin/ccr" target="_blank" rel="noopener noreferrer">Cross-Cluster Replication (CCR)</a> capability of Apache Doris to separate their data reading and writing workloads. In this case, where the freshness of groceries is guaranteed by the freshness of data, they use Apache Doris as their data warehouse to monitor and analyze their procurement, sale, and stock in real time for all their stores and supply chains. 
</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="why-they-need-ccr">Why they need CCR<a href="#why-they-need-ccr" class="hash-link" aria-label="Direct link to Why they need CCR" title="Direct link to Why they need CCR"></a></h2><p>A major part of the user&#x27;s data warehouse (including the ODS, DWD, DWS, and ADS layers) is built within Apache Doris, which employs a micro-batch scheduling mechanism to coordinate data across the data warehouse layers. However, this is pressured by the burgeoning business of the grocery store brand. The data size they have to receive, store, and analyze is getting larger and larger. That means their data warehouse has to handle bigger data writing batches and more frequent data queries. However, task scheduling during query execution might lead to resource preemption, so any resource shortage can easily compromise performance or even cause task failure or system disruption.</p><p> Naturally, the user thought of <strong>separating the reading and writing workloads.</strong> Specifically, they want to replicate data from the ADS layer (which is cleaned, transformed, aggregated, and ready to be queried) to a backup cluster dedicated to query services. <strong>This is implemented by the CCR in Apache Doris.</strong> It prevents abnormal queries from interrupting data writing and ensures cluster stability. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="before-ccr">Before CCR<a href="#before-ccr" class="hash-link" aria-label="Direct link to Before CCR" title="Direct link to Before CCR"></a></h2><p>Before CCR was available, they innovatively adopted the <a href="https://doris.apache.org/docs/2.0/lakehouse/lakehouse-overview#multi-catalog" target="_blank" rel="noopener noreferrer">Multi-Catalog</a> feature of Doris for the same purpose. Multi-Catalog allows users to connect Doris to various data sources conveniently. It is actually designed for federated querying, but the user drew inspiration from it. 
They wrote a script and tried to pull incremental data via Catalog. Their data synchronization pipeline is as follows:</p><p><img loading="lazy" alt="Before CCR" src="https://cdnd.selectdb.com/assets/images/before-ccr-079a13cf3fe218976cce0015a6c6c752.jpeg" width="1280" height="1003" class="img_ev3q"></p><p>They loaded data from the source cluster to the target cluster by regular scheduling tasks. To identify incremental data, they added a <code>last_update_time</code> field to the tables. There were two downsides to this. Firstly, the data freshness of the target cluster was reliant on and hindered by the scheduling tasks. Secondly, for incremental data ingestion, in order to identify incremental data, the import SQL statement for every table has to include the logic to check the <code>last_update_time</code> field, otherwise the system just deletes and re-imports the entire table. Such requirement increases development complexity and data error rate. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ccr-in-apache-doris">CCR in Apache Doris<a href="#ccr-in-apache-doris" class="hash-link" aria-label="Direct link to CCR in Apache Doris" title="Direct link to CCR in Apache Doris"></a></h2><p>Just when they were looking for a better solution, Apache Doris released CCR in version 2.0. Compared to the alternatives they&#x27;ve tried, CCR in Apache Doris is:</p><ul><li><p><strong>Lightweight in design</strong>: The data synchronization tasks consume very few machine resources. They run smoothly without reducing the overall performance of Apache Doris.</p></li><li><p><strong>Easy to use</strong>: It can be configured by one simple <code>POST</code> request.</p></li><li><p><strong>Unlimited in migration</strong>: Users can raise the upper limit of the data migration capabilities in CCR by optimizing their cluster configuration. 
</p></li><li><p><strong>Consistent in data</strong>: The DDL statements executed in the source cluster can be automatically synchronized into the target cluster, ensuring data consistency.</p></li><li><p><strong>Flexible in synchronization</strong>: It is able to perform both full data synchronization and incremental data synchronization.</p></li></ul><p>To start CCR in Doris simply requires two steps. Step one is to enable binlogs in both the source cluster and the target cluster. Step two is to send the name of the database or table to be replicated. Then the system will start synchronizing full or incremental data. The detailed workflow is as follows: </p><p><img loading="lazy" alt="CCR in Apache Doris" src="https://cdnd.selectdb.com/assets/images/ccr-in-apache-doris-31b9554f59ba15f637a5c54778915973.jpeg" width="1280" height="335" class="img_ev3q"></p><p>In the grocery store brand&#x27;s case, they need to synchronize a few tables from the source cluster to the target cluster, each table having an incremental data size of about 50 million rows. After a month&#x27;s trial run, the Doris CCR mechanism is proven to be stable and performant:</p><ul><li><p><strong>Higher stability and data accuracy</strong>: No replication failure has ever occurred during the trial period. Every data row is transferred and landed in the target cluster accurately. 
</p></li><li><p><strong>Streamlined workflows:</strong></p><ul><li><strong>Before CCR</strong>: The user had to write SQL for each table and write data via Catalog; For tables without a <code>last_update_time</code> field, incremental data synchronization can only be implemented by full-table deletion and re-import.</li></ul><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">Insert</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">into</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">destination_table_1 </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">source_table1 </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">where</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">time</span><span class="token plain"> </span><span class="token operator">&gt;</span><span 
class="token plain"> xxx</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">Insert</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">into</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">destination_table_2 </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">source_table2 </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">where</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">time</span><span class="token plain"> </span><span class="token operator">&gt;</span><span class="token plain"> xxx</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"></span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">Insert</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">into</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 
242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">destination_table_x </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">select</span><span class="token plain"> </span><span class="token operator">*</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(189, 147, 249);font-style:italic">from</span><span class="token plain"> catalog1</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">db</span><span class="token punctuation" style="color:rgb(248, 248, 242)">.</span><span class="token plain">source_table_x</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li><strong>After CCR</strong>: It only requires one <code>POST</code> request to synchronize an entire database.</li></ul><div class="language-sql codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-sql codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">curl </span><span class="token operator">-</span><span class="token plain">X POST </span><span class="token operator">-</span><span class="token plain">H </span><span class="token 
string" style="color:rgb(255, 121, 198)">&quot;Content-Type: application/json&quot;</span><span class="token plain"> </span><span class="token operator">-</span><span class="token plain">d </span><span class="token string" style="color:rgb(255, 121, 198)">&#x27;{</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;name&quot;: &quot;ccr_test&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;src&quot;: {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;host&quot;: &quot;localhost&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;port&quot;: &quot;9030&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;thrift_port&quot;: &quot;9020&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;user&quot;: &quot;root&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;password&quot;: &quot;&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;database&quot;: &quot;demo&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;table&quot;: &quot;&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> },</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;dest&quot;: 
{</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;host&quot;: &quot;localhost&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;port&quot;: &quot;9030&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;thrift_port&quot;: &quot;9020&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;user&quot;: &quot;root&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;password&quot;: &quot;&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;database&quot;: &quot;ccrt&quot;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> &quot;table&quot;: &quot;&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token string" style="color:rgb(255, 121, 198)">}&#x27;</span><span class="token plain"> http:</span><span class="token comment" style="color:rgb(98, 114, 164)">//127.0.0.1:9190/create_ccr</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" 
class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div></li><li><p><strong>Faster data loading</strong>: With CCR, it only takes <strong>3~4 seconds</strong> to ingest a day&#x27;s incremental data, as compared to more than 30 seconds with the Catalog method. As for real-time synchronization, CCR can finish data ingestion in 1 second, without reliance on manual updates or regular scheduling.</p></li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>Using CCR in Apache Doris, the grocery store brand separates reading and writing workloads into different clusters and thus improves overall system stability. This solution delivers a real-time data synchronization latency of about 1 second. To further ensure normal functioning, it has a real-time monitoring and alerting mechanism so any issue will be notified and attended to instantly, and a contingency plan to guarantee uninterrupted query services. It also supports partition-based data synchronization (e.g. <code>ALTER TABLE tbl1 REPLACE PARTITION</code>). With demonstrated effectiveness of CCR, they are planning to replicate more of their data assets for efficient and secure data usage.</p><p>CCR is also applicable when you need to build multiple data centers or derive a test dataset from your production environment. 
For further guidance on CCR, join the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris community</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/breaking-down-data-silos-with-an-apache-doris-based-cdp">Breaking down data silos with a unified data warehouse: an Apache Doris-based CDP</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2024-03-05T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">March 5, 2024</time></div></header><div class="markdown" itemprop="articleBody"><p>The data silos problem is like arthritis for online business, because almost everyone gets it as they grow old. Businesses interact with customers via websites, mobile apps, H5 pages, and end devices. For one reason or another, it is tricky to integrate the data from all these sources. Data stays where it is and cannot be interrelated for further analysis. That&#x27;s how data silos come to form. The bigger your business grows, the more diversified customer data sources you will have, and the more likely you are trapped by data silos. </p><p>This is exactly what happens to the insurance company I&#x27;m going to talk about in this post. By 2023, they have already served over 500 million customers and signed 57 billion insurance contracts. 
When they started to build a customer data platform (CDP) to accommodate such a data size, they used multiple components. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-silos-in-cdp">Data silos in CDP<a href="#data-silos-in-cdp" class="hash-link" aria-label="Direct link to Data silos in CDP" title="Direct link to Data silos in CDP"></a></h2><p>Like most data platforms, their CDP 1.0 had a batch processing pipeline and a real-time streaming pipeline. Offline data was loaded, via Spark jobs, to Impala, where it was tagged and divided into groups. Meanwhile, Spark also sent it to NebulaGraph for OneID computation (elaborated later in this post). On the other hand, real-time data was tagged by Flink and then stored in HBase, ready to be queried.</p><p>That led to a component-heavy computation layer in the CDP: Impala, Spark, NebulaGraph, and HBase.</p><p><img loading="lazy" alt="apache doris data silos in CDP" src="https://cdnd.selectdb.com/assets/images/apache-doris-data-silos-in-CDP-df4e64a7cadc2fa6fca8de1807571aa4.png" width="1280" height="1060" class="img_ev3q"></p><p>As a result, offline tags, real-time tags, and graph data were scattered across multiple components. Integrating them for further data services was costly due to redundant storage and bulky data transfer. What&#x27;s more, due to discrepancies in storage, they had to expand the size of the CDH cluster and NebulaGraph cluster, adding to the resource and maintenance costs.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="apache-doris-based-cdp">Apache Doris-based CDP<a href="#apache-doris-based-cdp" class="hash-link" aria-label="Direct link to Apache Doris-based CDP" title="Direct link to Apache Doris-based CDP"></a></h2><p>For CDP 2.0, they decide to introduce a unified solution to clean up the mess. 
At the computation layer of CDP 2.0, <a href="https://doris.apache.org" target="_blank" rel="noopener noreferrer">Apache Doris</a> undertakes both real-time and offline data storage and computation. </p><p>To ingest <strong>offline data</strong>, they utilize the <a href="https://doris.apache.org/docs/data-operate/import/import-way/stream-load-manual" target="_blank" rel="noopener noreferrer">Stream Load</a> method. Their 30-thread ingestion test shows that it can perform over 300,000 upserts per second. To load <strong>real-time data</strong>, they use a combination of <a href="https://doris.apache.org/docs/ecosystem/flink-doris-connector" target="_blank" rel="noopener noreferrer">Flink-Doris-Connector</a> and Stream Load. In addition, in real-time reporting where they need to extract data from multiple external data sources, they leverage the <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/" target="_blank" rel="noopener noreferrer">Multi-Catalog</a> feature for <strong>federated queries</strong>. </p><p><img loading="lazy" alt="apache doris based-CDP" src="https://cdnd.selectdb.com/assets/images/apache-doris-based-CDP-be99e2c46e0588eb6d6540e0f557ddbb.png" width="1280" height="1068" class="img_ev3q"></p><p>The customer analytic workflows on this CDP go like this. First, they sort out customer information, then they attach tags to each customer. Based on the tags, they divide customers into groups for more targeted analysis and operation. </p><p>Next, I&#x27;ll delve into these workloads and show you how Apache Doris accelerates them. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="oneid">OneID<a href="#oneid" class="hash-link" aria-label="Direct link to OneID" title="Direct link to OneID"></a></h2><p>Has this ever happened to you when you have different user registration systems for your products and services? You might collect the email of UserID A from one product webpage, and later the social security number of UserID B from another. 
Then you find out that UserID A and UserID B actually belong to the same person because they go by the same phone number.</p><p>That&#x27;s why OneID arises as an idea. It is to pool the user registration information of all business lines into one large table in Apache Doris, sort it out, and make sure that one user has a unique OneID. </p><p>This is how they figure out which registration information belongs to the same user leveraging the functions in Apache Doris.</p><p><img loading="lazy" alt="apache doris OneID" src="https://cdnd.selectdb.com/assets/images/apache-doris-OneID-56d81b3a97eeeff7e9ce266e71263161.png" width="1280" height="543" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="tagging-services">Tagging services<a href="#tagging-services" class="hash-link" aria-label="Direct link to Tagging services" title="Direct link to Tagging services"></a></h2><p>This CDP accommodates information of <strong>500 million customers</strong>, which come from over <strong>500 source tables</strong> and are attached to over <strong>2000 tags</strong> in total.</p><p>By timeliness, the tags can be divided into real-time tags and offline tags. The real-time tags are computed by Apache Flink and written into the flat table in Apache Doris, while the offline tags are computed by Apache Doris as they are derived from the user attribute table, business table, and user behavior table in Doris. Here is the company&#x27;s best practice in data tagging: </p><p><strong>1. Offline tags:</strong></p><p>During the peaks of data writing, a full update might easily cause an OOM error given their huge data scale. To avoid that, they utilize the <a href="https://doris.apache.org/docs/data-operate/import/import-way/insert-into-manual" target="_blank" rel="noopener noreferrer">INSERT INTO SELECT</a> function of Apache Doris and enable <strong>partial column update</strong>. 
This will cut down memory consumption by a lot and maintain system stability during data loading.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">set enable_unique_key_partial_update=true;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">insert into tb_label_result(one_id, labelxx) </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">select one_id, label_value as labelxx</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">from .....</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>2. Real-time tags:</strong></p><p>Partial column update is also available for real-time tags, since even real-time tags are updated at different paces. 
All that is needed is to set <code>partial_columns</code> to <code>true</code>.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">curl --location-trusted -u root: -H &quot;partial_columns:true&quot; -H &quot;column_separator:,&quot; -H &quot;columns:id,balance,last_access_time&quot; -T /tmp/test.csv http://127.0.0.1:48037/api/db1/user_profile/_stream_load</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>3. High-concurrency point queries:</strong></p><p>With its current business size, the company is receiving query requests for tags at a concurrency level of over 5000 QPS. They use a combination of strategies to guarantee high performance. Firstly, they adopt <a href="https://doris.apache.org/docs/query-acceleration/hight-concurrent-point-query#using-preparedstatement" target="_blank" rel="noopener noreferrer">Prepared Statement</a> for pre-compilation and pre-execution of SQL. Secondly, they fine-tune the parameters for Doris Backend and the tables to optimize storage and execution. 
Lastly, they enable <a href="https://doris.apache.org/docs/query-acceleration/hight-concurrent-point-query#enable-row-cache" target="_blank" rel="noopener noreferrer">row cache</a> as a complement to the column-oriented Apache Doris.</p><ul><li>Fine-tune Doris Backend parameters in <code>be.conf</code>:</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">disable_storage_row_cache = false </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">storage_page_cache_limit=40%</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li>Fine-tune table parameters upon table creation:</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">enable_unique_key_merge_on_write = true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">store_row_column 
= true</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">light_schema_change = true</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>4. Tag computation (join):</strong></p><p>In practice, many tagging services are implemented by multi-table joins in the database. That often involves more than 10 tables. For optimal computation performance, they adopt the <a href="https://doris.apache.org/docs/query-acceleration/join-optimization/colocation-join" target="_blank" rel="noopener noreferrer">colocation group</a> strategy in Doris. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="customer-grouping">Customer Grouping<a href="#customer-grouping" class="hash-link" aria-label="Direct link to Customer Grouping" title="Direct link to Customer Grouping"></a></h2><p>The customer grouping pipeline in CDP 2.0 goes like this: Apache Doris receives SQL from customer service, executes the computation, and sends the result set to S3 object storage via SELECT INTO OUTFILE. The company has divided their customers into 1 million groups. The customer grouping task that used to take <strong>50 seconds in Impala</strong> to finish now only needs <strong>10 seconds in Doris</strong>. 
</p><p><img loading="lazy" alt="apache doris customer grouping" src="https://cdnd.selectdb.com/assets/images/apache-doris-customer-grouping-7c42996acf6d17eb8be01be7848e6ee6.png" width="1280" height="402" class="img_ev3q"></p><p>Apart from grouping the customers for more fine-grained analysis, sometimes they do analysis in a reverse direction. That is, to target a certain customer and find out to which groups he/she belongs. This helps analysts understand the characteristics of customers as well as how different customer groups overlap.</p><p>In Apache Doris, this is implemented by the BITMAP functions: <code>BITMAP_CONTAINS</code> is a fast way to check if a customer is part of a certain group, and <code>BITMAP_OR</code>, <code>BITMAP_INTERSECT</code>, and <code>BITMAP_XOR</code> are the choices for cross analysis. </p><p><img loading="lazy" alt="apache doris bitmap" src="https://cdnd.selectdb.com/assets/images/apache-doris-bitmap-da70b0e27411c1ef101d8f48731ba27e.png" width="1280" height="649" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>From CDP 1.0 to CDP 2.0, the insurance company adopts Apache Doris, a unified data warehouse, to replace Spark+Impala+HBase+NebulaGraph. That increases their data processing efficiency by breaking down the data silos and streamlining data processing pipelines. In CDP 3.0 to come, they want to group their customers by combining real-time tags and offline tags for more diversified and flexible analysis. 
The <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris community</a> and the <a href="https://www.velodb.io" target="_blank" rel="noopener noreferrer">VeloDB</a> team will continue to be a supporting partner during this upgrade.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/a-financial-anti-fraud-solution-based-on-the-apache-doris-data-warehouse">A financial anti-fraud solution based on the Apache Doris data warehouse</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2024-02-22T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">February 22, 2024</time></div></header><div class="markdown" itemprop="articleBody"><p>Financial fraud prevention is a race against time. Implementation-wise, it relies heavily on the data processing power, especially under large datasets. Today I&#x27;m going to share with you the use case of a retail bank with over 650 million individual customers. They have compared analytics components including <a href="https://doris.apache.org" target="_blank" rel="noopener noreferrer">Apache Doris</a>, ClickHouse, Greenplum, Cassandra, and Kylin. 
After 5 rounds of deployment and comparison based on 89 custom test cases, they settled on Apache Doris, because they witnessed a six-fold writing speed and faster multi-table joins in Apache Doris as compared to the mighty ClickHouse.</p><p>I will get into details about how the bank builds their fraud risk management platform based on Apache Doris and how it performs. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="fraud-risk-management-platform">Fraud Risk Management Platform<a href="#fraud-risk-management-platform" class="hash-link" aria-label="Direct link to Fraud Risk Management Platform" title="Direct link to Fraud Risk Management Platform"></a></h2><p>In this platform, <strong>80% of ad-hoc queries</strong> return results in less than <strong>2 seconds,</strong> and <strong>95%</strong> of them are finished in under <strong>5 seconds.</strong> On average, the solution <strong>intercepts tens of thousands of suspicious transactions</strong> every day and <strong>avoids losses of millions of dollars</strong> for bank customers. </p><p>This is an overview of the entire platform from an architectural perspective. </p><p><img loading="lazy" alt="Fraud Risk Management Platform" src="https://cdnd.selectdb.com/assets/images/fraud-risk-management-platform-262b039604139527d92106f9c6a67847.png" width="1280" height="530" class="img_ev3q"></p><p>The <strong>source data</strong> can be roughly categorized as:</p><ul><li>Dimension data: mostly stored in PostgreSQL</li><li>Real-time transaction data: decoupled from various external systems via Kafka message queues</li><li>Offline data: directly ingested from external systems to Hive, making data reconciliation easy</li></ul><p>For <strong>data ingestion</strong>, this is how they collect the three types of source data. 
First of all, they leverage the <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/jdbc" target="_blank" rel="noopener noreferrer">JDBC Catalog</a> to synchronize metadata and user data from PostgreSQL. </p><p>The transaction data needs to be combined with dimension data for further analysis. Thus, they employ a Flink SQL API to read dimension data from PostgreSQL, and real-time transaction data from Kafka. Then, in Flink, they do multi-stream joins and generate wide tables. For real-time refreshing of dimension tables, they use a Lookup Join mechanism, which dynamically looks up and refreshes dimension data when processing data streams. They also utilize Java UDFs to serve their specific needs in ETL. After that, they write the data into Apache Doris via the<a href="https://doris.apache.org/docs/ecosystem/flink-doris-connector/" target="_blank" rel="noopener noreferrer"> Flink-Doris-Connector</a>. </p><p>The offline data is cleaned, transformed, and written into Hive, Kafka, and PostgreSQL, for which Doris creates catalogs as mappings, based on its <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/" target="_blank" rel="noopener noreferrer">Multi-Catalog</a> capability, to facilitate federated analysis. In this process, Hive Metastore is in place to access and refresh data from Hive automatically.</p><p>In terms of <strong>data modeling</strong>, they use Apache Doris as a data warehouse and apply different <a href="https://doris.apache.org/docs/data-table/data-model" target="_blank" rel="noopener noreferrer">data models</a> for different layers. Each layer aggregates or rolls up data from the previous layer at a coarser granularity. Eventually, it produces a highly aggregated Rollup or Materialized View. </p><p>Now let me show you what analytics tasks are running on this platform. 
Based on the scale of monitoring and human involvement, these tasks can be divided into real-time risk reporting, multi-dimensional analysis, federated queries, and auto alerting. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="real-time-risk-report">Real-time risk report<a href="#real-time-risk-report" class="hash-link" aria-label="Direct link to Real-time risk report" title="Direct link to Real-time risk report"></a></h2><p>When it comes to fraud prevention, what is diminishing the effectiveness of your anti-fraud efforts? It is incomplete exposure of potential risks and untimely risk identification. That&#x27;s why people always want real-time, full-scale monitoring and reporting.</p><p>The bank&#x27;s solution to that is built on Apache Flink and Apache Doris. First of all, they put together the 17 dimensions. After cleaning, aggregation, and other computations, they visualize the data on a real-time dashboard. </p><p>As for <strong>scale</strong>, it analyzes the workflows of <strong>over 10 million customers, 30,000 clerks, 10,000 branches, and 1000 products</strong>. </p><p>As for <strong>speed</strong>, the bank now has evolved from next-day data refreshing to near real-time data processing. Targeted analysis can be done within minutes instead of hours. The solution also supports complicated ad-hoc queries to capture underlying risks by monitoring how the data models and rules run. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="multi-dimensional-analysis-to-identify-risks">Multi-dimensional analysis to identify risks<a href="#multi-dimensional-analysis-to-identify-risks" class="hash-link" aria-label="Direct link to Multi-dimensional analysis to identify risks" title="Direct link to Multi-dimensional analysis to identify risks"></a></h2><p>Case tracing is another common anti-fraud practice. The bank has a fraud model library. 
Based on the fraud models, they analyze the risks of each transaction and visualize the results in near real time, so their staff can take prompt measures if needed. </p><p>For that purpose, they use Apache Doris for <strong>multi-dimensional analysis</strong> of cases. They check the patterns of transactions, including sources, types, and time, for a comprehensive overview. During this process, they often need to combine <strong>over 10 filtering conditions</strong> of different dimensions. This is empowered by the <strong>ad-hoc query</strong> capabilities of Apache Doris. Both rule-based matching and list-based matching of cases can be done <strong>within seconds</strong> without manual efforts.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="federated-queries-to-locate-risk-details">Federated queries to locate risk details<a href="#federated-queries-to-locate-risk-details" class="hash-link" aria-label="Direct link to Federated queries to locate risk details" title="Direct link to Federated queries to locate risk details"></a></h2><p>Apart from identifying risks from each transaction, the bank also receives risk reports from customers. In these cases, the corresponding transaction will be labeled as &quot;risky&quot;, and it will be categorized and recorded in the ticketing system. The labels make sure that the high-risk transactions are promptly attended to. </p><p>One problem is that, the ticketing system is overloaded with such data, so it is not able to directly present all the details of the risky transactions. What needs to be done is to relate the tickets to the transaction details so the bank staff can locate the actual risks. </p><p>How is that implemented? Every day, Apache Doris traverses the incremental tickets and the basic information table to get the ticket IDs, and then it relates the ticket IDs to the dimension data stored in itself. At the end, the ticket details are presented at the frontend of Doris. 
This entire process takes <strong>only a few minutes</strong>. This is a big game changer compared to the old time when they had to manually look up the suspicious transaction.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="auto-alerting">Auto alerting<a href="#auto-alerting" class="hash-link" aria-label="Direct link to Auto alerting" title="Direct link to Auto alerting"></a></h2><p>Based on Apache Doris, the bank designs their own alerting rules, models, and strategies. The system monitors how everything runs. Once it detects a situation that matches the alert rules, it will trigger an alarm. They have also established a real-time feedback mechanism for the alerting rules, so if a newly added rule causes any negative effects, it will be adjusted or removed rapidly. </p><p>So far, the bank has added nearly 100 alerting rules for various risk types to the system. During the past two months, <strong>over 100 alarms</strong> were issued with <strong>over 95% accuracy</strong> in less than <strong>5 seconds</strong> after the risk situation arises. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>For a comprehensive anti-fraud solution, the bank conducts full-scale real-time monitoring and reporting for all their data workflows. Then, for each transaction, they look into the multiple dimensions of it to identify risks. For the suspicious transactions reported by the bank customers, they perform federated queries to retrieve the full details of them. Also, an auto alerting mechanism is always on patrol to safeguard the whole system. These are the various types of analytic workloads in this solution. The implementation of them relies on the capabilities of Apache Doris, which is a data warehouse designed to be an all-in-one platform for various workloads. 
If you try to build your own anti-fraud solution, the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris open source developers</a> are happy to exchange ideas with you.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/a-fast-secure-high-available-real-time-data-warehouse-based-on-apache-doris">Financial data warehousing: fast, secure, and highly available with Apache Doris</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2024-01-08T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">January 8, 2024</time></div></header><div class="markdown" itemprop="articleBody"><p>This is a whole-journey guide for Apache Doris users, especially those from the financial sector which requires a high level of data security and availability. If you don&#x27;t know how to build a real-time data pipeline and make the most of the <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a> functionalities, start with this post and you will be loaded with inspiration after reading.</p><p>This is the best practice of a non-banking payment service provider that serves over 25 million retailers and processes data from 40 million end devices. Data sources include MySQL, Oracle, and MongoDB. 
They were using Apache Hive as an offline data warehouse but feeling the need to add a real-time data processing pipeline. <strong>After introducing Apache Doris, they increase their data ingestion speed by 2~5 times, ETL performance by 3~12 times, and query execution speed by 10~15 times.</strong></p><p>In this post, you will learn how to integrate Apache Doris into your data architecture, including how to arrange data inside Doris, how to ingest data into it, and how to enable efficient data updates. Plus, you will learn about the enterprise features that Apache Doris provides to guarantee data security, system stability, and service availability.</p><p><img loading="lazy" src="https://cdn.selectdb.com/static/offline_vs_real_time_data_warehouse_6b3fd0d1bc.png" alt="offline-vs-real-time-data-warehouse" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="building-a-real-time-data-warehouse-with-apache-doris">Building a real-time data warehouse with Apache Doris<a href="#building-a-real-time-data-warehouse-with-apache-doris" class="hash-link" aria-label="Direct link to Building a real-time data warehouse with Apache Doris" title="Direct link to Building a real-time data warehouse with Apache Doris"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="choice-of-data-models">Choice of data models<a href="#choice-of-data-models" class="hash-link" aria-label="Direct link to Choice of data models" title="Direct link to Choice of data models"></a></h3><p>Apache Doris arranges data with three data models. The main difference between these models lies in whether or how they aggregate data.</p><ul><li><strong><a href="https://doris.apache.org/docs/data-table/data-model#duplicate-model" target="_blank" rel="noopener noreferrer">Duplicate Key model</a></strong>: for detailed data queries. 
It supports ad-hoc queries of any dimension.</li><li><strong><a href="https://doris.apache.org/docs/data-table/data-model#unique-model" target="_blank" rel="noopener noreferrer">Unique Key model</a></strong>: for use cases with data uniqueness constraints. It supports precise deduplication, multi-stream upserts, and partial column updates.</li><li><strong><a href="https://doris.apache.org/docs/data-table/data-model#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate Key model</a></strong>: for data reporting. It accelerates data reporting by pre-aggregating data.</li></ul><p>The financial user adopts different data models in different data warehouse layers:</p><ul><li><strong>ODS - Duplicate Key model</strong>: As a payment service provider, the user receives a million settlement data every day. Since the settlement cycle can span a whole year, the relevant data needs to be kept intact for a year. Thus, the proper way is to put it in the Duplicate Key model, which does not perform any data aggregations. An exception is that some data is prone to constant changes, like order status from retailers. Such data should be put into the Unique Key model so that the newly updated record of the same retailer ID or order ID will always replace the old one.</li><li><strong>DWD &amp; DWS - Unique Key model</strong>: Data in the DWD and DWS layers are further abstracted, but it is all put in the Unique Key model so that the settlement data can be automatically updated.</li><li><strong>ADS - Aggregate Key model</strong>: Data is highly abstracted in this layer. 
It is pre-aggregated to mitigate the computation load of downstream analytics.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="partitioning-and-bucketing-strategies">Partitioning and bucketing strategies<a href="#partitioning-and-bucketing-strategies" class="hash-link" aria-label="Direct link to Partitioning and bucketing strategies" title="Direct link to Partitioning and bucketing strategies"></a></h3><p>The idea of partitioning and bucketing is to &quot;cut&quot; data into smaller pieces to increase data processing speed. The key is to set an appropriate number of data partitions and buckets. Based on their use case, the user tailors the bucketing field and bucket number to each table. For example, they often need to query the dimensional data of different retailers from the retailer flat table, so they specify the retailer ID column as the bucketing field, and list the recommended bucket number for various data sizes.</p><p><img loading="lazy" src="https://cdn.selectdb.com/static/partitioning_and_bucketing_strategies_c91ad6a340.png" alt="partitioning-and-bucketing-strategies" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="multi-source-data-migration">Multi-source data migration<a href="#multi-source-data-migration" class="hash-link" aria-label="Direct link to Multi-source data migration" title="Direct link to Multi-source data migration"></a></h3><p>In the adoption of Apache Doris, the user had to migrate all local data from their branches into Doris, which was when they found out their branches were using <strong>different databases</strong> and had <strong>data files of very different formats</strong>, so the migration could be a mess.</p><p><img loading="lazy" src="https://cdn.selectdb.com/static/multi_source_data_migration_2b4f54e005.png" alt="multi-source-data-migration" class="img_ev3q"></p><p>Luckily, Apache Doris supports a rich collection of data integration methods for both real-time data streaming and offline data 
import.</p><ul><li><strong>Real-time data streaming</strong>: Apache Doris fetches MySQL Binlogs in real time. Part of them is written into Doris directly via Flink CDC, while the high-volume ones are synchronized into Kafka for peak shaving, and then written into Doris via the Flink-Doris-Connector.</li><li><strong>Offline data import</strong>: This includes more diversified data sources and data formats. Historical data and incremental data from S3 and HDFS will be ingested into Doris via the <a href="https://doris.apache.org/docs/data-operate/import/import-way/broker-load-manual" target="_blank" rel="noopener noreferrer">Broker Load</a> method, data from Hive or JDBC will be synchronized to Doris via the <a href="https://doris.apache.org/docs/data-operate/import/import-way/insert-into-manual" target="_blank" rel="noopener noreferrer">Insert Into</a> method, and files will be loaded to Doris via the Flink-Doris-Connector and Flink FTP Connector. (FTP is how the user transfers files across systems internally, so they developed the Flink-FTP-Connector to support the complicated data formats and multiple newline characters in data.)</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="full-data-ingestion-and-incremental-data-ingestion">Full data ingestion and incremental data ingestion<a href="#full-data-ingestion-and-incremental-data-ingestion" class="hash-link" aria-label="Direct link to Full data ingestion and incremental data ingestion" title="Direct link to Full data ingestion and incremental data ingestion"></a></h3><p>To ensure business continuity and data accuracy, the user figures out the following ways to ingest full data and incremental data:</p><ul><li><strong>Full data ingestion</strong>: Create a temporary table of the target schema in Doris, ingest full data into the temporary table, and then use the <code>ALTER TABLE t1 REPLACE WITH TABLE t2</code> statement for atomic replacement of the regular table with the temporary table. 
This method prevents interruptions to queries on the frontend.</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">alter table ${DB_NAME}.${TBL_NAME} drop partition IF EXISTS p${P_DOWN_DATE};</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">ALTER TABLE ${DB_NAME}.${TBL_NAME} ADD PARTITION IF NOT EXISTS p${P_DOWN_DATE} VALUES [(&#x27;${P_DOWN_DATE}&#x27;), (&#x27;${P_UP_DATE}&#x27;));</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">LOAD LABEL ${TBL_NAME}_${load_timestamp} ...</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li><strong>Incremental data ingestion</strong>: Create a new data partition to accommodate incremental data.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="offline-data-processing">Offline data processing<a href="#offline-data-processing" class="hash-link" aria-label="Direct link to Offline data processing" title="Direct link to Offline data processing"></a></h3><p>The 
user has moved their offline data processing workload to Apache Doris and thus <strong>increased execution speed by 5 times</strong>. </p><p><img loading="lazy" src="https://cdn.selectdb.com/static/offline_data_processing_82e20fc59a.png" alt="offline-data-processing" class="img_ev3q"></p><ul><li><strong>Before</strong>: The old Hive-based offline data warehouse used the TEZ execution engine to process 30 million new data records every day. With 2TB computation resources, the whole pipeline took 2.5 hours. </li><li><strong>After</strong>: Apache Doris finishes the same tasks within only 30 minutes and consumes only 1TB. Script execution takes only 10 seconds instead of 8 minutes.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="enterprise-features-for-financial-players">Enterprise features for financial players<a href="#enterprise-features-for-financial-players" class="hash-link" aria-label="Direct link to Enterprise features for financial players" title="Direct link to Enterprise features for financial players"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="multi-tenant-resource-isolation">Multi-tenant resource isolation<a href="#multi-tenant-resource-isolation" class="hash-link" aria-label="Direct link to Multi-tenant resource isolation" title="Direct link to Multi-tenant resource isolation"></a></h3><p>This is required because it often happens that the same piece of data is requested by multiple teams or business systems. These tasks can lead to resource preemption and thus performance decrease and system instability.</p><p><strong>Resource limit for different workloads</strong></p><p>The user classifies their analytics workloads into four types and sets a resource limit for each of them. 
In particular, they have four different types of Doris accounts and set a limit on the CPU and memory resources for each type of account.</p><p><img loading="lazy" src="https://cdn.selectdb.com/static/multi_tenant_resource_isolation_772a57a4f1.png" alt="multi-tenant-resource-isolation" class="img_ev3q"></p><p>In this way, when one tenant requires excessive resources, it will only compromise its own efficiency but not affect other tenants.</p><p><strong>Resource tag-based isolation</strong></p><p>For data security under the parent-subsidiary company hierarchy, the user has set isolated resource groups for the subsidiaries. Data of each subsidiary is stored in its own resource group with three replicas, while data of the parent company is stored with four replicas: three in the parent company resource group, and the other one in the subsidiary resource group. Thus, when an employee from a subsidiary requests data from the parent company, the query will only be executed in the subsidiary resource group. Specifically, they take these steps:</p><p><img loading="lazy" src="https://cdn.selectdb.com/static/resource_tag_based_isolation_442e20f09c.png" alt=" resource-tag-based-isolation" class="img_ev3q"></p><p><strong>Workload group</strong></p><p>The resource tag-based isolation plan ensures isolation on a physical level, but as Apache Doris developers, we want to further optimize resource utilization and pursue more fine-grained resource isolation. For these purposes, we released the <a href="https://doris.apache.org/docs/admin-manual/workload-group" target="_blank" rel="noopener noreferrer">Workload Group</a> feature in <a href="https://doris.apache.org/blog/release-note-2.0.0" target="_blank" rel="noopener noreferrer">Apache Doris 2.0</a>. </p><p>The Workload Group mechanism relates queries to workload groups, which limit the share of CPU and memory resources of the backend nodes that a query can use. 
When cluster resources are in short supply, the biggest queries will stop execution. On the contrary, when there are plenty of available cluster resources and a workload group requires more resources than the limit, it will get assigned with the idle resources proportionately. </p><p>The user is actively planning their transition to the Workload Group plan and utilizing the task prioritizing mechanism and query queue feature to organize the execution order.</p><p><strong>Fine-grained user privilege management</strong></p><p>For regulation and compliance reasons, this payment service provider implements strict privilege control to make sure that everyone only has access to what they are supposed to access. This is how they do it:</p><ul><li><strong>User privilege setting</strong>: System users of different subsidiaries or with different business needs are granted different data access privileges.</li><li><strong>Privilege control over databases, tables, and rows</strong>: The <code>ROW POLICY</code> mechanism of Apache Doris makes these operations easy.</li><li><strong>Privilege control over columns</strong>: This is done by creating views.</li></ul><p><img loading="lazy" src="https://cdn.selectdb.com/static/fine_grained_user_privilege_management_f0cd060011.png" alt="fine-grained-user-privilege-management.png" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="cluster-stability-guarantee">Cluster stability guarantee<a href="#cluster-stability-guarantee" class="hash-link" aria-label="Direct link to Cluster stability guarantee" title="Direct link to Cluster stability guarantee"></a></h3><ul><li><strong>Circuit Breaking</strong>: From time to time, system users might input faulty SQL, causing excessive resource consumption. A circuit-breaking mechanism is in place for that. 
It will promptly stop these resource-intensive queries and prevent interruption to the system.</li><li><strong>Data ingestion concurrency control</strong>: The user has a frequent need to integrate historical data into their data platform. That involves a lot of data modification tasks and might stress the cluster. To solve that, they turn on the <a href="https://doris.apache.org/docs/data-table/data-model#merge-on-write-of-unique-model" target="_blank" rel="noopener noreferrer">Merge-on-Write</a> mode in the Unique Key model, enable <a href="https://doris.apache.org/docs/advanced/best-practice/compaction#vertical-compaction" target="_blank" rel="noopener noreferrer">Vertical Compaction</a> and <a href="https://doris.apache.org/docs/advanced/best-practice/compaction#segment-compaction" target="_blank" rel="noopener noreferrer">Segment Compaction</a>, and tune the data compaction parameters to control data ingestion concurrency.</li><li><strong>Network traffic control</strong>: Considering their two clusters in different cities, they employ Quality of Service (QoS) strategies tailored to different scenarios for precise network isolation and ensuring network quality and stability.</li><li><strong>Monitoring and alerting</strong>: The user has integrated Doris with their internal monitoring and alerting platform so any detected issues will be notified via their messaging software and email in real time.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="cross-cluster-replication">Cross-cluster replication<a href="#cross-cluster-replication" class="hash-link" aria-label="Direct link to Cross-cluster replication" title="Direct link to Cross-cluster replication"></a></h3><p>Disaster recovery is crucial for the financial industry. The user leverages the Cross-Cluster Replication (CCR) capability and builds a dual-cluster solution. 
As the primary cluster undertakes all the queries, the major business data is also synchronized into the backup cluster and updated in real time, so that in the case of service downtime in the primary cluster, the backup cluster will take over swiftly and ensure business continuity.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>We appreciate the user for their active <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">communication</a> with us along the way and are glad to see so many Apache Doris features fit in their needs. They are also planning on exploring federated query, compute-storage separation, and auto maintenance with Apache Doris. We look forward to more best practice and feedback from them.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/apache-doris-speeds-up-data-reporting-tagging-and-data-lake-analytics">Apache Doris speeds up data reporting, tagging, and data lake analytics</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2023-12-27T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 27, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>As much as we say <a 
href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a> is an all-in-one data platform that is capable of various analytics workloads, it is always compelling to demonstrate that by real use cases. That&#x27;s why I would like to share this user story with you. It is about how they leverage the capabilities of Apache Doris in reporting, customer tagging, and data lake analytics and achieve high performance.</p><p>This fintech service provider is a long-term user of Apache Doris. They have almost 10 clusters for production, hundreds of Doris backend nodes, and thousands of CPU Cores. The total data size is near 1 PB. Every day, they have hundreds of workflows running simultaneously, receive almost 10 billion new data records, and respond to millions of data queries.</p><p>Before migrating to Apache Doris, they used ClickHouse, MySQL, and Elasticsearch. Then frictions arise from their ever-enlarging data size. They found it hard to scale out the ClickHouse clusters because there were too many dependencies. As for MySQL, they had to switch between various MySQL instances because one MySQL instance had its limits and cross-instance queries were not supported.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="reporting">Reporting<a href="#reporting" class="hash-link" aria-label="Direct link to Reporting" title="Direct link to Reporting"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="from-clickhouse--mysql-to-apache-doris">From ClickHouse + MySQL to Apache Doris<a href="#from-clickhouse--mysql-to-apache-doris" class="hash-link" aria-label="Direct link to From ClickHouse + MySQL to Apache Doris" title="Direct link to From ClickHouse + MySQL to Apache Doris"></a></h3><p>Data reporting is one of the major services they provide to their customers and they are bound by an SLA. 
They used to support such service with a combination of ClickHouse and MySQL, but they found significant fluctuations in their data synchronization duration, making it hard for them to meet the service levels outlined in their SLA. Diagnosis showed that it was because the multiple components add to the complexity and instability of data synchronization tasks. To fix that, they have used Apache Doris as a unified analytic engine to support data reporting. </p><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/from_clickhouse_mysql_to_apache_doris_6387c0363a.png" alt="from-clickhouse-mysql-to-apache-doris" width="840" style="display:inline-block" class="img_ev3q"></div><h3 class="anchor anchorWithStickyNavbar_LWe7" id="performance-improvements">Performance improvements<a href="#performance-improvements" class="hash-link" aria-label="Direct link to Performance improvements" title="Direct link to Performance improvements"></a></h3><p>With Apache Doris, they ingest data via the <a href="https://doris.apache.org/docs/1.2/data-operate/import/import-way/broker-load-manual" target="_blank" rel="noopener noreferrer">Broker Load</a> method and reach an SLA compliance rate of over 99% in terms of data synchronization performance.</p><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/data_synchronization_size_and_duration_327e4dc1fe.png" alt="data-synchronization-size-and-duration" width="640" style="display:inline-block" class="img_ev3q"></div><p>As for data queries, the Doris-based architecture maintains an <strong>average query response time</strong> of less than <strong>10s</strong> and a <strong>P90 response time</strong> of less than <strong>30s</strong>. This is a 50% speedup compared to the old architecture. 
</p><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/average_query_response_time_372d71ef16.png" alt="average-query-response-time" width="840" style="display:inline-block" class="img_ev3q"></div><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/query_response_time_percentile_756c6f6a71.png" alt="query-response-time-percentile" width="840" style="display:inline-block" class="img_ev3q"></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="tagging">Tagging<a href="#tagging" class="hash-link" aria-label="Direct link to Tagging" title="Direct link to Tagging"></a></h2><p>Tagging is a common operation in customer analytics. You assign labels to customers based on their behaviors and characteristics, so that you can divide them into groups and figure out targeted marketing strategies for each group of them. </p><p>In the old processing architecture where Elasticsearch was the processing engine, raw data was ingested and tagged properly. Then, it will be merged into JSON files and imported into Elasticsearch, which provides data services for analysts and marketers. In this process, the merging step was to reduce updates and relieve load for Elasticsearch, but it turned out to be a troublemaker:</p><ul><li>Any problematic data in any of the tags could spoil the entire merging operation and thus interrupt the data services.</li><li>The merging operation was implemented based on Spark and MapReduce and took up to 4 hours. Such a long time frame could encroach on marketing opportunities and lead to unseen losses.</li></ul><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/tagging_services_3263e21c36.png" alt="tagging-services" width="840" style="display:inline-block" class="img_ev3q"></div><p>Then Apache Doris takes this over. Apache Doris arranges tag data with its data models, which process data fast and smoothly. 
The aforementioned merging step can be done by the <a href="https://doris.apache.org/docs/data-table/data-model#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate Key model</a>, which aggregates tag data based on the specified Aggregate Key upon data ingestion. The <a href="https://doris.apache.org/docs/data-table/data-model#unique-model" target="_blank" rel="noopener noreferrer">Unique Key model</a> is handy for partial column updates. Again, all you need is to specify the Unique Key. This enables swift and flexible data updating and saves you from the trouble of replacing the entire flat table. You can also put your detailed data into a <a href="https://doris.apache.org/docs/data-table/data-model#duplicate-model" target="_blank" rel="noopener noreferrer">Duplicate model</a> to speed up certain queries. <strong>In practice, it took the user 1 hour to finish the data ingestion, compared to 4 hours with the old architecture.</strong></p><p>In terms of query performance, Doris is equipped with well-developed bitmap indexes and techniques tailored to high-concurrency queries, so in this case, it can finish <strong>customer segmentation within seconds</strong> and reach over <strong>700 QPS in user-facing queries</strong>.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-lake-analytics">Data lake analytics<a href="#data-lake-analytics" class="hash-link" aria-label="Direct link to Data lake analytics" title="Direct link to Data lake analytics"></a></h2><p>In data lake scenarios, the data size you need to handle tends to be huge, but the data processing volume in each query tends to vary. To ensure fast data ingestion and high query performance of huge data sets, you need more resources. On the other hand, during non-peak time, you want to scale down your cluster for more efficient resource management. 
How do you handle this dilemma?</p><p>Apache Doris has a few features that are designed for data lake analytics, including Multi-Catalog and Compute Node. The former shields you from the headache of data ingestion in data lake analytics while the latter enables elastic cluster scaling.</p><p>The <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/?_highlight=multi&amp;_highlight=catalog" target="_blank" rel="noopener noreferrer">Multi-Catalog</a> mechanism allows you to connect Doris to a variety of external data sources so you can use Doris as a unified query gateway without worrying about bulky data ingestion into Doris.</p><p>The <a href="https://doris.apache.org/docs/advanced/compute-node/" target="_blank" rel="noopener noreferrer">Compute Node</a> of Apache Doris is a backend role that is designed for remote federated query workloads, like those in data lake analytics. Normal Doris backend nodes are responsible for both SQL query execution and data management, while the Compute Nodes in Doris, as the name implies, only perform computation. Compute Nodes are stateless, making them elastic enough for cluster scaling.</p><p>The user introduces Compute Nodes into their cluster and deploys them with other components in a hybrid configuration. As a result, the cluster automatically scales down during the night, when there are fewer query requests, and scales out during the daytime to handle the massive query workload. This is more resource-efficient.</p><p>For easier deployment, they have also optimized their Deploy on Yarn process via Skein. As is shown below, they define the number of Compute nodes and the required resources in the YAML file, and then pack the installation file, configuration file, and startup script into the distributed file system. 
In this way, they can start or stop the entire cluster of over 100 nodes within minutes using one simple line of code.</p><div style="text-align:center"><img loading="lazy" src="https://cdn.selectdb.com/static/skein_3516ba1a83.png" alt="skein" width="560" style="display:inline-block" class="img_ev3q"></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>For data reporting and customer tagging, Apache Doris smoothens data ingestion and merging steps, and delivers high query performance based on its own design and functionality. For data lake analytics, the user improves resource efficiency by elastic scaling of clusters using the Compute Node. Along their journey with Apache Doris, they have also developed a data ingestion task prioritizing mechanism and contributed it to the Doris project. A gesture to facilitate their use case ends up benefiting the whole <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">open source community</a>. 
This is a great example of open-source products thriving on user involvement.</p><p>Check Apache Doris <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">repo</a> on GitHub</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/from-elasticsearch-to-apache-doris-upgrading-an-observability-platform">From Elasticsearch to Apache Doris: upgrading an observability platform</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2023-12-14T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 14, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Observability platforms are akin to the immune system. Just like immune cells are everywhere in human bodies, an observability platform patrols every corner of your devices, components, and architectures, identifying any potential threats and proactively mitigating them. However, I might have gone too far with that metaphor, because till these days, we have never invented a system as sophisticated as the human body, but we can always make advancements.</p><p>The key to upgrading an observability platform is to increase data processing speed and reduce costs. 
This is based on two reasons:</p><ol><li>The faster you can identify abnormalities from your data, the more you can contain the potential damage.</li><li>An observability platform needs to store a sea of data, and low storage cost is the only way to make that sustainable.</li></ol><p>This post is about how GuanceDB, an observability platform, makes progress in these two aspects by replacing Elasticsearch with Apache Doris as its query and storage engine. <strong>The result is 70% less storage costs and 200%~400% data query performance.</strong></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="guancedb">GuanceDB<a href="#guancedb" class="hash-link" aria-label="Direct link to GuanceDB" title="Direct link to GuanceDB"></a></h2><p>GuanceDB is an all-around observability solution. It provides services including data analytics, data visualization, monitoring and alerting, and security inspection. From GuanceDB, users can have an understanding of their objects, network performance, applications, user experience, system availability, etc.</p><p>From the standpoint of a data pipeline, GuanceDB can be divided into two parts: data ingestion and data analysis. I will get to them one by one.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="data-integration">Data integration<a href="#data-integration" class="hash-link" aria-label="Direct link to Data integration" title="Direct link to Data integration"></a></h3><p>For data integration, GuanceDB uses its self-made tool called DataKit. It is an all-in-one data collector that extracts from different end devices, business systems, middleware, and data infrastructure. It can also preprocess data and relate it with metadata. It provides extensive support for data, from logs, and time series metrics, to data of distributed tracing, security events, and user behaviors from mobile APPs and web browsers. 
To cater to diverse needs across multiple scenarios, it ensures compatibility with various open-source probes and collectors as well as data sources of custom formats.</p><p><img loading="lazy" alt="observability-platform-architecture" src="https://cdnd.selectdb.com/assets/images/observability-platform-architecture-e6d61cc145b4fcaa0e8f81f9a3453836.png" width="2000" height="930" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="query--storage-engine">Query &amp; storage engine<a href="#query--storage-engine" class="hash-link" aria-label="Direct link to Query &amp; storage engine" title="Direct link to Query &amp; storage engine"></a></h3><p>Data collected by DataKit goes through the core computation layer and arrives in GuanceDB, which is a multi-model database that combines various database technologies. It consists of the query engine layer and the storage engine layer. By decoupling the query engine and the storage engine, it enables pluggable and interchangeable architecture. </p><p><img loading="lazy" alt="observability-platform-query-engine-storage-engine" src="https://cdnd.selectdb.com/assets/images/observability-platform-query-engine-storage-engine-59ec8b8bcce25f1d2e401c8ef964a742.png" width="2400" height="1060" class="img_ev3q"></p><p>For time series data, they built Metric Store, which is a self-developed storage engine based on VictoriaMetrics. For logs, they integrate Elasticsearch and OpenSearch. GuanceDB is performant in this architecture, while Elasticsearch demonstrates room for improvement:</p><ul><li><strong>Data writing</strong>: Elasticsearch consumes a big share of CPU and memory resources. It is not only costly but also disruptive to query execution.</li><li><strong>Schemaless support</strong>: Elasticsearch provides schemaless support by Dynamic Mapping, but that&#x27;s not enough to handle large amounts of user-defined fields. 
In this case, it can lead to field type conflict and thus data loss.</li><li><strong>Data aggregation</strong>: Large aggregation tasks often trigger a timeout error in Elasticsearch. </li></ul><p>So this is where the upgrade happens. GuanceDB tried and replaced Elasticsearch with <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a>. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="dql">DQL<a href="#dql" class="hash-link" aria-label="Direct link to DQL" title="Direct link to DQL"></a></h2><p>In the GuanceDB observability platform, almost all queries involve timestamp filtering. Meanwhile, most data aggregations need to be performed within specified time windows. Additionally, there is a need to perform rollups of time series data on individual sequences within a time window. Expressing these semantics using SQL often requires nested subqueries, resulting in complex and cumbersome statements.</p><p>That&#x27;s why GuanceDB developed their own Data Query Language (DQL). With simplified syntax elements and computing functions optimized for observability use cases, this DQL can query metrics, logs, object data, and data from distributed tracing.</p><p><img loading="lazy" alt="observability-platform-query-engine-storage-engine-apache-doris" src="https://cdnd.selectdb.com/assets/images/observability-platform-query-engine-storage-engine-apache-doris-b7491e169fe7abf5488259b2d973ed8b.png" width="2400" height="878" class="img_ev3q"></p><p>This is how DQL works together with Apache Doris. GuanceDB has found a way to make full use of the analytic power of Doris, while complementing its SQL functionalities.</p><p>As is shown below, Guance-Insert is the data writing component, while Guance-Select is the DQL query engine.</p><ul><li><strong>Guance-Insert</strong>: It allows data of different tenants to be accumulated in different batches, and strikes a balance between writing throughput and writing latency. 
When logs are generated in large volumes, it can maintain a low data latency of 2~3 seconds.</li><li><strong>Guance-Select</strong>: For query execution, if the query SQL semantics or function is supported in Doris, Guance-Select will push the query down to the Doris Frontend for computation; if not, it will go for a fallback option: acquire columnar data in Arrow format via the Thrift RPC interface, and then finish computation in Guance-Select. The catch is that it cannot push the computation logic down to Doris Backend, so it can be slightly slower than executing queries in Doris Frontend.</li></ul><p><img loading="lazy" alt="DQL-GranceDB-apache-doris" src="https://cdnd.selectdb.com/assets/images/DQL-GranceDB-apache-doris-8e46a296f0c966f5742651d64d85cd2a.png" width="2400" height="984" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="observations">Observations<a href="#observations" class="hash-link" aria-label="Direct link to Observations" title="Direct link to Observations"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="storage-cost-70-down-query-speed-300-up">Storage cost 70% down, query speed 300% up<a href="#storage-cost-70-down-query-speed-300-up" class="hash-link" aria-label="Direct link to Storage cost 70% down, query speed 300% up" title="Direct link to Storage cost 70% down, query speed 300% up"></a></h3><p>Previously, with Elasticsearch clusters, they used 20 cloud virtual machines (16vCPU 64GB) and had independent index writing services (that&#x27;s another 20 cloud virtual machines). Now with Apache Doris, they only need 13 cloud virtual machines of the same configuration in total, representing <strong>a 67% cost reduction</strong>. This is contributed by three capabilities of Apache Doris:</p><ul><li><strong>High writing throughput</strong>: Under a consistent writing throughput of 1GB/s, Doris maintains a CPU usage of less than 20%. That equals 2.6 cloud virtual machines. 
With low CPU usage, the system is more stable and better prepared for sudden writing peaks.</li></ul><p><img loading="lazy" alt="writing-throughput-cpu-usage-apache-doris" src="https://cdnd.selectdb.com/assets/images/writing-throughput-cpu-usage-apache-doris-a629606fb8dc90bc682efb76c80f7cc9.png" width="1948" height="886" class="img_ev3q"></p><ul><li><strong>High data compression ratio</strong>: Doris utilizes the ZSTD compression algorithm on top of columnar storage. It can realize a compression ratio of 8:1. Compared to 1.5:1 in Elasticsearch, Doris can reduce storage costs by around 80%.</li><li><strong><a href="https://doris.apache.org/blog/Tiered-Storage-for-Hot-and-Cold-Data-What-Why-and-How" target="_blank" rel="noopener noreferrer">Tiered storage</a></strong>: Doris allows a more cost-effective way to store data: to put hot data in local disks and cold data in object storage. Once the storage policy is set, Doris can automatically manage the &quot;cooldown&quot; process of hot data and move cold data to object storage. Such a data lifecycle is transparent to the data application layer so it is user-friendly. Also, Doris speeds up cold data queries by local cache.</li></ul><p>With lower storage costs, Doris does not compromise query performance. It doubles the execution speed of queries that return a single row and those that return a result set. 
For aggregation queries without sampling, Doris runs at 4 times the speed of Elasticsearch.</p><p><strong>To sum up, Apache Doris achieves 2~4 times the query performance of Elasticsearch with only 1/3 of the storage cost it consumes.</strong></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="inverted-index-for-full-text-search">Inverted index for full-text search<a href="#inverted-index-for-full-text-search" class="hash-link" aria-label="Direct link to Inverted index for full-text search" title="Direct link to Inverted index for full-text search"></a></h3><p>Inverted index is the magic potion for log analytics because it can considerably increase full-text search performance and reduce query overheads. </p><p>It is especially useful in these scenarios:</p><ul><li>Full-text search by <code>MATCH_ALL</code>, <code>MATCH_ANY</code>, and <code>MATCH_PHRASE</code>. <code>MATCH_PHRASE</code> in combination with inverted index is the alternative to the Elasticsearch full-text search functionality.</li><li>Equivalence queries (=, ! 
=, IN), range queries (&gt;, &gt;=, &lt;, &lt;=), and support for numerics, datetime, and strings.</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">CREATE TABLE httplog</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">(</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> `ts` DATETIME,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> `clientip` VARCHAR(20),</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> `request` TEXT,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> INDEX idx_ip (`clientip`) USING INVERTED,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> INDEX idx_req (`request`) USING INVERTED PROPERTIES(&quot;parser&quot; = &quot;english&quot;) </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">DUPLICATE KEY(`ts`)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">...</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">-- Retrieve the latest 10 records of Client IP &quot;8.8.8.8&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM httplog WHERE clientip = &#x27;8.8.8.8&#x27; ORDER BY ts DESC LIMIT 10;</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain">-- Retrieve the latest 10 records with &quot;error&quot; or &quot;404&quot; in the &quot;request&quot; field</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM httplog WHERE request MATCH_ANY &#x27;error 404&#x27; ORDER BY ts DESC LIMIT 10;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">-- Retrieve the latest 10 records with &quot;image&quot; and &quot;faq&quot; in the &quot;request&quot; field</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM httplog WHERE request MATCH_ALL &#x27;image faq&#x27; ORDER BY ts DESC LIMIT 10;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">-- Retrieve the latest 10 records with &quot;query error&quot; in the &quot;request&quot; field</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM httplog WHERE request MATCH_PHRASE &#x27;query error&#x27; ORDER BY ts DESC LIMIT 10;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>As a powerful accelerator for full-text searches, inverted index in Doris is flexible because we witness the need for on-demand adjustments. 
In Elasticsearch, indexes are fixed upon creation, so there needs to be good planning of which fields need to be indexed, otherwise, any changes to the index will require a complete rewrite.</p><p>In contrast, Doris allows for dynamic indexing. You can add inverted index to a field during runtime and it will take effect immediately. You can also decide which data partitions to create indexes on.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="a-new-data-type-for-dynamic-schema-change">A new data type for dynamic schema change<a href="#a-new-data-type-for-dynamic-schema-change" class="hash-link" aria-label="Direct link to A new data type for dynamic schema change" title="Direct link to A new data type for dynamic schema change"></a></h3><p>By nature, an observability platform requires support for dynamic schema, because the data it collects is prone to changes. Every click by a user on the webpage might add a new metric to the database. </p><p>Looking around the database landscape, you will find that static schema is the norm. Some databases take a step further. For example, Elasticsearch realizes dynamic schema by mapping. However, this functionality can be easily interrupted by field type conflicts or unexpired historical fields.</p><p>The Doris solution for dynamic schema is a newly-introduced data type: Variant, and GuanceDB is among the first to try it out. (It will officially be available in Apache Doris V2.1.)</p><p>The Variant data type is the move of Doris to embrace semi-structured data analytics. 
It can solve a lot of the problems that often harass database users:</p><ul><li><strong>JSON</strong> <strong>data storage</strong>: A Variant column in Doris can accommodate any legal JSON data, and can automatically recognize the subfields and data types.</li><li><strong>Schema explosion due to too many fields</strong>: The frequently occurring subfields will be stored in a column-oriented manner to facilitate analysis, while the less frequently seen subfields will be merged into the same column to streamline the data schema.</li><li><strong>Write failure due to data type conflicts</strong>: A Variant column allows different types of data in the same field, and applies different storage for different data types.</li></ul><p><strong>Difference</strong> <strong>between Variant and Dynamic Mapping</strong></p><p>From a functional perspective, the biggest difference between Variant in Doris and Dynamic Mapping in Elasticsearch is that the scope of Dynamic Mapping extends throughout the entire lifecycle of the current table, while that of Variant can be limited to the current data partition. </p><p>For example, if a user has changed the business logic and renamed some Variant fields today, the old field name will remain on the partitions before today, but will not appear on the new partitions since tomorrow. <strong>So there is a lower risk of data type conflict.</strong></p><p>In the case of field type conflicts in the same partition, the two fields will be changed to JSON type to avoid data error or data loss. For example, there are two <code>status</code> fields in the user&#x27;s business system: One is strings, and the other is numerics, so in queries, the user can decide whether to query the string field, or the numeric field, or both. (E.g. If you specify <code>status = &quot;ok&quot;</code> in the filters, the query will only be executed on the string field.)</p><p>From the users&#x27; perspective, they can use the Variant type as simply as other data types. 
They can add or remove Variant fields based on their business needs, and no extra syntax or annotation is required.</p><p>Currently, the Variant type requires extra type assertion; we plan to automate this process in future versions of Doris. GuanceDB is one step faster in this aspect. They have realized auto type assertion for their DQL queries. In most cases, type assertion is based on the actual data type of Variant fields. In some rare cases when there is a type conflict, the Variant fields will be upgraded to JSON fields, and then type assertion will be based on the semantics of operators in DQL queries.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>GuanceDB&#x27;s transition from Elasticsearch to Apache Doris showcases a big stride in improving data processing speed and reducing costs. For these purposes, Apache Doris has optimized itself in the two major aspects of data processing: data integration and data analysis. It has expanded its schemaless support to flexibly accommodate more data types, and introduced features like inverted index and tiered storage to enable faster and more cost-effective queries. Evolution is an ongoing process. Apache Doris has never stopped improving itself. 
We have a lot of new features under development, and the Doris <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">community</a> embraces any input and feedback.</p><p>Check Apache Doris GitHub <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">repo</a></p><p>Find Apache Doris makers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/empowering-cyber-security-by-enabling-seven-times-faster-log-analysis">Empowering cyber security by enabling 7 times faster log analysis</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2023-12-07T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 7, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>This is about how a cyber security service provider built its log storage and analysis system (LSAS) and realized 3X data writing speed, 7X query execution speed, and visualized management. 
</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="log-storage--analysis-platform">Log storage &amp; analysis platform<a href="#log-storage--analysis-platform" class="hash-link" aria-label="Direct link to Log storage &amp; analysis platform" title="Direct link to Log storage &amp; analysis platform"></a></h2><p>In this use case, the LSAS collects system logs from its enterprise users, scans them, and detects viruses. It also provides data management and file tracking services. </p><p>Within the LSAS, it scans local files and uploads the file information as MD5 values to its cloud engine and identifies suspicious viruses. The cloud engine returns a log entry to tell the risk level of the files. The log entry includes messages like <code>file_name</code>, <code>file_size</code>, <code>file_level</code>, and <code>event_time</code>. Such information goes into a Topic in Apache Kafka, and then the real-time data warehouse normalizes the log messages. After that, all log data will be backed up to the offline data warehouse. Some log data requires further security analysis, so it will be pulled into the analytic engine and the self-developed Extended Detection and Response system (XDR) for more comprehensive detection. 
</p><p><img loading="lazy" alt="cyber-security-log-storage-and-analysis-platform" src="https://cdnd.selectdb.com/assets/images/cyber-security-log-storage-and-analysis-platform-83b6323a2b975c59ddcf59de91f96847.png" width="1280" height="536" class="img_ev3q"></p><p>The above process comes down to log writing and analysis, and the company faced some issues in both processes with their old system, which used StarRocks as the analytic engine.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="slow-data-writing">Slow data writing<a href="#slow-data-writing" class="hash-link" aria-label="Direct link to Slow data writing" title="Direct link to Slow data writing"></a></h3><p>The cloud engine interacts with tens of millions of terminal software and digests over 100 billion logs every day. The enormous data size poses a big challenge. The LSAS used to rely on StarRocks for log storage. With the ever-increasing daily log influx, data writing gradually slows down. The severe backlogs during peak times undermine system stability. They tried scaling the cluster from 3 nodes to 13 nodes, but the writing speed wasn&#x27;t substantially improved.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="slow-query-execution">Slow query execution<a href="#slow-query-execution" class="hash-link" aria-label="Direct link to Slow query execution" title="Direct link to Slow query execution"></a></h3><p>From an execution standpoint, extracting security information from logs involves a lot of keyword matching in the text fields (URL, payload, etc.). The StarRocks-based system does that by the SQL LIKE operator, which implements full scanning and brute-force matching. In that way, queries on a 100-billion-row table often take one or several minutes. 
After screening out irrelevant data based on time range, the query response time still ranges from seconds to dozens of seconds, and it gets worse with concurrent queries.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="architectural-upgrade">Architectural upgrade<a href="#architectural-upgrade" class="hash-link" aria-label="Direct link to Architectural upgrade" title="Direct link to Architectural upgrade"></a></h2><p>In the search for a new database tool, the cyber security company set their eye on <a href="https://doris.apache.org/zh-CN/" target="_blank" rel="noopener noreferrer">Apache Doris</a>, which happened to have sharpened itself up in <a href="https://doris.apache.org/zh-CN/blog/release-note-2.0.0" target="_blank" rel="noopener noreferrer">version 2.0</a> for log analysis. It supports <a href="https://doris.apache.org/docs/dev/data-table/index/inverted-index/" target="_blank" rel="noopener noreferrer">inverted index</a> to empower text search, and <a href="https://doris.apache.org/docs/dev/data-table/index/ngram-bloomfilter-index?_highlight=ngram" target="_blank" rel="noopener noreferrer">NGram BloomFilter</a> to speed up the LIKE operator. </p><p>Although StarRocks was a fork of Apache Doris, it has rewritten part of the code and is now very different from Apache Doris in terms of features. The foregoing inverted index and NGram BloomFilter are a fragment of the current advancements that Apache Doris has made.</p><p>They tried Apache Doris out to evaluate its writing speed, query performance, and the associated storage and maintenance costs. 
</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="300-data-writing-speed">300% data writing speed<a href="#300-data-writing-speed" class="hash-link" aria-label="Direct link to 300% data writing speed" title="Direct link to 300% data writing speed"></a></h3><p>To test the peak performance of Apache Doris, they only used 3 servers and connected it to Apache Kafka to receive their daily data input, and this is the test result compared to the old StarRocks-based LSAS.</p><p><img loading="lazy" alt="apache-doris-vs-starrocks-writing-throughput" src="https://cdnd.selectdb.com/assets/images/apache-doris-vs-starrocks-writing-throughput-e462779d45f4ba298ecbdc75b2f90b68.png" width="1280" height="403" class="img_ev3q"></p><p>Based on the peak performance of Apache Doris, it&#x27;s estimated that a 3-server cluster with 30% of CPU usage will be able to handle the writing workload. That can save them over 70% of hardware resources. Notably, in this test, they enabled inverted index for half of the fields. If it were disabled, the writing speed could be increased by another 50%.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="60-storage-cost">60% storage cost<a href="#60-storage-cost" class="hash-link" aria-label="Direct link to 60% storage cost" title="Direct link to 60% storage cost"></a></h3><p>With inverted index enabled, Apache Doris used even smaller storage space than the old system without inverted indexes. The data compression ratio was 1: 5.7 compared to the previous 1: 4.3.</p><p>In most databases and similar tools, the index file is often 2~4 times the size of the data file it belongs to, but in Apache Doris, the index-data size is basically one to one. That means Apache Doris can save a lot of storage space for users. This is because it has adopted columnar storage and the ZStandard compression. 
With data and indexes being stored column by column, it is easier to compress them, and the ZStandard algorithm is faster with higher compression ratio so it is perfect for log processing. </p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="690-query-speed">690% query speed<a href="#690-query-speed" class="hash-link" aria-label="Direct link to 690% query speed" title="Direct link to 690% query speed"></a></h3><p>To compare the query performance before and after upgrading, they tested the old and the new systems with 79 of their frequently executed SQL statements on the same 100 billion rows of log data with the same cluster size of 10 backend nodes.</p><p>They jotted down the query response time as follows:</p><p>The new Apache Doris-based system is faster in all 79 queries. On average, it reduces the query execution time by a factor of 7.</p><p><img loading="lazy" alt="apache-doris-vs-starrocks-query-performance" src="https://cdnd.selectdb.com/assets/images/apache-doris-vs-starrocks-query-performance-d4377592d59672165b17a6bc5158d8fe.png" width="1280" height="1017" class="img_ev3q"></p><p>Among these queries, the greatest increases in speed were enabled by a few features and optimizations of Apache Doris for log analysis.</p><p><strong>1. 
Inverted index accelerating keyword searches: Q23, Q24, Q30, Q31, Q42, Q43, Q50</strong></p><p>Example: Q43 was sped up 88.2 times.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT count() from table2 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WHERE ( event_time &gt;= 1693065600000 and event_time &lt; 1693152000000) </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AND (rule_hit_big MATCH &#x27;xxxx&#x27;);</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>How is <a href="https://doris.apache.org/docs/dev/data-table/index/inverted-index/" target="_blank" rel="noopener noreferrer">inverted index</a> implemented? Upon data writing, Apache Doris tokenizes the texts into words, and takes note of which word exists in which rows. For example, the word &quot;machine&quot; is in Row 127 and Row 201. In keyword searches, the system can quickly locate the relevant data by tracking the row numbers in the indexes.</p><p>Inverted index is much more efficient than brute-force scanning in text searches. 
For one thing, it doesn&#x27;t have to read that much data. For another, it doesn&#x27;t require text matching. So it is able to increase execution speed by orders of magnitudes.</p><p><img loading="lazy" alt="cyber-security-inverted-index" src="https://cdnd.selectdb.com/assets/images/cyber-security-inverted-index-20f3d1267475f3074304b15f8a901db3.png" width="961" height="720" class="img_ev3q"></p><p><strong>2. NGram BloomFilter accelerating the LIKE operator: Q75, Q76, Q77, Q78</strong></p><p>Example: Q75 was sped up 44.4 times.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM table1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">WHERE ent_id = &#x27;xxxxx&#x27; </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AND event_date = &#x27;2023-08-27&#x27; </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AND file_level = 70 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> AND rule_group_id LIKE &#x27;adid:%&#x27; </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">ORDER BY event_time LIMIT 100;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" 
class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>For non-verbatim searches, the LIKE operator is an important implementation method, so Apache Doris 2.0 introduces the <a href="https://doris.apache.org/docs/dev/data-table/index/ngram-bloomfilter-index" target="_blank" rel="noopener noreferrer">NGram BloomFilter</a> to empower that. </p><p>Different from regular BloomFilter, the NGram BloomFilter does not put the entire text into the filter, but splits it into continuous sub-strings of length N, and then puts the sub-strings into the filter. For a query like <code>cola LIKE &#x27;%pattern%&#x27;</code>, it splits <code>&#x27;pattern&#x27;</code> into several strings of length N, and sees if each of these sub-strings exists in the dataset. The absence of any sub-string in the dataset will indicate that the dataset does not contain the word <code>&#x27;pattern&#x27;</code>, so it will be skipped in data scanning, and that&#x27;s how the NGram BloomFilter accelerates queries.</p><p><strong>3. 
Optimizations for Top-N queries: Q19~Q29</strong></p><p>Example: Q22 was sped up 50.3 times.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT * FROM table1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">where event_date = &#x27;2023-08-27&#x27; and file_level = 70 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> and ent_id = &#x27;nnnnnnn&#x27; and file_name = &#x27;xxx.exe&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">order by event_time limit 100;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Top-N queries are to find the N logs that fit into the specified conditions. It is a common type of query in log analysis, with the SQl being like <code>SELECT * FROM t WHERE xxx ORDER BY xx LIMIT n</code>. Apache Doris has optimized itself for that. Based on the intermediate status of queries, it figures out the dynamic range of the ranking field and implements automatic predicate pushdown to reduce data scanning. 
In some cases, this can decrease the scanned data volume by an order of magnitude.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="visualized-operation--maintenance">Visualized operation &amp; maintenance<a href="#visualized-operation--maintenance" class="hash-link" aria-label="Direct link to Visualized operation &amp; maintenance" title="Direct link to Visualized operation &amp; maintenance"></a></h3><p>For more efficient cluster maintenance, VeloDB, the commercial supporter of Apache Doris , has contributed a visualized cluster management tool called <a href="https://github.com/apache/doris-manager" target="_blank" rel="noopener noreferrer">Doris Manager</a> to the Apache Doris project. Everyday management and maintenance operations can be done via the Doris Manager, including cluster monitoring, inspection, configuration modification, scaling, and upgrading. The visualized tool can save a lot of manual efforts and avoid the risks of maloperations on Doris.</p><p><img loading="lazy" alt="doris-manager-for-visualized-operation-and-maintenance" src="https://cdnd.selectdb.com/assets/images/doris-manager-for-visualized-operation-and-maintenance-b1f63cbae23f025b6ac4d49bf6b9ca36.png" width="1280" height="642" class="img_ev3q"></p><p>Apart from cluster management, Doris Manager provides a visualized WebUI for log analysis (think of Kibana), so it&#x27;s very friendly to users who are familiar with the ELK Stack. 
It supports keyword searches, trend charts, field filtering, and detailed data listing and collapsed display, so it enables interactive analysis and easy drilling down of logs.</p><p><img loading="lazy" alt="doris-manager-webui-showcase" src="https://cdnd.selectdb.com/assets/images/doris-manager-webui-showcase-cba1b2b240ff03357c833aae15e614da.png" width="1280" height="687" class="img_ev3q"></p><p>After a month-long trial run, they officially replaced their old LSAS with the Apache Doris-based system for production, and achieved great results as they expected. Now, they ingest their 100s of billions of new logs every day via the <a href="https://doris.apache.org/docs/dev/data-operate/import/import-way/routine-load-manual/" target="_blank" rel="noopener noreferrer">Routine Load</a> method at a speed 3 times as fast as before. Among the 7-time overall query performance increase, they benefit from a speedup of over 20 times in full-text searches. And they enjoy easier maintenance and interactive analysis. Their next step is to expand the coverage of JSON data type and delve into semi-structured data analysis. Luckily, the upcoming Apache Doris 2.1 will provide more schema-free support. It will have a new Variant data type, support JSON data of any structures, and allow for flexible changes in field numbers and field types. 
Relevant updates will be released on the <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris website</a> and the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris community</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/how-big-data-is-saving-lives-in-real-time-iov-data-analytics-helps-prevent-accidents">How big data is saving lives in real time: IoV data analytics helps prevent accidents</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2023-11-29T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">November 29, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Internet of Vehicles, or IoV, is the product of the marriage between the automotive industry and IoT. IoV data is expected to get larger and larger, especially with electric vehicles being the new growth engine of the auto market. The question is: Is your data platform ready for that? This post shows you what an OLAP solution for IoV looks like.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="what-is-special-about-iov-data">What is special about IoV data?<a href="#what-is-special-about-iov-data" class="hash-link" aria-label="Direct link to What is special about IoV data?" 
title="Direct link to What is special about IoV data?"></a></h2><p>The idea of IoV is intuitive: to create a network so vehicles can share information with each other or with urban infrastructure. What‘s often under-explained is the network within each vehicle itself. On each car, there is something called Controller Area Network (CAN) that works as the communication center for the electronic control systems. For a car traveling on the road, the CAN is the guarantee of its safety and functionality, because it is responsible for:</p><ul><li><strong>Vehicle system monitoring</strong>: The CAN is the pulse of the vehicle system. For example, sensors send the temperature, pressure, or position they detect to the CAN; controllers issue commands (like adjusting the valve or the drive motor) to the executor via the CAN. </li><li><strong>Real-time feedback</strong>: Via the CAN, sensors send the speed, steering angle, and brake status to the controllers, which make timely adjustments to the car to ensure safety. </li><li><strong>Data sharing and coordination</strong>: The CAN allows for data exchange (such as status and commands) between various devices, so the whole system can be more performant and efficient.</li><li><strong>Network management and troubleshooting</strong>: The CAN keeps an eye on devices and components in the system. It recognizes, configures, and monitors the devices for maintenance and troubleshooting.</li></ul><p>With the CAN being that busy, you can imagine the data size that is traveling through the CAN every day. In the case of this post, we are talking about a car manufacturer who connects 4 million cars together and has to process 100 billion pieces of CAN data every day. 
</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="iov-data-processing">IoV data processing<a href="#iov-data-processing" class="hash-link" aria-label="Direct link to IoV data processing" title="Direct link to IoV data processing"></a></h2><p>To turn this huge data size into valuable information that guides product development, production, and sales is the juicy part. Like most data analytic workloads, this comes down to data writing and computation, which are also where challenges exist:</p><ul><li><strong>Data writing at scale</strong>: Sensors are everywhere in a car: doors, seats, brake lights... Plus, many sensors collect more than one signal. The 4 million cars add up to a data throughput of millions of TPS, which means dozens of terabytes every day. With increasing car sales, that number is still growing. </li><li><strong>Real-time analysis</strong>: This is perhaps the best manifestation of &quot;time is life&quot;. Car manufacturers collect the real-time data from their vehicles to identify potential malfunctions, and fix them before any damage happens.</li><li><strong>Low-cost computation and storage</strong>: It&#x27;s hard to talk about huge data size without mentioning its costs. Low cost makes big data processing sustainable.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="from-apache-hive-to-apache-doris-a-transition-to-real-time-analysis">From Apache Hive to Apache Doris: a transition to real-time analysis<a href="#from-apache-hive-to-apache-doris-a-transition-to-real-time-analysis" class="hash-link" aria-label="Direct link to From Apache Hive to Apache Doris: a transition to real-time analysis" title="Direct link to From Apache Hive to Apache Doris: a transition to real-time analysis"></a></h2><p>Like Rome, a real-time data processing platform is not built in a day. 
The car manufacturer used to rely on the combination of a batch analytic engine (Apache Hive) and some streaming frameworks and engines (Apache Flink, Apache Kafka) to gain near real-time data analysis performance. They didn&#x27;t realize they needed real-time that bad until real-time was a problem.</p><p><strong>Near Real-Time Data Analysis Platform</strong></p><p>This is what used to work for them:</p><p><img loading="lazy" alt="IoV-Hive-based-data-warehouse" src="https://cdnd.selectdb.com/assets/images/IoV-Hive-based-data-warehouse-1bbef26f4fbb3012d0ae17fc3b1c4fa5.png" width="1280" height="766" class="img_ev3q"></p><p>Data from the CAN and vehicle sensors are uploaded via 4G network to the cloud gateway, which writes the data into Kafka. Then, Flink processes this data and forwards it to Hive. Going through several data warehousing layers in Hive, the aggregated data is exported to MySQL. At the end, Hive and MySQL provide data to the application layer for data analysis, dashboarding, etc.</p><p>Since Hive is primarily designed for batch processing rather than real-time analytics, you can tell the mismatch of it in this use case.</p><ul><li><strong>Data writing</strong>: With such a huge data size, the data ingestion time from Flink into Hive was noticeably long. In addition, Hive only supports data updating at the granularity of partitions, which is not enough for some cases.</li><li><strong>Data analysis</strong>: The Hive-based analytic solution delivers high query latency, which is a multi-factor issue. Firstly, Hive was slower than expected when handling large tables with 1 billion rows. Secondly, within Hive, data is extracted from one layer to another by the execution of Spark SQL, which could take a while. Thirdly, as Hive needs to work with MySQL to serve all needs from the application side, data transfer between Hive and MySQL also adds to the query latency. 
</li></ul><p><strong>Real-Time Data Analysis Platform</strong></p><p>This is what happens when they add a real-time analytic engine to the picture:</p><p><img loading="lazy" alt="IoV-Doris-based-data-warehouse" src="https://cdnd.selectdb.com/assets/images/IoV-Doris-based-data-warehouse-6eb6329ab3bedda6ed707f02219d85c7.png" width="1280" height="1058" class="img_ev3q"></p><p>Compared to the old Hive-based platform, this new one is more efficient in three ways:</p><ul><li><strong>Data writing</strong>: Data ingestion into <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a> is quick and easy, without complicated configurations and the introduction of extra components. It supports a variety of data ingestion methods. For example, in this case, data is written from Kafka into Doris via <a href="https://doris.apache.org/docs/data-operate/import/import-way/stream-load-manual" target="_blank" rel="noopener noreferrer">Stream Load</a>, and from Hive into Doris via <a href="https://doris.apache.org/docs/data-operate/import/import-way/broker-load-manual" target="_blank" rel="noopener noreferrer">Broker Load</a>. </li><li><strong>Data analysis</strong>: To showcase the query speed of Apache Doris by example, it can return a 10-million-row result set within seconds in a cross-table join query. Also, it can work as a <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/" target="_blank" rel="noopener noreferrer">unified query gateway</a> with its quick access to external data (Hive, MySQL, Iceberg, etc.), so analysts don&#x27;t have to juggle between multiple components.</li><li><strong>Computation and storage costs</strong>: Apache Doris provides the Z-Standard algorithm that can bring a 3~5 times higher data compression ratio. That&#x27;s how it helps reduce costs in data computation and storage. 
Moreover, the compression can be done solely in Doris so it won&#x27;t consume resources from Flink.</li></ul><p>A good real-time analytic solution not only stresses data processing speed, it also considers all the way along your data pipeline and smoothens every step of it. Here are two examples:</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-the-arrangement-of-can-data">1. The arrangement of CAN data<a href="#1-the-arrangement-of-can-data" class="hash-link" aria-label="Direct link to 1. The arrangement of CAN data" title="Direct link to 1. The arrangement of CAN data"></a></h3><p>In Kafka, CAN data was arranged by the dimension of CAN ID. However, for the sake of data analysis, analysts had to compare signals from various vehicles, which meant to concatenate data of different CAN ID into a flat table and align it by timestamp. From that flat table, they could derive different tables for different analytic purposes. Such transformation was implemented using Spark SQL, which was time-consuming in the old Hive-based architecture, and the SQL statements are high-maintenance. Moreover, the data was updated by batch on a daily basis, which meant they could only get data from a day ago. </p><p>In Apache Doris, all they need is to build the tables with the <a href="https://doris.apache.org/docs/data-table/data-model#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate Key model</a>, specify VIN (Vehicle Identification Number) and timestamp as the Aggregate Key, and define other data fields by <code>REPLACE_IF_NOT_NULL</code>. With Doris, they don&#x27;t have to take care of the SQL statements or the flat table, but are able to extract real-time insights from real-time data.</p><p><img loading="lazy" alt="IoV-CAN-data" src="https://cdnd.selectdb.com/assets/images/IoV-CAN-data-21c4722dff0b60c64dd2286cbf3df3be.jpeg" width="1280" height="937" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-dtc-data-query">2. 
DTC data query<a href="#2-dtc-data-query" class="hash-link" aria-label="Direct link to 2. DTC data query" title="Direct link to 2. DTC data query"></a></h3><p>Of all CAN data, DTC (Diagnostic Trouble Code) deserves high attention and separate storage, because it tells you what&#x27;s wrong with a car. Each day, the manufacturer receives around 1 billion pieces of DTC. To capture life-saving information from the DTC, data engineers need to relate the DTC data to a DTC configuration table in MySQL.</p><p>What they used to do was to write the DTC data into Kafka every day, process it in Flink, and store the results in Hive. In this way, the DTC data and the DTC configuration table were stored in two different components. That caused a dilemma: a 1-billion-row DTC table was hard to write into MySQL, while querying from Hive was slow. As the DTC configuration table was also constantly updated, engineers could only import a version of it into Hive on a regular basis. That meant they didn&#x27;t always get to relate the DTC data to the latest DTC configurations. </p><p>As is mentioned, Apache Doris can work as a unified query gateway. This is supported by its <a href="https://doris.apache.org/docs/lakehouse/multi-catalog/" target="_blank" rel="noopener noreferrer">Multi-Catalog</a> feature. They import their DTC data from Hive into Doris, and then they create a MySQL Catalog in Doris to map to the DTC configuration table in MySQL. 
When all this is done, they can simply join the two tables within Doris and get real-time query response.</p><p><img loading="lazy" alt="IoV-DTC-data-query" src="https://cdnd.selectdb.com/assets/images/IoV-DTC-data-query-7e0534a9aafd3005e1e08439acb288fc.png" width="1280" height="523" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>This is an actual real-time analytic solution for IoV. It is designed for data at really large scale, and it is now supporting a car manufacturer who receives 10 billion rows of new data every day in improving driving safety and experience.</p><p>Building a data platform to suit your use case is not easy, I hope this post helps you in building your own analytic solution.</p><p>Apache Doris <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">GitHub repo</a></p><p>Find Apache Doris makers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/less-components-higher-performance-apache-doris-instead-of-clickhouse-mysql-presto-and-hbase">Less components, higher performance: Apache Doris instead of ClickHouse, MySQL, Presto, and HBase</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span 
class="authors"><span class="s-author text-black">CIGNA &amp; CMB</span></span><time datetime="2023-11-22T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">November 22, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>This post is about building a unified OLAP platform. An insurance company tries to build a data warehouse that can undertake all their customer-facing, analyst-facing, and management-facing data analysis workloads. The main tasks include: </p><ul><li><strong>Self-service insurance contract query</strong>: This is for insurance customers to check their contract details by their contract ID. It should also support filters such as coverage period, insurance types, and claim amount. </li><li><strong>Multi-dimensional analysis</strong>: Analysts develop their reports based on different data dimensions as they need, so they can extract insights to facilitate product innovation and their anti-fraud efforts. </li><li><strong>Dashboarding</strong>: This is to create visual overview of the insurance sales trends and the horizontal and vertical comparison of different metrics.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="component-heavy-data-architecture">Component-Heavy Data Architecture<a href="#component-heavy-data-architecture" class="hash-link" aria-label="Direct link to Component-Heavy Data Architecture" title="Direct link to Component-Heavy Data Architecture"></a></h2><p>The user started with Lambda architecture, splitting their data pipeline into a batch processing link and a stream processing link. For real-time data streaming, they apply Flink CDC; for batch import, they incorporate Sqoop, Python, and DataX to build their own data integration tool named Hisen. 
</p><p><img loading="lazy" alt="multi-component-data-warehouse-mysql-clickhouse-hbase-hive-presto" src="https://cdnd.selectdb.com/assets/images/multi-component-data-warehouse-mysql-clickhouse-hbase-hive-presto-6e3dbac016295bce3108943b4bddcf4c.png" width="1280" height="640" class="img_ev3q"></p><p>Then, the real-time and offline data meets in the data warehousing layer, which is made up of five components.</p><p><strong>ClickHouse</strong></p><p>The data warehouse is of flat table design and ClickHouse is superb in flat table reading. But as business evolves, things become challenging in two ways:</p><ul><li>To support cross-table joins and point queries, the user requires the star schema, but that&#x27;s difficult to implement in ClickHouse.</li><li>Changes in insurance contracts need to be updated in the data warehouse in real time. In ClickHouse, that is done by recreating a flat table to overwrite the old one, which is not fast enough.</li></ul><p><strong>MySQL</strong></p><p>After calculation, data metrics are stored in MySQL, but as the data size grows, MySQL starts to struggle, with emerging problems like prolonged execution time and errors thrown.</p><p><strong>Apache</strong> <strong>Hive</strong> <strong>+ Presto</strong></p><p>Hive is the main executor in the batch processing link. It can transform, aggregate, query offline data. Presto is a complement to Hive for interactive analysis.</p><p><strong>Apache HBase</strong></p><p>HBase undertakes primary key queries. It reads customer status from MySQL and Hive, including customer credits, coverage period, and sum insured. However, since HBase does not support secondary indexes, it has limited capability in reading non-primary key columns. Plus, as a NoSQL database, HBase does not support SQL statements.</p><p>The components have to work in conjunction to serve all needs, making the data warehouse too much to take care of. 
It is not easy to get started with because engineers must be trained on all these components. Also, the complexity of architecture adds to the risks of latency. </p><p>So the user tried to look for a tool that ticks more boxes in fulfilling their requirements. The first thing they need is real-time capabilities, including real-time writing, real-time updating, and real-time response to data queries. Secondly, they need more flexibility in data analysis to support customer-facing self-service queries, like multi-dimensional analysis, join queries of large tables, primary key indexes, roll-ups, and drill-downs. Then, for batch processing, they also want high throughput in data writing.</p><p>They eventually made up their mind with <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a>. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="replacing-four-components-with-apache-doris">Replacing Four Components with Apache Doris<a href="#replacing-four-components-with-apache-doris" class="hash-link" aria-label="Direct link to Replacing Four Components with Apache Doris" title="Direct link to Replacing Four Components with Apache Doris"></a></h2><p> Apache Doris is capable of both real-time and offline data analysis, and it supports both high-throughput interactive analysis and high-concurrency point queries. That&#x27;s why it can replace ClickHouse, MySQL, Presto, and Apache HBase and work as the unified query gateway for the entire data system. </p><p><img loading="lazy" alt="unified-data-warehouse-kafka-apache-doris-hive" src="https://cdnd.selectdb.com/assets/images/unified-data-warehouse-kafka-apache-doris-hive-0c1accc90b4280a26b81be17b31e5a63.png" width="1280" height="686" class="img_ev3q"></p><p>The improved data pipeline is a much cleaner Lambda architecture. </p><p>Apache Doris provides a wide range of data ingestion methods. It&#x27;s quick in data writing. 
On top of this, it also implements Merge-on-Write to improve its performance on concurrent point queries. </p><p><strong>Reduced Cost</strong></p><p>The new architecture has reduced the user&#x27;s cost in human efforts. For one thing, the much simpler data architecture leads to much easier maintenance; for another, developers no longer need to join the real-time and offline data in the data serving API.</p><p>The user can also save money with Doris because it supports tiered storage. It allows the user to put their huge amount of rarely accessed historical data in object storage, which is much cheaper to hoard data.</p><p><strong>Higher Efficiency</strong></p><p>Apache Doris can reach a QPS of 10,000s and respond to billions of point queries within milliseconds, so the customer-facing queries are easy for it to handle. Tiered storage that separates hot data from cold data also increases their query efficiency.</p><p><strong>Service Availability</strong></p><p>As a unified data warehouse for storage, computation, and data services, Apache Doris allows for easy disaster recovery. With less components, they don&#x27;t have to worry about data loss or duplication. </p><p>An important guarantee of service availability for the user is the Cross-Cluster Replication (CCR) capability of Apache Doris. It can synchronize data from cluster to cluster within minutes or even seconds, and it implements two mechanisms to ensure data reliability:</p><ul><li><strong>Binlog</strong>: This mechanism can automatically log the data changes and generate a LogID for each data modification operation. 
The incremental LogIDs make sure that data changes are traceable and orderly.</li><li><strong>Data persistence</strong>: In the case of system meltdown or emergencies, data will be put into disks.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="a-deeper-look-into-apache-doris">A Deeper Look into Apache Doris<a href="#a-deeper-look-into-apache-doris" class="hash-link" aria-label="Direct link to A Deeper Look into Apache Doris" title="Direct link to A Deeper Look into Apache Doris"></a></h2><p>Apache Doris can replace ClickHouse, MySQL, Presto, and HBase because it has a comprehensive collection of capabilities all along the data processing pipeline. In data ingestion, it enables low-latency real-time writing based on its support for Flink CDC and Merge-on-Write. It guarantees Exactly-Once writing by its Label mechanism and transactional loading. In data queries, it supports both Star Schema and flat table aggregation, so it can provide high performance in both multi-table joins and large single table queries. 
It also provides various ways to speed up different queries, like <a href="https://doris.apache.org/docs/dev/data-table/index/inverted-index/" target="_blank" rel="noopener noreferrer">inverted index</a> for full-text search and range queries, short-circuit plan and prepared statements for point queries.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/data-analysis-for-live-streaming-what-happens-in-real-time-is-analyzed-in-real-time">Data analysis for live streaming: what happens in real time is analyzed in real time</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">He Gong</span></span><time datetime="2023-10-30T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">October 30, 2023</time></div></header><div class="markdown" itemprop="articleBody"><h2 class="anchor anchorWithStickyNavbar_LWe7" id="whats-different-about-data-analytics-in-live-streaming">What&#x27;s different about data analytics in live streaming?<a href="#whats-different-about-data-analytics-in-live-streaming" class="hash-link" aria-label="Direct link to What&#x27;s different about data analytics in live streaming?" title="Direct link to What&#x27;s different about data analytics in live streaming?"></a></h2><p>Live streaming is one typical use case for real-time data analysis, because it stresses speed. Livestream organizers need to keep abreast of the latest data to see what is happening and maximize effectiveness. 
To realize that requires high efficiency in every step of data processing:</p><ul><li><strong>Data writing</strong>: A live event churns out huge amounts of data every second, so the database should be able to ingest such high throughput stably.</li><li><strong>Data update</strong>: As life itself, live streaming entails a lot of data changes, so there should be a quick and reliable data updating mechanism to absorb the changes.</li><li><strong>Data queries</strong>: Data should be ready and accessible as soon as analysts want it. Mostly that means real-time visibility.</li><li><strong>Maintenance</strong>: What&#x27;s special about live streaming is that the data stream has prominent highs and lows. The analytic system should be able to ensure stability during peak times, and allow scaling down in off-peak times in order to improve resource utilization. If possible, it should also provide disaster recovery services to guarantee system availability, since the worst case in live streaming is interruption. </li></ul><p>The rest of this post is about how a live streaming service provider with 800 million end users found the right database to support its analytic solution.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="simplify-the-components">Simplify the Components<a href="#simplify-the-components" class="hash-link" aria-label="Direct link to Simplify the Components" title="Direct link to Simplify the Components"></a></h2><p>In this case, the live streaming data analytic platform adopts the Lambda architecture, which consists of a batch processing pipeline and a streaming pipeline, the former for user profile information and the latter for real-time generated data, including metrics like real-time subscription, visitor count, comments and responses. 
</p><ul><li><strong>Batch processing</strong>: The user basic information stored in HDFS is written into HBase to form a table.</li><li><strong>Streaming</strong>: Real-time generated data from MySQL, collected via Flink CDC, goes into Apache Kafka. Flink works as the computation engine and then the data is stored in Redis.</li></ul><p><img loading="lazy" alt="database-for-live-shopping-Elasticsearch-HBase" src="https://cdnd.selectdb.com/assets/images/xiaoe-tech-1-85a1ce0c20ef5cee50ca0b3c908f9ee0.png" width="1898" height="966" class="img_ev3q"></p><p>The real-time metrics will be combined with the user profile information to form a flat table, and Elasticsearch will work as the query engine.</p><p>As their business burgeons, the expanding data size becomes unbearable for this platform, with problems like:</p><ul><li><strong>Delayed data writing</strong>: The multiple components result in multiple steps in data writing, and inevitably lead to prolonged data writing, especially during peak times. </li><li><strong>Complicated updating mechanism</strong>: Every time there is a data change, such as that in user subscription information, it must be updated into the main tables and dimensional tables, and then the tables are correlated to generate a new flat table. And don&#x27;t forget that this long process has to be executed across multiple components. So just imagine the complexity.</li><li><strong>Slow queries</strong>: As the query engine, Elasticsearch struggles with concurrent query requests and data accesses. It is also not flexible enough to deal with the join queries.</li><li><strong>Time-consuming maintenance</strong>: All engineers developing or maintaining this platform need to master all the components. That&#x27;s a lot of training. And adding new metrics to the data pool is labor-intensive.</li></ul><p>So to sum up, the main problem for this architecture is its complexity. 
To reduce the components means to find a database that is not only capable of most workloads, but also performant in data writing and queries. After 6 months of testing, they finally upgraded their live streaming analytic platform with <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a>. </p><p>They converge the streaming and the batch processing pipelines at Apache Doris. It can undertake analytic workloads and also provides a storage layer so data doesn&#x27;t have to shuffle back to Elasticsearch and HBase as it did in the old architecture.</p><p>With Apache Doris as the data warehouse, the platform architecture becomes neater.</p><p><img loading="lazy" alt="database-for-live-shopping-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/xiaoe-tech-2-53446135cfc264b66e055259af6ff08b.png" width="1908" height="936" class="img_ev3q"></p><ul><li><strong>Smooth data writing</strong>: Raw data is processed by Flink and written into Apache Doris in real time. The Doris community provides a <a href="https://github.com/apache/doris-flink-connector" target="_blank" rel="noopener noreferrer">Flink-Doris-Connector</a> with built-in Flink CDC.</li><li><strong>Flexible data update</strong>: For data changes, Apache Doris implements <a href="https://doris.apache.org/docs/data-table/data-model/#merge-on-write" target="_blank" rel="noopener noreferrer">Merge-on-Write</a>. This is especially useful in small-batch real-time writing because you don&#x27;t have to renew the entire flat table. It also supports partial update of columns, which is another way to make data updates more lightweight. In this case, Apache Doris is able to finish Upsert or Insert Overwrite operations for <strong>200,000 rows per second</strong>, and these are all done in large tables with the biggest ones reaching billions of rows. </li><li><strong>Faster queries</strong>: For join queries, Apache Doris can easily join multiple large tables (10 billion rows). 
It can respond to a rich variety of queries within seconds or even milliseconds, including tag retrievals, fuzzy queries, ranking, and paginated queries.</li><li><strong>Easier maintenance</strong>: As for Apache Doris itself, the frontend and backend nodes are both flexibly scalable. It is compatible with MySQL protocol. What took the developers a month now can be finished within a week, which allows for more agile iteration of metrics. </li></ul><p>The above shows how Apache Doris speeds up the entire data processing pipeline with its all-in-one capabilities. Beyond that, it has some delightful features that can increase query efficiency and ensure service reliability in the case of live streaming. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="disaster-recovery">Disaster Recovery<a href="#disaster-recovery" class="hash-link" aria-label="Direct link to Disaster Recovery" title="Direct link to Disaster Recovery"></a></h2><p>The last thing you want in live streaming is service breakdown, so disaster recovery is necessary.</p><p>Before the live streaming platform had Apache Doris in place, they only backed up their data to object storage. It took an hour from when a failure was reported to when it was fixed. That one-hour window is fatal for live commerce because viewers will leave immediately. Thus, disaster recovery must be quick.</p><p>Now, with Apache Doris, they have a dual-cluster solution: a primary cluster and a backup cluster. This is for hot backup. Besides that, they have a cold backup plan, which is the same as what they did: backing up their everyday data to object storage via Backup and Freeze policies.</p><p>This is how they do hot backup before <a href="https://doris.apache.org/zh-CN/blog/release-note-2.0.0" target="_blank" rel="noopener noreferrer">Apache Doris 2.0</a>: </p><ul><li><strong>Data dual-write</strong>: Write data to both the primary cluster and backup cluster. 
</li><li><strong>Load balancing</strong>: In case there is something wrong with one cluster, query requests can be directed to the other cluster via reverse proxy.</li><li><strong>Monitoring</strong>: Regularly check the data consistency between the two clusters. </li></ul><p>Apache Doris 2.0 supports <a href="https://doris.apache.org/zh-CN/blog/release-note-2.0.0#support-for-cross-cluster-replication-ccr" target="_blank" rel="noopener noreferrer">Cross Cluster Replication (CCR)</a>, which can automate the above processes to reduce maintenance costs and inconsistency risks due to human factors.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-visualization">Data Visualization<a href="#data-visualization" class="hash-link" aria-label="Direct link to Data Visualization" title="Direct link to Data Visualization"></a></h2><p>In addition to reporting, dashboarding, and ad-hoc queries, the platform also allows analysts to configure various data sources to produce their own visualized data lists. </p><p>Apache Doris is compatible with most BI tools on the market, so the platform developers can tap on that and provide a broader set of functionalities for live streamers.</p><p>Also, built on the real-time capabilities and quick computation of Apache Doris, live streamers can view data and see what happens in real time, instead of waiting for a day for data analysis.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="bitmap-index-to-accelerate-tag-queries">Bitmap Index to Accelerate Tag Queries<a href="#bitmap-index-to-accelerate-tag-queries" class="hash-link" aria-label="Direct link to Bitmap Index to Accelerate Tag Queries" title="Direct link to Bitmap Index to Accelerate Tag Queries"></a></h2><p>A big part of data analysis in live streaming is viewer profiling. Viewers are divided into groups based on their online footprint. They are given tags like &quot;watched for over one minute&quot; and &quot;visited during the past minute&quot;. 
As the show goes on, viewers are constantly tagged and untagged. In the data warehouse, it means frequent data insertion and deletion. Plus, one viewer is given multiple tags. To gain an overall understanding of users entail join queries, which is why the join performance of the data warehouse is important. </p><p>The following snippets give you a general idea of how to tag users and conduct tag queries in Apache Doris.</p><p><strong>Create a Tag Table</strong></p><p>A tag table lists all the tags that are given to the viewers, and maps the tags to the corresponding viewer ID.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">create table db.tags ( </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">u_id string, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">version string, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">tags string</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">) with ( </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;connector&#x27; = &#x27;doris&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;fenodes&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;table.identifier&#x27; = &#x27;tags&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;username&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">&#x27;password&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.format&#x27; = &#x27;json&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.strip_outer_array&#x27; = &#x27;true&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.fuzzy_parse&#x27; = &#x27;true&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.columns&#x27; = &#x27;id,u_id,version,a_tags,m_tags,a_tags=bitmap_from_string(a_tags),m_tags=bitmap_from_string(m_tags)&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.batch.interval&#x27; = &#x27;10s&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.batch.size&#x27; = &#x27;100000&#x27; </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">);</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Create a Tag Version Table</strong></p><p>The tag table is constantly changing, so there are different versions of it as time goes by.</p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" 
style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">create table db.tags_version ( </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">id string, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">u_id string, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">version string </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">) with ( </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;connector&#x27; = &#x27;doris&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;fenodes&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;table.identifier&#x27; = &#x27;db.tags_version&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;username&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;password&#x27; = &#x27;&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.format&#x27; = &#x27;json&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.strip_outer_array&#x27; = &#x27;true&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.fuzzy_parse&#x27; = &#x27;true&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.properties.columns&#x27; = 
&#x27;id,u_id,version&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.batch.interval&#x27; = &#x27;10s&#x27;, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">&#x27;sink.batch.size&#x27; = &#x27;100000&#x27; </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">);</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Write Data into Tag Table and Tag Version Table</strong></p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">insert into db.tags</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">select</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">u_id, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">last_timestamp as version,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">tags</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">from db.source; </span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">insert into rtime_db.tags_version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">select </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">u_id, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">last_timestamp as version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">from db.source;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p><strong>Tag Queries Accelerated by Bitmap Index</strong></p><p>For example, analysts need to find out the latest tags related to a certain viewer with the last name Thomas. Apache Doris will run the LIKE operator in the user information table to find all &quot;Thomas&quot;. Then it creates bitmap indexes for the tags. 
Lastly, it relates the user information table, tag table, and tag version table to return the result.</p><p><strong>With almost a billion viewers, each of whom has over a thousand tags, the bitmap index can help reduce the query response time to less than one second.</strong></p><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">with t_user as (</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> select </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> u_id,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> name</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> from db.user</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> where partition_id = 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> and name like &#x27;%Thomas%&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">),</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> t_tags as (</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> select </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> u_id, </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> from 
db.tags</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> where (</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> bitmap_and_count(a_tags, bitmap_from_string(&quot;123,124,125,126,333&quot;)) &gt; 0 </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> )</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> ),</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> t_tag_version as (</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> select id, u_id, version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> from db.tags_version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> )</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">select </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> t1.u_id</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> t1.name</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">from t_user t1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">join t_tags t2 on t1.u_id = t2.u_id</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">join t_tag_version t3 on t2.u_id = t3.u_id and t2.version = t3.version</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">order by t1.u_id desc</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">limit 1,10;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>Data analysis in live streaming is challenging for the underlying database, but it is also where the key competitiveness of Apache Doris comes to play. First of all, Apache Doris can handle most data processing workloads, so platform builders don&#x27;t have to worry about putting many components together and consequential maintenance issues. Secondly, it has a lot of query-accelerating features, including but not limited to indexes. After tackling the speed issues, the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris developer community</a> has been exploring its boundaries, such as introducing a more efficient cost-based query optimizer in version 2.0 and inverted index for text searches, fuzzy queries, and range queries. 
These features are embraced by the live streaming service provider as they are actively testing them and planning to transfer their log analytic workloads to Apache Doris, too.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/migrating-from-clickhouse-to-apache-doris-what-happened">Migrating from ClickHouse to Apache Doris: what happened?</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Chuang Li</span></span><time datetime="2023-10-11T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">October 11, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Migrating from one OLAP database to another is huge. Even if you&#x27;re unhappy with your current data tool and have found some promising candidate, you might still hesitate to do the big surgery on your data architecture, because you&#x27;re uncertain about how things are going to work. So you need experience shared by someone who has walked the path. </p><p>Luckily, a user of Apache Doris has written down their migration process from ClickHouse to Doris, including why they need the change, what needs to be taken care of, and how they compare the performance of the two databases in their environment. 
</p><p>To decide whether you want to continue reading, check if you tick one of the following boxes:</p><ul><li>You need your join queries to be executed faster.</li><li>You need flexible data updates.</li><li>You need real-time data analysis.</li><li>You need to minimize your components.</li></ul><p>If you do, this post might be of some help to you.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="replacing-kylin-clickhouse-and-druid-with-apache-doris">Replacing Kylin, ClickHouse, and Druid with Apache Doris<a href="#replacing-kylin-clickhouse-and-druid-with-apache-doris" class="hash-link" aria-label="Direct link to Replacing Kylin, ClickHouse, and Druid with Apache Doris" title="Direct link to Replacing Kylin, ClickHouse, and Druid with Apache Doris"></a></h2><p>The user undergoing this change is an e-commerce SaaS provider. Its data system serves realtime and offline reporting, customer segmentation, and log analysis. Initially, they used different OLAP engines for these various purposes:</p><ul><li><strong>Apache Kylin for offline reporting</strong>: The system provides offline reporting services for over 5 million sellers. The big ones among them have more than 10 million registered members and 100,000 SKU, and the detailed information is put into over 400 data cubes on the platform. </li><li><strong>ClickHouse for customer segmentation and Top-N log queries</strong>: This entails high-frequency updates, high QPS, and complicated SQL.</li><li><strong>Apache Druid for real-time reporting</strong>: Sellers extract data they need by combining different dimensions, and such real-time reporting requires quick data updates, quick query response, and strong stability of the system. 
</li></ul><p><img loading="lazy" alt="ClickHouse-Druid-Apache-Kylin" src="https://cdnd.selectdb.com/assets/images/youzan-1-21f1d14ff97ac4bbf038e58c72a95e85.png" width="1280" height="529" class="img_ev3q"></p><p>The three components have their own sore spots.</p><ul><li><strong>Apache Kylin</strong> runs well with a fixed table schema, but every time you want to add a dimension, you need to create a new data cube and refill the historical data in it.</li><li><strong>ClickHouse</strong> is not designed for multi-table processing, so you might need an extra solution for federated queries and multi-table join queries. And in this case, it was below expectation in high-concurrency scenarios.</li><li><strong>Apache Druid</strong> implements idempotent writing so it does not support data updating or deletion itself. That means when there is something wrong at the upstream, you will need a full data replacement. And such data fixing is a multi-step process if you think it all the way through, because of all the data backups and movements. Plus, newly ingested data will not be accessible for queries until it is put in segments in Druid. That means a longer window of data inconsistency between upstream and downstream.</li></ul><p>As they work together, this architecture might be too demanding to navigate because it requires knowledge of all these components in terms of development, monitoring, and maintenance. 
Also, every time the user scales a cluster, they must stop the current cluster and migrate all databases and tables, which is not only a big undertaking but also a huge interruption to business.</p><p><img loading="lazy" alt="Replace-ClickHouse-Druid-Apache-Kylin-with-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/youzan-2-2f605efbaf41cb9b534ea86c82b209a8.png" width="1280" height="529" class="img_ev3q"></p><p>Apache Doris fills these gaps.</p><ul><li><strong>Query performance</strong>: Doris is good at high-concurrency queries and join queries, and it is now equipped with inverted index to speed up searches in logs.</li><li><strong>Data update</strong>: The Unique Key model of Doris supports both large-volume update and high-frequency real-time writing, and the Duplicate Key model and Unique Key model support partial column update. It also provides exactly-once guarantee in data writing and ensures consistency between base tables, materialized views, and replicas.</li><li><strong>Maintenance</strong>: Doris is MySQL-compatible. It supports easy scaling and light schema change. It comes with its own integration tools such as Flink-Doris-Connector and Spark-Doris-Connector. 
</li></ul><p>So they plan on the migration.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="the-replacement-surgery">The Replacement Surgery<a href="#the-replacement-surgery" class="hash-link" aria-label="Direct link to The Replacement Surgery" title="Direct link to The Replacement Surgery"></a></h2><p>ClickHouse was the main performance bottleneck in the old data architecture and why the user wanted the change in the first place, so they started with ClickHouse.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="changes-in-sql-statements">Changes in SQL statements<a href="#changes-in-sql-statements" class="hash-link" aria-label="Direct link to Changes in SQL statements" title="Direct link to Changes in SQL statements"></a></h3><p><strong>Table creation statements</strong></p><p><img loading="lazy" alt="table-creation-statements-in-ClickHouse-and-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/youzan-3-80a2c58fe513a6bbf303d5b95a023fd9.png" width="1280" height="426" class="img_ev3q"></p><p>The user built their own SQL rewriting tool that can convert a ClickHouse table creation statement into a Doris table creation statement. The tool can automate the following changes:</p><ul><li><strong>Mapping the field types</strong>: It converts ClickHouse field types into the corresponding ones in Doris. 
For example, it converts String as a Key into Varchar, and String as a partitioning field into Date V2.</li><li><strong>Setting the number of historical partitions in dynamic partitioning tables</strong>: Some tables have historical partitions and the number of partitions should be specified upon table creation in Doris, otherwise a &quot;No Partition&quot; error will be thrown.</li><li><strong>Determining the number of buckets</strong>: It decides the number of buckets based on the data volume of historical partitions; for non-partitioned tables, it decides the bucketing configurations based on the historical data volume.</li><li><strong>Determining TTL</strong>: It decides the time to live of partitions in dynamic partitioning tables.</li><li><strong>Setting the import sequence</strong>: For the Unique Key model of Doris, it can specify the data import order based on the Sequence column to ensure orderliness in data ingestion.</li></ul><p><img loading="lazy" alt="changes-in-table-creation-statements-from-ClickHouse-to-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/youzan-4-3ee70bd47be6c98aeef15c24027bfb07.png" width="1280" height="881" class="img_ev3q"></p><p><strong>Query statements</strong></p><p>Similarly, they have their own tool to transform the ClickHouse query statements into Doris query statements. This is to prepare for the comparison test between ClickHouse and Doris. 
The key considerations in the conversions include:</p><ul><li><strong>Conversion of table names</strong>: This is simple given the mapping rules in table creation statements.</li><li><strong>Conversion of functions</strong>: For example, the <code>COUNTIF</code> function in ClickHouse is equivalent to <code>SUM(CASE WHEN_THEN 1 ELSE 0)</code>, <code>Array Join</code> is equivalent to <code>Explode</code> and <code>Lateral View</code>, and <code>ORDER BY</code> and <code>GROUP BY</code> should be converted to window functions.</li><li><strong>Difference</strong> <strong>in semantics</strong>: ClickHouse goes by its own protocol while Doris is MySQL-compatible, so there needs to be alias for subqueries. In this use case, subqueries are common in customer segmentation, so they use <code>sqlparse</code> </li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="changes-in-data-ingestion-methods">Changes in data ingestion methods<a href="#changes-in-data-ingestion-methods" class="hash-link" aria-label="Direct link to Changes in data ingestion methods" title="Direct link to Changes in data ingestion methods"></a></h3><p><img loading="lazy" alt="changes-in-data-ingestion-methods-from-ClickHouse-to-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/youzan-5-8223b76f140f27992ef2d3843ed7d572.png" width="1280" height="642" class="img_ev3q"></p><p>Apache Doris provides broad options of data writing methods. For the real-time link, the user adopts Stream Load to ingest data from NSQ and Kafka. </p><p>For the sizable offline data, the user tested different methods and here are the takeouts:</p><ol><li><strong>Insert Into</strong></li></ol><p>Using Multi-Catalog to read external data sources and ingesting with Insert Into can serve most needs in this use case.</p><ol start="2"><li><strong>Stream Load</strong></li></ol><p>The Spark-Doris-Connector is a more general method. It can handle large data volumes and ensure writing stability. 
The key is to find the right writing pace and parallelism.</p><p>The Spark-Doris-Connector also supports Bitmap. It allows you to move the computation workload of Bitmap data in Spark clusters. </p><p>Both the Spark-Doris-Connector and the Flink-Doris-Connector rely on Stream Load. CSV is the recommended format choice. Tests on the user&#x27;s billions of rows showed that CSV was 40% faster than JSON. </p><ol start="3"><li><strong>Spark Load</strong></li></ol><p>The Spark Load method utilizes Spark resources for data shuffling and ranking. The computation results are put in HDFS, and then Doris reads the files from HDFS directly (via Broker Load). This approach is ideal for huge data ingestion. The more data there is, the faster and more resource-efficient the ingestion is. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="pressure-test">Pressure Test<a href="#pressure-test" class="hash-link" aria-label="Direct link to Pressure Test" title="Direct link to Pressure Test"></a></h2><p>The user compared performance of the two components on their SQL and join query scenarios, and calculated the CPU and memory consumption of Apache Doris.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="sql-query-performance">SQL query performance<a href="#sql-query-performance" class="hash-link" aria-label="Direct link to SQL query performance" title="Direct link to SQL query performance"></a></h3><p>Apache Doris outperformed ClickHouse in 10 of the 16 SQL queries, and the biggest performance gap was a ratio of almost 30. Overall, Apache Doris was 2~3 times faster than ClickHouse. 
</p><p><img loading="lazy" alt="SQL-query-performance-ClickHouse-VS-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/youzan-6-a4a80e719c4ef27b9db683b502796fce.png" width="1313" height="1057" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="join-query-performance">Join query performance<a href="#join-query-performance" class="hash-link" aria-label="Direct link to Join query performance" title="Direct link to Join query performance"></a></h3><p>For join query tests, the user used different sizes of main tables and dimension tables.</p><ul><li><strong>Primary tables</strong>: user activity table (4 billion rows), user attribute table (25 billion rows), and user attribute table (96 billion rows)</li><li><strong>Dimension tables</strong>: 1 million rows, 10 million rows, 50 million rows, 100 million rows, 500 million rows, 1 billion rows, and 2.5 billion rows.</li></ul><p>The tests include <strong>full join queries</strong> and <strong>filtering join queries</strong>. Full join queries join all rows of the primary table and dimension tables, while filtering join queries retrieve data of a certain seller ID with a <code>WHERE</code> filter. The results are concluded as follows:</p><p><strong>Primary table (4 billion rows):</strong></p><ul><li>Full join queries: Doris outperforms ClickHouse in full join queries with all dimension tables. The performance gap widens as the dimension tables get larger. The largest difference is a ratio of 5.</li><li>Filtering join queries: Based on the seller ID, the filter screened out 41 million rows from the primary table. With small dimension tables, Doris was 2~3 times faster than ClickHouse; with large dimension tables, Doris was over 10 times faster; with dimension tables larger than 100 million rows, ClickHouse threw an OOM error and Doris functions normally. 
</li></ul><p><strong>Primary table (25 billion rows):</strong></p><ul><li>Full join queries: Doris outperforms ClickHouse in full join queries with all dimension tables. ClickHouse produced an OOM error with dimension tables larger than 50 million rows.</li><li>Filtering join queries: The filter screened out 570 million rows from the primary table. Doris responded within seconds and ClickHouse finished within minutes and broke down when joining large dimension tables.</li></ul><p><strong>Primary table (96 billion rows):</strong></p><p>Doris delivered relatively quick performance in all queries and ClickHouse was unable to execute all of them.</p><p>In terms of CPU and memory consumption, Apache Doris maintained stable cluster loads in all sizes of join queries.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="future-directions">Future Directions<a href="#future-directions" class="hash-link" aria-label="Direct link to Future Directions" title="Direct link to Future Directions"></a></h2><p>As the migration goes on, the user works closely with the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Doris community</a>, and their feedback has contributed to the making of <a href="https://doris.apache.org/docs/dev/releasenotes/release-2.0.0/" target="_blank" rel="noopener noreferrer">Apache Doris 2.0.0</a>. 
We will continue assisting them in their migration from Kylin and Druid to Doris, and we look forward to see their Doris-based unified data platform come into being.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Log-Analysis-How-to-Digest-15-Billion-Logs-Per-Day-and-Keep-Big-Queries-Within-1-Second">Log analysis: how to digest 15 billion logs per day and keep big queries within 1 second</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Yuqi Liu</span></span><time datetime="2023-09-16T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">September 16, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>This data warehousing use case is about <strong>scale</strong>. The user is <a href="https://en.wikipedia.org/wiki/China_Unicom" target="_blank" rel="noopener noreferrer">China Unicom</a>, one of the world&#x27;s biggest telecommunication service providers. Using Apache Doris, they deploy multiple petabyte-scale clusters on dozens of machines to support their 15 billion daily log additions from their over 30 business lines. Such a gigantic log analysis system is part of their cybersecurity management. 
For the need of real-time monitoring, threat tracing, and alerting, they require a log analytic system that can automatically collect, store, analyze, and visualize logs and event records.</p><p>From an architectural perspective, the system should be able to undertake real-time analysis of various formats of logs, and of course, be scalable to support the huge and ever-enlarging data size. The rest of this post is about what their log processing architecture looks like, and how they realize stable data ingestion, low-cost storage, and quick queries with it.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="system-architecture">System Architecture<a href="#system-architecture" class="hash-link" aria-label="Direct link to System Architecture" title="Direct link to System Architecture"></a></h2><p>This is an overview of their data pipeline. The logs are collected into the data warehouse, and go through several layers of processing.</p><p><img loading="lazy" alt="real-time-data-warehouse-2.0" src="https://cdnd.selectdb.com/assets/images/Unicom-1-0c734fbe7faf4875c3a647ac5136cce9.png" width="1280" height="609" class="img_ev3q"></p><ul><li><strong>ODS</strong>: Original logs and alerts from all sources are gathered into Apache Kafka. Meanwhile, a copy of them will be stored in HDFS for data verification or replay.</li><li><strong>DWD</strong>: This is where the fact tables are. Apache Flink cleans, standardizes, backfills, and de-identifies the data, and write it back to Kafka. These fact tables will also be put into Apache Doris, so that Doris can trace a certain item or use them for dashboarding and reporting. As logs are not averse to duplication, the fact tables will be arranged in the <a href="https://doris.apache.org/docs/dev/data-table/data-model#duplicate-model" target="_blank" rel="noopener noreferrer">Duplicate Key model</a> of Apache Doris. 
</li><li><strong>DWS</strong>: This layer aggregates data from DWD and lays the foundation for queries and analysis.</li><li><strong>ADS</strong>: In this layer, Apache Doris auto-aggregates data with its Aggregate Key model, and auto-updates data with its Unique Key model. </li></ul><p>Architecture 2.0 evolves from Architecture 1.0, which is supported by ClickHouse and Apache Hive. The transition arised from the user&#x27;s needs for real-time data processing and multi-table join queries. In their experience with ClickHouse, they found inadequate support for concurrency and multi-table joins, manifested by frequent timeouts in dashboarding and OOM errors in distributed joins.</p><p><img loading="lazy" alt="real-time-data-warehouse-1.0" src="https://cdnd.selectdb.com/assets/images/Unicom-2-6b242b382e769bf8acd4f0e08471045f.png" width="1280" height="607" class="img_ev3q"></p><p>Now let&#x27;s take a look at their practice in data ingestion, storage, and queries with Architecture 2.0.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="real-case-practice">Real-Case Practice<a href="#real-case-practice" class="hash-link" aria-label="Direct link to Real-Case Practice" title="Direct link to Real-Case Practice"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="stable-ingestion-of-15-billion-logs-per-day">Stable ingestion of 15 billion logs per day<a href="#stable-ingestion-of-15-billion-logs-per-day" class="hash-link" aria-label="Direct link to Stable ingestion of 15 billion logs per day" title="Direct link to Stable ingestion of 15 billion logs per day"></a></h3><p>In the user&#x27;s case, their business churns out 15 billion logs every day. Ingesting such data volume quickly and stably is a real problem. With Apache Doris, the recommended way is to use the Flink-Doris-Connector. It is developed by the Apache Doris community for large-scale data writing. The component requires simple configuration. 
It implements Stream Load and can reach a writing speed of 200,000~300,000 logs per second, without interrupting the data analytic workloads.</p><p>A lesson learned is that when using Flink for high-frequency writing, you need to find the right parameter configuration for your case to avoid data version accumulation. In this case, the user made the following optimizations:</p><ul><li><strong>Flink Checkpoint</strong>: They increase the checkpoint interval from 15s to 60s to reduce writing frequency and the number of transactions processed by Doris per unit of time. This can relieve data writing pressure and avoid generating too many data versions.</li><li><strong>Data Pre-Aggregation</strong>: For data of the same ID but comes from various tables, Flink will pre-aggregate it based on the primary key ID and create a flat table, in order to avoid excessive resource consumption caused by multi-source data writing.</li><li><strong>Doris Compaction</strong>: The trick here includes finding the right Doris backend (BE) parameters to allocate the right amount of CPU resources for data compaction, setting the appropriate number of data partitions, buckets, and replicas (too much data tablets will bring huge overheads), and dialing up <code>max_tablet_version_num</code> to avoid version accumulation.</li></ul><p>These measures together ensure daily ingestion stability. The user has witnessed stable performance and low compaction score in Doris backend. 
In addition, the combination of data pre-processing in Flink and the <a href="https://doris.apache.org/docs/dev/data-table/data-model#unique-model" target="_blank" rel="noopener noreferrer">Unique Key model</a> in Doris can ensure quicker data updates.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="storage-strategies-to-reduce-costs-by-50">Storage strategies to reduce costs by 50%<a href="#storage-strategies-to-reduce-costs-by-50" class="hash-link" aria-label="Direct link to Storage strategies to reduce costs by 50%" title="Direct link to Storage strategies to reduce costs by 50%"></a></h3><p>The size and generation rate of logs also impose pressure on storage. Among the immense log data, only a part of it is of high informational value, so storage should be differentiated. The user has three storage strategies to reduce costs. </p><ul><li><strong>ZSTD (ZStandard) compression algorithm</strong>: For tables larger than 1TB, specify the compression method as &quot;ZSTD&quot; upon table creation, it will realize a compression ratio of 10:1. </li><li><strong>Tiered storage of hot and cold data</strong>: This is supported by the <a href="https://blog.devgenius.io/hot-cold-data-separation-what-why-and-how-5f7c73e7a3cf" target="_blank" rel="noopener noreferrer">new feature</a> of Doris. The user sets a data &quot;cooldown&quot; period of 7 days. That means data from the past 7 days (namely, hot data) will be stored in SSD. As time goes by, hot data &quot;cools down&quot; (getting older than 7 days), it will be automatically moved to HDD, which is less expensive. As data gets even &quot;colder&quot;, it will be moved to object storage for much lower storage costs. Plus, in object storage, data will be stored with only one copy instead of three. This further cuts down costs and the overheads brought by redundant storage. </li><li><strong>Differentiated replica numbers for different data partitions</strong>: The user has partitioned their data by time range. 
The principle is to have more replicas for newer data partitions and fewer for the older ones. In their case, data from the past 3 months is frequently accessed, so they have 3 replicas for this partition.
<strong>In this way, we enable queries of 2 billion log records to be done in 1~2s.</strong> </li></ul><p>These strategies have shortened the response time of queries. For example, a query of a specific data item used to take minutes, but now it can be finished in milliseconds. In addition, for big tables that contain 10 billion data records, queries on different dimensions can all be done in a few seconds.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ongoing-plans">Ongoing Plans<a href="#ongoing-plans" class="hash-link" aria-label="Direct link to Ongoing Plans" title="Direct link to Ongoing Plans"></a></h2><p>The user is now testing with the newly added <a href="https://doris.apache.org/docs/dev/data-table/index/inverted-index?_highlight=inverted" target="_blank" rel="noopener noreferrer">inverted index</a> in Apache Doris. It is designed to speed up full-text search of strings as well as equivalence and range queries of numerics and datetime. They have also provided their valuable feedback about the auto-bucketing logic in Doris: Currently, Doris decides the number of buckets for a partition based on the data size of the previous partition. The problem for the user is, most of their new data comes in during daytime, but little at nights. So in their case, Doris creates too many buckets for night data but too few in daylight, which is the opposite of what they need. They hope to add a new auto-bucketing logic, where the reference for Doris to decide the number of buckets is the data size and distribution of the previous day. 
They&#x27;ve come to the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris community</a> and we are now working on this optimization.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Tencent-LLM">LLM-powered OLAP: the Tencent application with Apache Doris</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Jun Zhang &amp; Lei Luo</span></span><time datetime="2023-08-29T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">August 29, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Six months ago, I wrote about <a href="https://doris.apache.org/blog/Tencent-Data-Engineers-Why-We-Went-from-ClickHouse-to-Apache-Doris" target="_blank" rel="noopener noreferrer">why we replaced ClickHouse with Apache Doris as an OLAP engine</a> for our data management system. Back then, we were struggling with the auto-generation of SQL statements. As days pass, we have made progresses big enough to be references for you (I think), so here I am again. 
</p><p>We have adopted Large Language Models (LLM) to empower our Doris-based OLAP services.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="llm--olap">LLM + OLAP<a href="#llm--olap" class="hash-link" aria-label="Direct link to LLM + OLAP" title="Direct link to LLM + OLAP"></a></h2><p>Our incentive was to save our internal staff from the steep learning curve of SQL writing. Thus, we used LLM as an intermediate. It transforms natural language questions into SQL statements and sends the SQLs to the OLAP engine for execution.</p><p><img loading="lazy" alt="LLM-OLAP-solution" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_1-6672112c0d09d75171d8ed9a749ff196.png" width="1280" height="253" class="img_ev3q"></p><p>Like every AI-related experience, we came across some friction:</p><ol><li>LLM does not understand data jargons, like &quot;fields&quot;, &quot;rows&quot;, &quot;columns&quot; and &quot;tables&quot;. Instead, they can perfectly translate business terms like &quot;corporate income&quot; and &quot;DAU&quot;, which are basically what the fields/rows/columns are about. That means it can work well only if the analysts use the exact right word to refer to the metric they need when typing their questions.</li><li>The LLM we are using is slow in inference. It takes over 10 seconds to respond. As it charges fees by token, cost-effectiveness becomes a problem.</li><li>Although the LLM is trained on a large collection of public datasets, it is under-informed of niche knowledge. In our case, the LLM is super unfamiliar with indie songs, so even if the songs are included in our database, the LLM will not able to identify them properly. </li><li>Sometimes our input questions require adequate and latest legal, political, financial, and regulatory information, which is hard to be included in a training dataset or knowledge base. 
We need to connect the LLM to wider info bases in order to perform more diversified tasks.</li></ol><p>We knock these problems down one by one.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-a-semantic-layer">1. A semantic layer<a href="#1-a-semantic-layer" class="hash-link" aria-label="Direct link to 1. A semantic layer" title="Direct link to 1. A semantic layer"></a></h3><p>For problem No.1, we introduce a semantic layer between the LLM and the OLAP engine. This layer translates business terms into the corresponding data fields. It can identify data filtering conditions from the various natural language wordings, relate them to the metrics involved, and then generate SQL statements. </p><p>Besides that, the semantic layer can optimize the computation logic. When analysts input a question that involves a complicated query, let&#x27;s say, a multi-table join, the semantic layer can split that into multiple single-table queries to reduce semantic distortion.</p><p><img loading="lazy" alt="LLM-OLAP-semantic-layer" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_2-bb2fdaed64ef15214c0542204dd45832.png" width="1280" height="289" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-llm-parsing-rules">2. LLM parsing rules<a href="#2-llm-parsing-rules" class="hash-link" aria-label="Direct link to 2. LLM parsing rules" title="Direct link to 2. LLM parsing rules"></a></h3><p>To increase cost-effectiveness in using LLM, we evaluate the computation complexity of all scenarios, such as metric computation, detailed record retrieval, and user segmentation. Then, we create rules and dedicate the LLM-parsing step to only complicated tasks. That means for the simple computation tasks, it will skip the parsing. 
</p><p>For example, when an analyst inputs &quot;tell me the earnings of the major musical platforms&quot;, the LLM identifies that this question only entails several metrics or dimensions, so it will not further parse it but send it straight for SQL generation and execution. This can largely shorten query response time and reduce API expenses. </p><p><img loading="lazy" alt="LLM-OLAP-parsing-rules" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_3-3ab023081e1acb069d34a4ce24aef010.png" width="1280" height="406" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3-schema-mapper-and-external-knowledge-base">3. Schema Mapper and external knowledge base<a href="#3-schema-mapper-and-external-knowledge-base" class="hash-link" aria-label="Direct link to 3. Schema Mapper and external knowledge base" title="Direct link to 3. Schema Mapper and external knowledge base"></a></h3><p>To empower the LLM with niche knowledge, we added a Schema Mapper upstream from the LLM. The Schema Mapper maps the input question to an external knowledge base, and then the LLM will do parsing.</p><p>We are constantly testing and optimizing the Schema Mapper. We categorize and rate content in the external knowledge base, and do various levels of mapping (full-text mapping and fuzzy mapping) to enable better semantic parsing.</p><p><img loading="lazy" alt="LLM-OLAP-schema-mapper" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_4-261ee680cf77335b25f32e41d7a4924b.png" width="2001" height="647" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="4-plugins">4. Plugins<a href="#4-plugins" class="hash-link" aria-label="Direct link to 4. Plugins" title="Direct link to 4. 
Plugins"></a></h3><p>We used plugins to connect the LLM to more fields of information, and we have different integration methods for different types of plugins:</p><ul><li><strong>Embedding local files</strong>: This is especially useful when we need to &quot;teach&quot; the LLM the latest regulatory policies, which are often text files. Firstly, the system vectorizes the local text file, executes semantic searches to find matching or similar terms in the local file, extracts the relevant contents and puts them into the LLM parsing window to generate output. </li><li><strong>Third-party plugins</strong>: The marketplace is full of third-party plugins that are designed for all kinds of sectors. With them, the LLM is able to deal with wide-ranging topics. Each plugin has its own prompts and calling function. Once the input question hits a prompt, the relevant plugin will be called.</li></ul><p><img loading="lazy" alt="LLM-OLAP-plugins" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_5-70a170e771dd9eadcc1488b94d892478.png" width="2001" height="645" class="img_ev3q"></p><p>After we are done with above four optimizations, the SuperSonic framework comes into being.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="the-supersonic-framework">The SuperSonic framework<a href="#the-supersonic-framework" class="hash-link" aria-label="Direct link to The SuperSonic framework" title="Direct link to The SuperSonic framework"></a></h2><p>Now let me walk you through this <a href="https://github.com/tencentmusic/supersonic" target="_blank" rel="noopener noreferrer">framework</a>:</p><p><img loading="lazy" alt="LLM-OLAP-supersonic-framework" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_6-cbbbb25041c807376b2b9d14609e82c8.png" width="1280" height="1117" class="img_ev3q"></p><ul><li>An analyst inputs a question.</li><li>The Schema Mapper maps the question to an external knowledge base.</li><li>If there are matching fields in the external knowledge base, the 
question will not be parsed by the LLM. Instead, a metric computation formula will trigger the OLAP engine to start querying. If there is no matching field, the question will enter the LLM.</li><li>Based on the pre-defined rules, the LLM rates the complexity level of the question. If it is a simple query, it will go directly to the OLAP engine; if it is a complicated query, it will be semantically parsed and converted to a DSL statement.</li><li>At the Semantic Layer, the DSL statement will be split based on its query scenario. For example, if it is a multi-table join query, this layer will generate multiple single-table query SQL statements.</li><li>If the question involves external knowledge, the LLM will call a third-party plugin.</li></ul><p><strong>Example</strong></p><p><img loading="lazy" alt="LLM-OLAP-chatbot-query-interface" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_7-c20b3cc2b0b00b32bc2825c1d62b1d5d.png" width="2001" height="1126" class="img_ev3q"></p><p>To answer whether a certain song can be performed on variety shows, the system retrieves the OLAP data warehouse for details about the song, and presents it with results from the Commercial Use Query third-party plugin.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="olap-architecture">OLAP Architecture<a href="#olap-architecture" class="hash-link" aria-label="Direct link to OLAP Architecture" title="Direct link to OLAP Architecture"></a></h2><p>As for the OLAP part of this framework, after several rounds of architectural evolution, this is what our current OLAP pipeline looks like. </p><p>Raw data is sorted into tags and metrics, which are custom-defined by the analysts. The tags and metrics are under unified management in order to avoid inconsistent definitions. Then, they are combined into various tagsets and metricsets for various queries. 
</p><p><img loading="lazy" alt="LLM-OLAP-architecture" src="https://cdnd.selectdb.com/assets/images/Tencent_LLM_8-6d517a787c782510bf3869176730ce3a.png" width="1709" height="1119" class="img_ev3q"></p><p>We have drawn two main takeaways for you from our architectural optimization experience.</p><p><strong>1. Streamline the links</strong></p><p>Before we adopted Apache Doris, we used to have ClickHouse to accelerate the computation of tags and metrics, and Elasticsearch to process dimensional data. That&#x27;s two analytic engines and requires us to adapt the query statements to both of them. It was high-maintenance.</p><p>Thus, we replaced ClickHouse with Apache Doris, and utilized the <a href="https://doris.apache.org/docs/dev/lakehouse/multi-catalog/es" target="_blank" rel="noopener noreferrer">Elasticsearch Catalog</a> functionality to connect Elasticsearch data to Doris. In this way, we make Doris our unified query gateway. </p><p><strong>2. Split the flat tables</strong></p><p>In early versions of our OLAP architecture, we used to put data into flat tables, which made things tricky. For one thing, flat tables absorbed all the writing latency from upstreams, and that added up to considerable loss in data realtimeliness. For another, 50% of data in a flat table was dimensional data, which was rarely updated. With every new flat table came some bulky dimensional data that consumed lots of storage space. </p><p>Therefore, we split the flat tables into metric tables and dimension tables. As they are updated in different paces, we put them into different data models.</p><ul><li><strong>Metric tables</strong>: We arrange metric data in the Aggregate Key model of Apache Doris, which means new data will be merged with the old data by way of SUM, MAX, MIN, etc.</li><li><strong>Dimension tables</strong>: These tables are in the Unique Key model of Apache Doris, which means new data record will replace the old. 
This can greatly increase performance in our query scenarios.</li></ul><p>You might ask, does this cause trouble in queries, since most queries require data from both types of tables? Don&#x27;t worry, we address that with the Rollup feature of Doris. On the basis of the base tables, we can select the dimensions we need to create Rollup views, which will automatically execute <code>GROUP BY</code>. This relieves us of the need to define tags for each Rollup view and largely speed up queries.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="other-tricks">Other Tricks<a href="#other-tricks" class="hash-link" aria-label="Direct link to Other Tricks" title="Direct link to Other Tricks"></a></h2><p>In our experience with Apache Doris, we also find some other functionalities handy, so I list them here for you, too:</p><p><strong>1. Materialized View</strong></p><p>A Materialized View is a pre-computed dataset. It is a way to accelerate queries when you frequently need to access data of certain dimensions. In these scenarios, we define derived tags and metrics based on the original ones. For example, we create a derived metric by combining Metric 1, Metric 2, and Metric 3: <code>sum(m1+m2+m3)</code>. Then, we can create a Materialized View for it. According to the Doris release schedule, version 2.1 will support multi-table Materialized Views, and we look forward to that.</p><p><strong>2. Flink-Doris-Connector</strong></p><p>This is for Exactly-Once guarantee in data ingestion. The Flink-Doris-Connector implements a checkpoint mechanism and two-stage commit, and allows for auto data synchronization from relational databases to Doris.</p><p><strong>3. Compaction</strong></p><p>When the number of aggregation tasks or data volume becomes overwhelming for Flink, there might be huge latency in data compaction. We solve that with Vertical Compaction and Segment Compaction. 
Vertical Compaction supports loading of only part of the columns, so it can reduce storage consumption when compacting flat tables. Segment Compaction can avoid generating too much segments during data writing, and allows for compaction while writing simultaneously. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="whats-next">What&#x27;s Next<a href="#whats-next" class="hash-link" aria-label="Direct link to What&#x27;s Next" title="Direct link to What&#x27;s Next"></a></h2><p>With an aim to reduce costs and increase service availability, we plan to test the newly released Storage-Compute Separation and Cross-Cluster Replication of Doris, and we embrace any ideas and inputs about the SuperSonic framework and the Apache Doris project.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Choosing-an-OLAP-Engine-for-Financial-Risk-Management-What-to-Consider">Choosing an OLAP engine for financial risk management: what to consider?</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Jianbo Liu</span></span><time datetime="2023-08-17T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">August 17, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>From a data engineer&#x27;s point of view, financial risk management is a series of data analysis activities on financial data. The financial sector imposes its unique requirements on data engineering. 
This post explains them with a use case of Apache Doris, and provides reference for what you should take into account when choosing an OLAP engine in a financial scenario. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-must-be-combined">Data Must Be Combined<a href="#data-must-be-combined" class="hash-link" aria-label="Direct link to Data Must Be Combined" title="Direct link to Data Must Be Combined"></a></h2><p>The financial data landscape is evolving from standalone to distributed, heterogeneous systems. For example, in this use case scenario, the fintech service provider needs to connect the various transaction processing (TP) systems (MySQL, Oracle, and PostgreSQL) of its partnering banks. Before they adopted an OLAP engine, they were using Kettle to collect data. The ETL tool did not support join queries across different data sources and it could not store data. The ever-enlarging data size at the source end was pushing the system towards latency and instability. That&#x27;s when they decided to introduce an OLAP engine.</p><p>The financial user&#x27;s main pursuit is quick queries on large data volume with as least engineering and maintenance efforts as possible, so when it comes to the choice of OLAP engines, SQL on Hadoop should be crossed off the list due to its huge ecosystem and complicated components. One reason that they landed on Apache Doris was the metadata management capability. Apache Doris collects metadata of various data sources via API so it is a fit for the case which requires combination of different TP systems. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="high-concurrency--high-throughput">High Concurrency &amp; High Throughput<a href="#high-concurrency--high-throughput" class="hash-link" aria-label="Direct link to High Concurrency &amp; High Throughput" title="Direct link to High Concurrency &amp; High Throughput"></a></h2><p>Financial risk control is based on analysis of large amounts of transaction data. 
Sometimes analysts identify abnormalities by combining data from different large tables, and often times they need to check a certain data record, which comes in the form of concurrent point queries in the data system. Thus, the OLAP engine should be able to handle both high-throughput queries and high-concurrency queries. </p><p>To speed up the highly concurrent point queries, you can create <a href="https://doris.apache.org/docs/dev/query-acceleration/materialized-view/" target="_blank" rel="noopener noreferrer">Materialized Views</a> in Apache Doris. A Materialized View is a pre-computed data set stored in Apache Doris so that the system can respond much faster to queries that are frequently conducted. </p><p>To facilitate queries on large tables, you can leverage the <a href="https://doris.apache.org/docs/dev/query-acceleration/join-optimization/colocation-join/" target="_blank" rel="noopener noreferrer">Colocation Join</a> mechanism. Colocation Join minimizes data transfer between computation nodes to reduce overheads brought by data movement. Thus, it can largely improve query speed when joining large tables.</p><p><img loading="lazy" alt="colocation-join" src="https://cdnd.selectdb.com/assets/images/Xingyun_1-d07e739500944ff34d4ad3c75968850b.png" width="1280" height="687" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="log-analysis">Log Analysis<a href="#log-analysis" class="hash-link" aria-label="Direct link to Log Analysis" title="Direct link to Log Analysis"></a></h2><p>Log analysis is important in financial data processing. Real-time processing and monitoring of logs can expose risks in time. Apache Doris provides data storage and analytics capabilities to make log analysis easier and more efficient. As logs are bulky, Apache Doris can deliver a high data compression rate to lower storage costs. 
</p><p>Retrieval is a major part of log analysis, so <a href="https://doris.apache.org/docs/dev/releasenotes/release-2.0.0" target="_blank" rel="noopener noreferrer">Apache Doris 2.0</a> supports inverted index, which is a way to accelerate text searching and equivalence/range queries on numerics and datetime. It allows users to quickly locate the log record that they need among the massive data. The JSON storage feature in Apache Doris is reported to reduce storage costs of user activity logs by 70%, and the variety of parse functions provided can save data engineers from developing their own SQL functions. </p><p><img loading="lazy" alt="log-analysis" src="https://cdnd.selectdb.com/assets/images/Xingyun_2-84440a0d5bfc678448d3a3e3063bd7f9.png" width="1280" height="473" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="easy-maintenance">Easy Maintenance<a href="#easy-maintenance" class="hash-link" aria-label="Direct link to Easy Maintenance" title="Direct link to Easy Maintenance"></a></h2><p>In addition to the easy deployment, Apache Doris has a few mechanisms that are designed to save maintenance efforts. For example, it ensures high availability of cluster nodes with Systemd, and high availability of data with multi-replica and auto-balancing of replicas, so all maintenance required is to back up metadata on a regular basis. Apache Doris also supports <a href="https://doris.apache.org/docs/dev/advanced/partition/dynamic-partition/" target="_blank" rel="noopener noreferrer">dynamic partitioning of data</a>, which means it will automatically create or delete data partitions according to the rules specified by the user. 
This saves efforts in partition management and eliminates possible errors caused by manual management.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="architecture-overview">Architecture Overview<a href="#architecture-overview" class="hash-link" aria-label="Direct link to Architecture Overview" title="Direct link to Architecture Overview"></a></h2><p>This is the overall data architecture in this case. The user utilizes Apache Flume for log data collection, and DataX for data update. Data from multiple sources will be collected into Apache Doris to form a data mart, from which analysts extract information to generate reports and dashboards for reference in risk control and business decisions. As for stability of the data mart itself, Grafana and Prometheus are used to monitor memory usage, compaction score and query response time of Apache Doris to make sure it is running well.</p><p><img loading="lazy" alt="data-architecture" src="https://cdnd.selectdb.com/assets/images/Xingyun_3-ef9c50ef508df963514a76a7365b0490.png" width="1280" height="792" class="img_ev3q"></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Database-in-Fintech-How-to-Support-ten-thousand-Dashboards-Without-Causing-a-Mess">Database in fintech: how to support 10,000 dashboards without causing a mess</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Hou Lan</span></span><time datetime="2023-08-05T00:00:00.000Z" itemprop="datePublished" class="text-black 
ml-4">August 5, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>In a data-intensive industry like finance, data comes from numerous entries and goes to numerous exits. Such status quo can easily, and almost inevitably, lead to chaos in data analysis and management. For example, analysts from different business lines define their own financial metrics in data reports. When you pool these countless reports together in your data architecture, you will find that many metrics overlap or even contradict each other in definition. The consequence is, developing a simple data report will require lots of clarification back and forth, making the process more complicated and time-consuming than it should be.</p><p>As your business grows, your data management will arrive at a point when &quot;standardization&quot; is needed. In terms of data engineering, that means you need a data platform where you can produce and manage all metrics. That&#x27;s your architectural prerequisite to provide efficient financial services. </p><p>This article introduces the lifecycle of financial metrics in a database (in this case, <a href="https://doris.apache.org/" target="_blank" rel="noopener noreferrer">Apache Doris</a>), from how they&#x27;re produced to how they&#x27;re efficiently presented in data reports. You will get an inside view of what&#x27;s behind those fancy financial dashboards. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="define-new-metrics--add-them-to-your-database">Define New Metrics &amp; Add Them to Your Database<a href="#define-new-metrics--add-them-to-your-database" class="hash-link" aria-label="Direct link to Define New Metrics &amp; Add Them to Your Database" title="Direct link to Define New Metrics &amp; Add Them to Your Database"></a></h2><p>Fundamentally, metrics are fields in a table. To provide a more concrete idea of them, I will explain with an example in the banking industry. 
</p><p>Banks measure the assets of customers by AUM (Assets Under Management). In this scenario, AUM is an <strong>atomic metric</strong>, which is often a field in the source data table. On the basis of AUM, analysts derive a series of <strong>derivative metrics</strong>, such as &quot;year-on-year AUM growth&quot;, &quot;month-on-month AUM growth&quot;, and &quot;AUM per customer&quot;.</p><p>Once you define the new metrics, you add them to your data reports, which involves a few simple configurations in Apache Doris:</p><p>Developers update the metadata accordingly, register the base table where the metrics are derived, configure the data granularity and update frequency of intermediate tables, and input the metric name and definition. Some engineers will also monitor the metrics to identify abnormalities and remove redundant metrics based on a metric evaluation system.</p><p>When the metrics are soundly put in place, you can ingest new data into your database to get your data reports. For example, if you ingest CSV files, we recommend the Stream Load method of Apache Doris and a file size of 1~10G per batch. Eventually, these metrics will be visualized in data charts. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="calculate-your-metrics">Calculate Your Metrics<a href="#calculate-your-metrics" class="hash-link" aria-label="Direct link to Calculate Your Metrics" title="Direct link to Calculate Your Metrics"></a></h2><p>As is mentioned, some metrics are produced by combining multiple fields in the source table. In data engineering, that is a multi-table join query. Based on the optimization experience of an Apache Doris user, we recommend flat tables instead of Star/Snowflake Schema. 
The user reduced the query response time on tables of 100 million rows <strong>from 5s to 63ms</strong> after such a change.</p><p><img loading="lazy" alt="join-queries" src="https://cdnd.selectdb.com/assets/images/Pingan_1-ca53619302ca8b80b8fdb1c73a5c39c9.png" width="1280" height="642" class="img_ev3q"></p><p>The flat table solution also eliminates jitter.</p><p><img loading="lazy" alt="reduced-jitter" src="https://cdnd.selectdb.com/assets/images/Pingan_2-325bffe3684325c0fd1970d82aadf4ff.png" width="1280" height="283" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="enable-sql-caching-to-reduce-resource-consumption">Enable SQL Caching to Reduce Resource Consumption<a href="#enable-sql-caching-to-reduce-resource-consumption" class="hash-link" aria-label="Direct link to Enable SQL Caching to Reduce Resource Consumption" title="Direct link to Enable SQL Caching to Reduce Resource Consumption"></a></h2><p>Analysts often check data reports of the same metrics on a regular basis. These reports are produced by the same SQL, so one way to further improve query speed is SQL caching. 
Here is how it turns out in a use case with SQL caching enabled.</p><ul><li>All queries are responded to within 10ms;</li><li>When computing 30 metrics simultaneously (over 120 SQL commands), results can be returned within 600ms;</li><li>A TPS (Transactions Per Second) of 300 is reached, with CPU, memory, disk, and I/O usage under 80%;</li><li>Under the recommended cluster size, over 10,000 metrics can be cached, which means you can save a lot of computation resources.</li></ul><p><img loading="lazy" alt="reduced-computation-resources" src="https://cdnd.selectdb.com/assets/images/Pingan_3-6f36c1669284dcc3672824c3fa772c55.png" width="1280" height="1212" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>The complexity of data analysis in the financial industry lies in the data itself rather than the engineering side. Thus, the underlying data architecture should focus on facilitating the unified and efficient management of data. Apache Doris provides the flexibility of simple metric registration and the ability of fast and resource-efficient metric computation. 
In this case, the user is able to handle 10,000 active financial metrics in 10,000 dashboards with 30% less ETL efforts.</p><p>Find Apache Doris developers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/For-Entry-Level-Data-Engineers-How-to-Build-a-Simple-but-Solid-Data-Architecture">For entry-level data engineers: how to build a simple but solid data architecture</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Zhenwei Liu</span></span><time datetime="2023-07-31T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 31, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>This article aims to provide reference for non-tech companies who are seeking to empower their business with data analytics. You will learn the basics about how to build an efficient and easy-to-use data system, and I will walk you through every aspect of it with a use case of Apache Doris, an MPP-based analytic data warehouse. 
</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="what-you-need">What You Need<a href="#what-you-need" class="hash-link" aria-label="Direct link to What You Need" title="Direct link to What You Need"></a></h2><p>This case is about a ticketing service provider who wants a data platform that boasts quick processing, low maintenance costs, and ease of use, and I think they speak for the majority of entry-level database users.</p><p>A prominent feature of ticketing services is the periodic spikes in ticket orders, you know, before the shows go on. So from time to time, the company has a huge amount of new data rushing in and requires real-time processing of it, so they can make timely adjustments during the short sales window. But at other times, they don&#x27;t want to spend too much energy and funds on maintaining the data system. Furthermore, for a beginner of digital operation who only requires basic analytic functions, it is better to have a data architecture that is easy-to-grasp and user-friendly. After research and comparison, they came to the Apache Doris community and we helped them build a Doris-based data architecture.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="simple-architecture">Simple Architecture<a href="#simple-architecture" class="hash-link" aria-label="Direct link to Simple Architecture" title="Direct link to Simple Architecture"></a></h2><p>The building blocks of this architecture are simple. You only need Apache Flink and Apache Kafka for data ingestion, and Apache Doris as an analytic data warehouse. </p><p><img loading="lazy" alt="simple-data-architecture-with-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/Poly_1-4657c20d910093fd2ab45c5355bf13dc.png" width="1280" height="599" class="img_ev3q"></p><p>Connecting data sources to the data warehouse is simple, too. The key component, Apache Doris, supports various data loading methods to fit with different data sources. 
You can perform column mapping, transforming, and filtering during data loading to avoid duplicate collection of data. To ingest a table, users only need to add the table name to the configurations, instead of writing a script themselves. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-update">Data Update<a href="#data-update" class="hash-link" aria-label="Direct link to Data Update" title="Direct link to Data Update"></a></h2><p>Flink CDC was found to be the optimal choice if you are looking for higher stability in data ingestion. It also allows you to update the dynamically changing tables in real time. The process includes the following steps:</p><ul><li>Configure Flink CDC for the source MySQL database, so that it allows dynamic updating of the table management configurations (which you can think of as the &quot;metadata&quot;).</li><li>Create two CDC jobs in Flink, one to capture the changed data (the Forward stream), the other to update the table management configurations (the Broadcast stream).</li><li>Configure all tables of the source database at the Sink end (the output end of Flink CDC). When there is newly added table in the source database, the Broadcast stream will be triggered to update the table management configurations. (You just need to configure the tables, instead of &quot;creating&quot; the tables.)</li></ul><p><img loading="lazy" alt="configure-Flink-CDC" src="https://cdnd.selectdb.com/assets/images/Poly_2-0bd77b804cf526923be9c603871a34e7.png" width="1280" height="899" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="layering-of-data-warehouse">Layering of Data Warehouse<a href="#layering-of-data-warehouse" class="hash-link" aria-label="Direct link to Layering of Data Warehouse" title="Direct link to Layering of Data Warehouse"></a></h2><p>Data flows from various sources into the data warehouse, where it is cleaned and organized before it is ready for queries and analysis. 
The data processing here is divided into five typical layers. Such layering simplifies the data cleaning process because it provides a clear division of labor and makes things easier to locate and comprehend. </p><ul><li><strong>ODS</strong>: This is the prep zone of the data warehouse. The unprocessed original data is put in the <a href="https://doris.apache.org/docs/dev/data-table/data-model/#unique-model" target="_blank" rel="noopener noreferrer">Unique Key Model</a> of Apache Doris, which can avoid duplication of data. </li><li><strong>DWD</strong>: This layer cleans, formats, and de-identifies data to produce fact tables. Every detailed data record is preserved. Data in this layer is also put into the Unique Key Model.</li><li><strong>DWS</strong>: This layer produces flat tables of a certain theme (order, user, etc.) based on data from the DWD layer. </li><li><strong>ADS</strong>: This layer auto-aggregates data, which is implemented by the <a href="https://doris.apache.org/docs/dev/data-table/data-model/#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate Key Model</a> of Apache Doris.</li><li><strong>DIM</strong>: The DIM layer accommodates dimension data (in this case, data about the theaters, projects, and show sessions, etc.), which is used in combination with the order details.</li></ul><p>After the original data goes through these layers, it is available for queries via one data export interface.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="reporting">Reporting<a href="#reporting" class="hash-link" aria-label="Direct link to Reporting" title="Direct link to Reporting"></a></h2><p>Like many non-tech business, the ticketing service provider needs a data warehouse mainly for reporting. They derive trends and patterns from all kinds of data reports, and then figure out ways towards efficient management and sales increase. 
Specifically, this is the information they are observing in their reports:</p><ul><li><strong>Statistical Reporting</strong>: These are the most frequently used reports, including sales reports by theater, distribution channel, sales representative, and show.</li><li><strong>Agile Reporting</strong>: These are reports developed for specific purposes, such as daily and weekly project data reports, sales summary reports, GMV reports, and settlement reports.</li><li><strong>Data Analysis</strong>: This involves data such as membership orders, attendance rates, and user portraits.</li><li><strong>Dashboarding</strong>: This is to visually display sales data.</li></ul><p><img loading="lazy" alt="Real-Time-Data-Warehouse-and-Reporting" src="https://cdnd.selectdb.com/assets/images/Poly_3-8dbc669ac5f492a38335618a36ef214f.png" width="1280" height="584" class="img_ev3q"></p><p>These are all entry-level tasks in data analytics. One of the biggest burdens for the data engineers was to quickly develop new reports as the internal analysts required. The <a href="https://doris.apache.org/docs/dev/data-table/data-model#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate Key Model</a> of Apache Doris is designed for this. 
</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="quick-aggregation-to-produce-reports-on-demand">Quick aggregation to produce reports on demand<a href="#quick-aggregation-to-produce-reports-on-demand" class="hash-link" aria-label="Direct link to Quick aggregation to produce reports on demand" title="Direct link to Quick aggregation to produce reports on demand"></a></h3><p>For example, supposing that analysts want a sales report by sales representatives, data engineers can produce that by simple configuration:</p><ol><li>Put the original data in the Aggregate Key Model</li><li>Specify the sales representative ID column and the payment date column as the Key columns, and the order amount column as the Value column</li></ol><p>Then, order amounts of the same sale representative within the specified period of time will be auto-aggregated. Bam! That&#x27;s the report you need! </p><p>According to the user, this whole process only takes them 10~30 minutes, depending on the complexity of the report required. So the Aggregate Key Model largely releases data engineers from the pressure of report development.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="quick-response-to-data-queries">Quick response to data queries<a href="#quick-response-to-data-queries" class="hash-link" aria-label="Direct link to Quick response to data queries" title="Direct link to Quick response to data queries"></a></h3><p>Most data analysts would just want their target data to be returned the second they need it. In this case, the user often leverages two capabilities of Apache Doris to realize quick query response.</p><p>Firstly, Apache Doris is famously fast in Join queries. So if you need to extract information across multiple tables, you are in good hands. Secondly, in data analysis, it often happens that analysts frequently input the same request. For example, they frequently want to check the sales data of different theaters. 
In this scenario, Apache Doris allows you to create a <a href="https://doris.apache.org/docs/dev/query-acceleration/materialized-view/" target="_blank" rel="noopener noreferrer">Materialized View</a>, which means you pre-aggregate the sales data of each theater, and store this table in isolation from the original tables. In this way, every time you need to check the sales data by theater, the system directly goes to the Materialized View and reads data from there, instead of scanning the original table all over again. This can increase query speed by orders of magnitudes.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>This is the overview of a simple data architecture and how it can provide the data services you need. It ensures data ingestion stability and quality with Flink CDC, and quick data analysis with Apache Doris. The deployment of this architecture is simple, too. If you plan for a data analytic upgrade for your business, you might refer to this case. 
If you need advice and help, you may join our <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">community here</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Database-Dissection-How-Fast-Data-Queries-Are-Implemented">Database dissection: how fast data queries are implemented</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Rong Hou</span></span><time datetime="2023-07-16T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 16, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>In data analytics, fast query performance is more of a result than a guarantee. What&#x27;s more important than the result itself is the architectural design and mechanism that enables quick performance. This is exactly what this post is about. I will put you into context with a typical use case of Apache Doris, an open-source MPP-based analytic database.</p><p>The user in this case is an all-category Q&amp;A website. As a billion-dollar listed company, they have their own data management platform. What Doris does is to support the data filtering, packaging, analyzing, and monitoring workloads of that platform. Based on their huge data size, the user demands quick data loading and quick response to queries. 
</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="how-to-enable-quick-queries-on-huge-dataset">How to Enable Quick Queries on Huge Dataset<a href="#how-to-enable-quick-queries-on-huge-dataset" class="hash-link" aria-label="Direct link to How to Enable Quick Queries on Huge Dataset" title="Direct link to How to Enable Quick Queries on Huge Dataset"></a></h2><ul><li><strong>Scenario</strong>: user segmentation for the website</li><li><strong>Data size</strong>: 100 billion data objects, 2.4 million tags</li><li><strong>Requirements</strong>: query response time &lt; 1 second; result packaging &lt; 10 seconds</li></ul><p>For these goals, the engineers have made three critical changes in their data processing pipeline.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1distribute-the-data">1.Distribute the data<a href="#1distribute-the-data" class="hash-link" aria-label="Direct link to 1.Distribute the data" title="Direct link to 1.Distribute the data"></a></h3><p>User segmentation is when analysts pick out a group of website users that share certain characteristics (tags). In the database system, this process is implemented by a bunch of set operations (union, intersection, and difference). </p><p><strong>Narration from the engineers:</strong></p><p>We realize that instead of executing set operations on one big dataset, we can divide our dataset into smaller ones, execute set operations on each of them, and then merge all the results. In this way, each small dataset is computed by one thread/queue. Then we have a queue to do the final merging. 
It&#x27;s simple distributed computing thinking.</p><p><img loading="lazy" alt="distributed-computing-in-database" src="https://cdnd.selectdb.com/assets/images/Zhihu_1-7c5ee52877c98c9502ba57d03becdd9b.png" width="1280" height="651" class="img_ev3q"></p><p>Example:</p><ol><li>Every 1 million users are put into one group with a <code>group_id</code>.</li><li>All user tags in that same group will relate to the corresponding <code>group_id</code>.</li><li>Calculate the union/intersection/difference within each group. (Enable multi-thread mode to increase computation efficiency.)</li><li>Merge the results from the groups.</li></ol><p>The problem here is, since user tags are randomly distributed across various machines, the computation entails multi-time shuffling, which brings huge network overhead. That leads to the second change.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2pre-bind-a-data-group-to-a-machine">2.Pre-bind a data group to a machine<a href="#2pre-bind-a-data-group-to-a-machine" class="hash-link" aria-label="Direct link to 2.Pre-bind a data group to a machine" title="Direct link to 2.Pre-bind a data group to a machine"></a></h3><p>This is enabled by the Colocate mechanism of Apache Doris. The idea of Colocate is to place data chunks that are often accessed together onto the same node, so as to reduce cross-node data transfer and thus, get lower latency.</p><p><img loading="lazy" alt="colocate-mechanism" src="https://cdnd.selectdb.com/assets/images/Zhihu_2-6f75c0c47ef7106018774d6a70bf0e99.png" width="1280" height="331" class="img_ev3q"></p><p>The implementation is simple: Bind one group key to one machine. Then naturally, data corresponding to that group key will be pre-bound to that machine. 
</p><p>The following is the query plan before we adopted Colocate: It is complicated, with a lot of data shuffling.</p><p><img loading="lazy" alt="complicated-data-shuffling" src="https://cdnd.selectdb.com/assets/images/Zhihu_3-a6af7fe391aa9eaa717e558112e38d18.png" width="720" height="765" class="img_ev3q"></p><p>This is the query plan after. It is much simpler, which is why queries are much faster and less costly.</p><p><img loading="lazy" alt="simpler-query-plan-after-colocation-join" src="https://cdnd.selectdb.com/assets/images/Zhihu_4-ad4a6e9be6d812a88220544a77ce1c73.png" width="1280" height="616" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3merge-the-operators">3.Merge the operators<a href="#3merge-the-operators" class="hash-link" aria-label="Direct link to 3.Merge the operators" title="Direct link to 3.Merge the operators"></a></h3><p>In data queries, the engineers realized that they often use a couple of functions in combination, so they decided to develop compound functions to further improve execution efficiency. They came to the Doris <a href="https://t.co/XD4uUSROft" target="_blank" rel="noopener noreferrer">community</a> and talked about their thoughts. The Doris developers provided support for them and soon the compound functions were ready for use on Doris. 
These are a few examples:</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">bitmap_and_count == bitmap_count(bitmap_and(bitmap1, bitmap2))</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">bitmap_and_not_count == bitmap_count(bitmap_not(bitmap1, bitmap_and(bitmap1, bitmap2))</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">orthogonal_bitmap_union_count==bitmap_and(bitmap1,bitmap_and(bitmap2,bitmap3)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Query execution with one compound function is much faster than that with a chain of simple functions, as you can tell from the lengths of the flow charts:</p><p><img loading="lazy" alt="operator-merging" src="https://cdnd.selectdb.com/assets/images/Zhihu_5-8ad26e082d2a60188e8928ab82192330.png" width="1280" height="396" class="img_ev3q"></p><ul><li><strong>Multiple Simple functions</strong>: This involves three function executions and two intermediate storage. 
It&#x27;s a long and slow process.</li><li><strong>One compound function</strong>: Simple in and out.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="how-to-quickly-ingest-large-amounts-of-data">How to Quickly Ingest Large Amounts of Data<a href="#how-to-quickly-ingest-large-amounts-of-data" class="hash-link" aria-label="Direct link to How to Quickly Ingest Large Amounts of Data" title="Direct link to How to Quickly Ingest Large Amounts of Data"></a></h2><p>This is about putting the right workload on the right component. Apache Doris supports a variety of data loading methods. After trials and errors, the user settled on Spark Load and thus decreased their data loading time by 90%. </p><p><strong>Narration from the engineers:</strong></p><p>In offline data ingestion, we used to perform most computation in Apache Hive, write the data files to HDFS, and pull data regularly from HDFS to Apache Doris. However, after Doris obtains parquet files from HDFS, it performs a series of operations on them before it can turn them into segment files: decompressing, bucketing, sorting, aggregating, and compressing. These workloads will be borne by Doris backends, which have to undertake a few bitmap operations at the same time. So there is a huge pressure on the CPU. </p><p><img loading="lazy" alt="Broker-Load" src="https://cdnd.selectdb.com/assets/images/Zhihu_6-10aa0935e2acd8774b0cb1f70d7013e8.png" width="1280" height="629" class="img_ev3q"></p><p>So we decided on the Spark Load method. It allows us to split the ingestion process into two parts: computation and storage, so we can move all the bucketing, sorting, aggregating, and compressing to Spark clusters. 
Then Spark writes the output to HDFS, from which Doris pulls data and flushes it to the local disks.</p><p><img loading="lazy" alt="Spark-Load" src="https://cdnd.selectdb.com/assets/images/Zhihu_7-5eacf11ecef47a4bdebd2b820d1f2bd6.png" width="1280" height="372" class="img_ev3q"></p><p>When ingesting 1.2 TB data (that&#x27;s 110 billion rows), the Spark Load method only took 55 minutes. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="a-vectorized-execution-engine">A Vectorized Execution Engine<a href="#a-vectorized-execution-engine" class="hash-link" aria-label="Direct link to A Vectorized Execution Engine" title="Direct link to A Vectorized Execution Engine"></a></h2><p>In addition to the above changes, a large part of the performance of a database relies on its execution engine. In the case of Apache Doris, it has fully vectorized its storage and computation layers since version 1.1. The longtime user also witnessed this revolution, so we invited them to test how the vectorized engine worked.</p><p>They compared query response time before and after the vectorization in seven of its frequent scenarios:</p><ul><li>Scenario 1: Simple user segmentation (hundreds of filtering conditions), data packaging of a multi-million user group.</li><li>Scenario 2: Complicated user segmentation (thousands of filtering conditions), data packaging of a tens-of-million user group.</li><li>Scenario 3: Multi-dimensional filtering (6 dimensions), single-table query, <strong>single-date flat table</strong>, data aggregation, 180 million rows per day.</li><li>Scenario 4: Multi-dimensional filtering (6 dimensions), single-table query, <strong>multi-date flat table</strong>, data aggregation, 180 million rows per day.</li><li>Scenario 5: <strong>Single-table query</strong>, COUNT, 180 million rows per day.</li><li>Scenario 6: <strong>Multi-table query</strong>, (Table A: 180 million rows, SUM, COUNT; Table B: 1.5 million rows, bitmap aggregation), aggregate Table A and Table B, join 
them with Table C, and then join the sub-tables, six joins in total.</li><li>Scenario 7: Single-table query, 500 million rows of itemized data</li></ul><p>The results are as below:</p><p><img loading="lazy" alt="performance-after-vectorization" src="https://cdnd.selectdb.com/assets/images/Zhihu_8-db8b7d375c494f0e806a2286ea9144b0.png" width="1280" height="591" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>In short, what contributed to the fast data loading and data queries in this case?</p><ul><li>The Colocate mechanism that&#x27;s designed for distributed computing</li><li>Collaboration between database users and <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">developers</a> that enables the operator merging</li><li>Support for a wide range of data loading methods to choose from</li><li>A vectorized engine that brings overall performance increase</li></ul><p>It takes efforts from both the database developers and users to make fast performance possible. 
The user&#x27;s experience and knowledge of their own status quo will allow them to figure out the quickest path, while a good database design will help pave the way and make users&#x27; life easier.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Listen-to-That-Poor-BI-Engineer-We-Need-Fast-Joins">Listen to that poor BI engineer: we need fast joins</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Baoming Zhang</span></span><time datetime="2023-07-10T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 10, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Business intelligence (BI) tool is often the last stop of a data processing pipeline. It is where data is visualized for analysts who then extract insights from it. From the standpoint of a SaaS BI provider, what are we looking for in a database? In my job, we are in urgent need of support for fast join queries.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="why-join-query-matters">Why JOIN Query Matters<a href="#why-join-query-matters" class="hash-link" aria-label="Direct link to Why JOIN Query Matters" title="Direct link to Why JOIN Query Matters"></a></h2><p>I work as an engineer that supports a human resource management system. One prominent selling point of our services is <strong>self-service</strong> <strong>BI</strong>. 
That means we allow users to customize their own dashboards: they can choose the fields they need and relate them to form the dataset as they want. </p><p><img loading="lazy" alt="self-service-BI" src="https://cdnd.selectdb.com/assets/images/Moka_1-6653b0bedab8b84497aad6667ab2db9c.png" width="1280" height="709" class="img_ev3q"></p><p>Join query is a more efficient way to realize self-service BI. It allows people to break down their data assets into many smaller tables instead of putting it all in a flat table. This would make data updates much faster and more cost-effective, because updating the whole flat table is not always the optimal choice when you have plenty of new data flowing in and old data being updated or deleted frequently, as is the case for most data input.</p><p>In order to maximize the time value of data, we need data updates to be executed really quickly. For this purpose, we looked into three OLAP databases on the market. They are all fast in some way but there are some differences.</p><p><img loading="lazy" alt="Apache-Doris-VS-ClickHouse-VS-Greenplum" src="https://cdnd.selectdb.com/assets/images/Moka_2-fe0c3aef14ac2449ef661d83ca293e8d.png" width="1280" height="627" class="img_ev3q"></p><p>Greenplum is really quick in data loading and batch DML processing, but it is not good at handling high concurrency. There is a steep decline in performance as query concurrency rises. This can be risky for a BI platform that tries to ensure stable user experience. ClickHouse is mind-blowing in single-table queries, but it only allows batch update and batch delete, so that&#x27;s less timely.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="welcome-to-join-hell">Welcome to JOIN Hell<a href="#welcome-to-join-hell" class="hash-link" aria-label="Direct link to Welcome to JOIN Hell" title="Direct link to Welcome to JOIN Hell"></a></h2><p>JOIN, my old friend JOIN, is always a hassle. Join queries are demanding for both engineers and the database system. 
Firstly, engineers must have a thorough grasp of the schema of all tables. Secondly, these queries are resource-intensive, especially when they involve large tables. Some of the reports on our platform entail join queries across up to 20 tables. Just imagine the mess.</p><p>We tested our candidate OLAP engines with our common join queries and our most notorious slow queries. </p><p><img loading="lazy" alt="Apache-Doris-VS-ClickHouse" src="https://cdnd.selectdb.com/assets/images/Moka_3-dab994e57f63d5b0b6c72b18de3a562b.png" width="1280" height="726" class="img_ev3q"></p><p>As the number of tables joined grows, we witness a widening performance gap between Apache Doris and ClickHouse. In most join queries, Apache Doris was about 5 times faster than ClickHouse. In terms of slow queries, Apache Doris responded to most of them within less than 1 second, while the performance of ClickHouse fluctuated within a relatively large range. </p><p>And just like that, we decided to upgrade our data architecture with Apache Doris. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="architecture-that-supports-our-bi-services">Architecture that Supports Our BI Services<a href="#architecture-that-supports-our-bi-services" class="hash-link" aria-label="Direct link to Architecture that Supports Our BI Services" title="Direct link to Architecture that Supports Our BI Services"></a></h2><p><strong>Data Input:</strong> </p><p>Our business data flows into DBLE, a distributed middleware based on MySQL. Then the DBLE binlogs are written into Flink, getting deduplicated, merged, and then put into Kafka. Finally, Apache Doris reads data from Kafka via its Routine Load approach. We apply the &quot;delete&quot; configuration in Routine Load to enable real-time deletion. The combination of Apache Flink and the idempotent write mechanism of Apache Doris is how we get exactly-once guarantee. 
We have a data size of billions of rows per table, and this architecture is able to finish data updates in one minute. </p><p>In addition, taking advantage of Apache Kafka and the Routine Load method, we are able to shave the traffic peaks and maintain cluster stability. Kafka also allows us to have multiple consumers of data and recompute intermediate data by resetting the offsets.</p><p><strong>Data Output</strong>: </p><p>As a self-service BI platform, we allow users to customize their own reports by configuring the rows, columns, and filters as they need. This is supported by Apache Doris with its capabilities in join queries. </p><p>In total, we have 400 data tables, of which 50 have over 100 million rows. That adds up to a data size measured in TB. We put all our data into two Doris clusters on 40 servers.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="no-longer-stalled-by-privileged-access-queries">No Longer Stalled by Privileged Access Queries<a href="#no-longer-stalled-by-privileged-access-queries" class="hash-link" aria-label="Direct link to No Longer Stalled by Privileged Access Queries" title="Direct link to No Longer Stalled by Privileged Access Queries"></a></h2><p>On our BI platform, privileged queries are often much slower than non-privileged queries. Timeout is often the case and even more so for queries on large datasets.</p><p>Human resource data is subject to very strict and fine-grained access control policies. The role and position of users and the confidentiality level of data determine who has access to what (the data granularity here is up to fields in a table). Occasionally, we need to separately grant a certain privilege to a particular person. On top of that, we need to ensure data isolation between the multiple tenants on our platform.</p><p>How does all this add to complexity in engineering? 
Any user who inputs a query on our BI platform must go through multi-factor authentication, and the authenticated information will all be inserted into the SQL via <code>in</code> and then passed on to the OLAP engine. Therefore, the more fine-grained the privilege controls are, the longer the SQL will be, and the more time the OLAP system will spend on ID filtering. That&#x27;s why our users are often tortured by high latency.</p><p><img loading="lazy" alt="privileged-access-queries" src="https://cdnd.selectdb.com/assets/images/Moka_4-64db81a5dd0659c2fe09805142c25b39.png" width="1396" height="650" class="img_ev3q"></p><p>So how did we fix that? We use the <a href="https://doris.apache.org/docs/dev/data-table/index/bloomfilter/" target="_blank" rel="noopener noreferrer">Bloom Filter index</a> in Apache Doris. </p><p><img loading="lazy" alt="BloomFilter-index" src="https://cdnd.selectdb.com/assets/images/Moka_5-666c3e530937abfa6243f0f3bb1f645c.png" width="1280" height="118" class="img_ev3q"></p><p>By adding Bloom Filter indexes to the relevant ID fields, we improve the speed of privileged queries by 30% and basically eliminate timeout errors.</p><p><img loading="lazy" alt="faster-privileged-access-queries" src="https://cdnd.selectdb.com/assets/images/Moka_6-946cd1d988bc4d2cd18f580775cb89a7.png" width="1852" height="863" class="img_ev3q"></p><p>Tips on when you should use the Bloom Filter index:</p><ul><li>For non-prefix filtering</li><li>For <code>in</code> and <code>=</code> filters on a particular column</li><li>For filtering on high-cardinality columns, such as UserID. In essence, the Bloom Filter index is used to check if a certain value exists in a dataset. 
There is no point in using the Bloom Filter index for a low-cardinality column, like &quot;gender&quot;, for example, because almost every data block contains all the gender values.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="to-all-bi-engineers">To All BI Engineers<a href="#to-all-bi-engineers" class="hash-link" aria-label="Direct link to To All BI Engineers" title="Direct link to To All BI Engineers"></a></h2><p>We believe self-service BI is the future in the BI landscape, just like AGI is the future for artificial intelligence. Fast join queries is the way towards it, and the foregoing architectural upgrade is part of our ongoing effort to empower that. May there be less painful JOINs in the BI world. Cheers.</p><p>Find the Apache Doris developers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Replacing-Apache-Hive-Elasticsearch-and-PostgreSQL-with-Apache-Doris">Replacing Apache Hive, Elasticsearch and PostgreSQL with Apache Doris</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Tao Wang</span></span><time datetime="2023-07-01T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 1, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>How does a data service company build its data warehouse? 
I worked as a real-time computing engineer for a due diligence platform, which is designed to allow users to search for a company&#x27;s business data, financial, and legal details. It has collected information of over 300 million entities in more than 300 dimensions. The duty of my colleagues and me is to ensure real-time updates of such data so we can provide up-to-date information for our registered users. That&#x27;s the customer-facing function of our data warehouse. Other than that, it needs to support our internal marketing and operation team in ad-hoc queries and user segmentation, which is a new demand that emerged with our growing business. </p><p>Our old data warehouse consisted of the most popular components of the time, including <strong>Apache</strong> <strong>Hive</strong>, <strong>MySQL</strong>, <strong>Elasticsearch</strong>, and <strong>PostgreSQL</strong>. They support the data computing and data storage layers of our data warehouse: </p><ul><li><strong>Data Computing</strong>: Apache Hive serves as the computation engine.</li><li><strong>Data Storage</strong>: <strong>MySQL</strong> provides data for DataBank, Tableau, and our customer-facing applications. <strong>Elasticsearch</strong> and <strong>PostgreSQL</strong> serve for our DMP user segmentation system: the former stores user profiling data, and the latter stores user group data packets. </li></ul><p>As you can imagine, a long and complicated data pipeline is high-maintenance and detrimental to development efficiency. Moreover, they are not capable of ad-hoc queries. 
So as an upgrade to our data warehouse, we replaced most of these components with <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">Apache Doris</a>, a unified analytic database.</p><p><img loading="lazy" alt="replace-MySQL-Elasticsearch-PostgreSQL-with-Apache-Doris-before" src="https://cdnd.selectdb.com/assets/images/Tianyancha_1-9cc7124fc979257cf029e086ce018e78.png" width="1280" height="640" class="img_ev3q"></p><p><img loading="lazy" alt="replace-MySQL-Elasticsearch-PostgreSQL-with-Apache-Doris-after" src="https://cdnd.selectdb.com/assets/images/Tianyancha_2-56765f2ef0a2d26069c3cd115e694882.png" width="1280" height="548" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-flow">Data Flow<a href="#data-flow" class="hash-link" aria-label="Direct link to Data Flow" title="Direct link to Data Flow"></a></h2><p>This is a lateral view of our data warehouse, from which you can see how the data flows.</p><p><img loading="lazy" alt="data-flow" src="https://cdnd.selectdb.com/assets/images/Tianyancha_3-733959d2cc60e873ec5b3b9fc06d9e0e.png" width="1280" height="489" class="img_ev3q"></p><p>For starters, binlogs from MySQL will be ingested into Kafka via Canal, while user activity logs will be transferred to Kafka via Apache Flume. In Kafka, data will be cleaned and organized into flat tables, which will be later turned into aggregated tables. Then, data will be passed from Kafka to Apache Doris, which serves as the storage and computing engine. 
</p><p>We adopt different data models in Apache Doris for different scenarios: data from MySQL will be arranged in the <a href="https://doris.apache.org/docs/dev/data-table/data-model/#unique-model" target="_blank" rel="noopener noreferrer">Unique model</a>, log data will be put in the <a href="https://doris.apache.org/docs/dev/data-table/data-model/#duplicate-model" target="_blank" rel="noopener noreferrer">Duplicate model</a>, while data in the DWS layer will be merged in the <a href="https://doris.apache.org/docs/dev/data-table/data-model/#aggregate-model" target="_blank" rel="noopener noreferrer">Aggregate model</a>.</p><p>This is how Apache Doris replaces the roles of Hive, Elasticsearch, and PostgreSQL in our data warehouse. Such transformation has saved us lots of effort in development and maintenance. It also made ad-hoc queries possible and our user segmentation more efficient. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ad-hoc-queries">Ad-Hoc Queries<a href="#ad-hoc-queries" class="hash-link" aria-label="Direct link to Ad-Hoc Queries" title="Direct link to Ad-Hoc Queries"></a></h2><p><strong>Before</strong>: Every time a new request was raised, we developed and tested the data model in Hive, and wrote the scheduling task in MySQL so that our customer-facing application platforms could read results from MySQL. It was a complicated process that took a lot of time and development work. </p><p><strong>After</strong>: Since Apache Doris has all the itemized data, whenever it is faced with a new request, it can simply pull the metadata and configure the query conditions. Then it is ready for ad-hoc queries. In short, it only requires low-code configuration to respond to new requests. 
</p><p><img loading="lazy" alt="ad-hoc-queries" src="https://cdnd.selectdb.com/assets/images/Tianyancha_4-9a9132537dbc478b0aa9948131184564.png" width="1280" height="712" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="user-segmentation">User Segmentation<a href="#user-segmentation" class="hash-link" aria-label="Direct link to User Segmentation" title="Direct link to User Segmentation"></a></h2><p><strong>Before</strong>: After a user segmentation task was created based on metadata, the relevant user IDs would be written into the PostgreSQL profile list and the MySQL task list. Meanwhile, Elasticsearch would execute the query according to the task conditions; after the results are produced, it would update status in the task list and write the user group bitmap package into PostgreSQL. (The PostgreSQL plug-in is capable of computing the intersection, union, and difference set of bitmap.) Then PostgreSQL would provide user group packets for downstream operation platforms.</p><p>Tables in Elasticsearch and PostgreSQL were unreusable, making this architecture cost-ineffective. Plus, we had to pre-define the user tags before we could execute a new type of query. That slowed things down. </p><p><strong>After</strong>: The user IDs will only be written into the MySQL task list. For first-time segmentation, Apache Doris will execute the <strong>ad-hoc query</strong> based on the task conditions. In subsequent segmentation tasks, Apache Doris will perform <strong>micro-batch rolling</strong> and compute the difference set compared with the previously produced user group packet, and notify downstream platforms of any updates. (This is realized by the <a href="https://doris.apache.org/docs/dev/sql-manual/sql-functions/bitmap-functions/bitmap_union" target="_blank" rel="noopener noreferrer">bitmap functions</a> in Apache Doris.) </p><p>In this Doris-centered user segmentation process, we don&#x27;t have to pre-define new tags. 
Instead, tags can be auto-generated based on the task conditions. The processing pipeline has the flexibility that can make our user-group-based A/B testing easier. Also, as both the itemized data and user group packets are in Apache Doris, we don&#x27;t have to attend to the read and write complexity between multiple components.</p><p><img loading="lazy" alt="user-segmentation-pipeline" src="https://cdnd.selectdb.com/assets/images/Tianyancha_5-82288dba1ffdb438be29168a2eafd7f9.png" width="1280" height="688" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="trick-to-speed-up-user-segmentation-by-70">Trick to Speed up User Segmentation by 70%<a href="#trick-to-speed-up-user-segmentation-by-70" class="hash-link" aria-label="Direct link to Trick to Speed up User Segmentation by 70%" title="Direct link to Trick to Speed up User Segmentation by 70%"></a></h2><p>Due to risk aversion reasons, random generation of <code>user_id</code> is the choice for many companies, but that produces sparse and non-consecutive user IDs in user group packets. Using these IDs in user segmentation, we had to endure a long waiting time for bitmaps to be generated. </p><p>To solve that, we created consecutive and dense mappings for these user IDs. 
<strong>In this way, we decreased our user segmentation latency by 70%.</strong></p><p><img loading="lazy" alt="user-segmentation-latency-1" src="https://cdnd.selectdb.com/assets/images/Tianyancha_6-22694f7b8d5e06aa2c8c4757c52c8c05.png" width="1030" height="218" class="img_ev3q"></p><p><img loading="lazy" alt="user-segmentation-latency-2" src="https://cdnd.selectdb.com/assets/images/Tianyancha_7-e5d5d3312ade5d026533922a01207660.png" width="1280" height="698" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="example">Example<a href="#example" class="hash-link" aria-label="Direct link to Example" title="Direct link to Example"></a></h3><p><strong>Step 1: Create a user ID mapping table:</strong></p><p>We adopt the Unique model for user ID mapping tables, where the user ID is the unique key. The mapped consecutive IDs usually start from 1 and are strictly increasing. </p><p><img loading="lazy" alt="create-user-ID-mapping-table" src="https://cdnd.selectdb.com/assets/images/Tianyancha_8-74c77b6500d66dfb6aa2fc8ba742868c.png" width="1280" height="540" class="img_ev3q"></p><p><strong>Step 2: Create a user group table:</strong></p><p>We adopt the Aggregate model for user group tables, where user tags serve as the aggregation keys. </p><p><img loading="lazy" alt="create-user-group-table" src="https://cdnd.selectdb.com/assets/images/Tianyancha_9-76a30c385266aadc57e8ab898cc53bce.png" width="1280" height="604" class="img_ev3q"></p><p>Supposing that we need to pick out the users whose IDs are between 0 and 2000000. </p><p>The following snippets use non-consecutive (<code>tyc_user_id</code>) and consecutive (<code>tyc_user_id_continuous</code>) user IDs for user segmentation, respectively. 
There is a big gap between their <strong>response time:</strong></p><ul><li>Non-Consecutive User IDs: <strong>1843ms</strong></li><li>Consecutive User IDs: <strong>543ms</strong> </li></ul><p><img loading="lazy" alt="response-time-of-consecutive-and-non-consecutive-user-IDs" src="https://cdnd.selectdb.com/assets/images/Tianyancha_10-c239e3a39b72d21c1d65fc74858b36a3.png" width="1920" height="736" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>We have 2 clusters in Apache Doris accommodating tens of TBs of data, with almost a billion new rows flowing in every day. We used to witness a steep decline in data ingestion speed as data volume expanded. But after upgrading our data warehouse with Apache Doris, we increased our data writing efficiency by 75%. Also, in user segmentation with a result set of less than 5 million, it is able to respond within milliseconds. Most importantly, our data warehouse has been simpler and friendlier to developers and maintainers. 
</p><p><img loading="lazy" alt="user-segmentation-latency-3" src="https://cdnd.selectdb.com/assets/images/Tianyancha_11-3fe828cadbc9a5972a82bbbd2a0b473e.png" width="1280" height="667" class="img_ev3q"></p><p>Lastly, I would like to share with you something that interested us most when we first talked to the <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Apache Doris community</a>:</p><ul><li>Apache Doris supports data ingestion transactions so it can ensure data is written <strong>exactly once</strong>.</li><li>It is well-integrated with the data ecosystem and can smoothly interface with most data sources and data formats.</li><li>It allows us to implement elastic scaling of clusters using the command line interface.</li><li>It outperforms ClickHouse in <strong>join queries</strong>.</li></ul><p>Find Apache Doris developers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-1t3wfymur-0soNPATWQ~gbU8xutFOLog" target="_blank" rel="noopener noreferrer">Slack</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/AB-Testing-was-a-Handful-Until-we-Found-the-Replacement-for-Druid">A/B Testing was a handful, until we found the replacement for Druid</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Heyu Dou, Xinxin Wang</span></span><time datetime="2023-06-01T00:00:00.000Z" itemprop="datePublished" 
class="text-black ml-4">June 1, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Unlike normal reporting, A/B testing collects data of a different combination of dimensions every time. It is also a complicated kind of analysis of immense data. In our case, we have a real-time data volume of millions of OPS (Operations Per Second), with each operation involving around 20 data tags and over a dozen dimensions.</p><p>For effective A/B testing, as data engineers, we must ensure quick computation as well as high data integrity (which means no duplication and no data loss). I&#x27;m sure I&#x27;m not the only one to say this: it is hard!</p><p>Let me show you our long-term struggle with our previous Druid-based data platform.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="platform-architecture-10">Platform Architecture 1.0<a href="#platform-architecture-10" class="hash-link" aria-label="Direct link to Platform Architecture 1.0" title="Direct link to Platform Architecture 1.0"></a></h2><p><strong>Components</strong>: Apache Storm + Apache Druid + MySQL</p><p>This was our real-time datawarehouse, where Apache Storm was the real-time data processing engine and Apache Druid pre-aggregated the data. However, Druid did not support certain paging and join queries, so we wrote data from Druid to MySQL regularly, making MySQL the &quot;materialized view&quot; of Druid. But that was only a duct tape solution as it couldn&#x27;t support our ever enlarging real-time data size. 
So data timeliness was unattainable.</p><p><img loading="lazy" alt="Apache-Storm-Apache-Druid-MySQL" src="https://cdnd.selectdb.com/assets/images/360_1-8cb2f7a87f8ce60f9da14e0ec0ea7bb5.png" width="1709" height="960" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="platform-architecture-20">Platform Architecture 2.0<a href="#platform-architecture-20" class="hash-link" aria-label="Direct link to Platform Architecture 2.0" title="Direct link to Platform Architecture 2.0"></a></h2><p><strong>Components</strong>: Apache Flink + Apache Druid + TiDB</p><p>This time, we replaced Storm with Flink, and MySQL with TiDB. Flink was more powerful in terms of semantics and features, while TiDB, with its distributed capability, was more maintainable than MySQL. But architecture 2.0 was nowhere near our goal of end-to-end data consistency, either, because when processing huge data, enabling TiDB transactions largely slowed down data writing. Plus, Druid itself did not support standard SQL, so there were some learning costs and frictions in usage.</p><p><img loading="lazy" alt="Apache-Flink-Apache-Druid-TiDB" src="https://cdnd.selectdb.com/assets/images/360_2-d32b762837d3788bdc43f0370fbf8199.png" width="1592" height="1083" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="platform-architecture-30">Platform Architecture 3.0<a href="#platform-architecture-30" class="hash-link" aria-label="Direct link to Platform Architecture 3.0" title="Direct link to Platform Architecture 3.0"></a></h2><p><strong>Components</strong>: Apache Flink + <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">Apache Doris</a></p><p>We replaced Apache Druid with Apache Doris as the OLAP engine, which could also serve as a unified data serving gateway. So in Architecture 3.0, we only need to maintain one set of query logic. 
And we layered our real-time datawarehouse to increase reusability of real-time data.</p><p><img loading="lazy" alt="Apache-Flink-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/360_3-c04ebf18268d873153f0365681d2a5d0.png" width="1340" height="1101" class="img_ev3q"></p><p>Turns out the combination of Flink and Doris was the answer. We can exploit their features to realize quick computation and data consistency. Keep reading and see how we make it happen.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="quick-computation">Quick Computation<a href="#quick-computation" class="hash-link" aria-label="Direct link to Quick Computation" title="Direct link to Quick Computation"></a></h2><p>As one piece of operation data can be attached to 20 tags, in A/B testing, we compare two groups of data centering only one tag each time. At first, we thought about splitting one piece of operation data (with 20 tags) into 20 pieces of data of only one tag upon data ingestion, and then importing them into Doris for analysis, but that could cause a data explosion and thus huge pressure on our clusters. </p><p>Then we tried moving part of such workload to the computation engine. So we tried and &quot;exploded&quot; the data in Flink, but soon regretted it, because when we aggregated the data using the global hash windows in Flink jobs, the network and CPU usage also &quot;exploded&quot;.</p><p>Our third shot was to aggregate data locally in Flink right after we split it. As is shown below, we create a window in the memory of one operator for local aggregation; then we further aggregate it using the global hash windows. Since two operators chained together are in one thread, transferring data between operators consumes much less network resources. 
<strong>The two-step aggregation method, combined with the</strong> <strong><a href="https://doris.apache.org/docs/dev/data-table/data-model" target="_blank" rel="noopener noreferrer">Aggregate model</a></strong> <strong>of Apache Doris, can keep data explosion in a manageable range.</strong></p><p><img loading="lazy" alt="Apache-Flink-Apache-Doris-2" src="https://cdnd.selectdb.com/assets/images/360_4-b4cad8ba4f8625718a23e7297885c40d.png" width="1642" height="624" class="img_ev3q"></p><p>For convenience in A/B testing, we make the test tag ID the first sorted field in Apache Doris, so we can quickly locate the target data using sorted indexes. To further minimize data processing in queries, we create materialized views with the frequently used dimensions. With constant modification and updates, the materialized views are applicable in 80% of our queries.</p><p>To sum up, with the application of sorted index and materialized views, we reduce our query response time to merely seconds in A/B testing.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-integrity-guarantee">Data Integrity Guarantee<a href="#data-integrity-guarantee" class="hash-link" aria-label="Direct link to Data Integrity Guarantee" title="Direct link to Data Integrity Guarantee"></a></h2><p>Imagine that your algorithm designers worked sweat and tears trying to improve the business, only to find their solution unable to be validated by A/B testing due to data loss. This is an unbearable situation, and we make every effort to avoid it.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="develop-a-sink-to-doris-component">Develop a Sink-to-Doris Component<a href="#develop-a-sink-to-doris-component" class="hash-link" aria-label="Direct link to Develop a Sink-to-Doris Component" title="Direct link to Develop a Sink-to-Doris Component"></a></h3><p>To ensure end-to-end data integrity, we developed a Sink-to-Doris component. 
It is built on our own Flink Stream API scaffolding and realized by the idempotent writing of Apache Doris and the two-stage commit mechanism of Apache Flink. On top of it, we have a data protection mechanism against anomalies. </p><p>It is the result of our long-term evolution. We used to ensure data consistency by implementing &quot;one writing for one tag ID&quot;. Then we realized we could make good use of the transactions in Apache Doris and the two-stage commit of Apache Flink. </p><p><img loading="lazy" alt="idempotent-writing-two-stage-commit" src="https://cdnd.selectdb.com/assets/images/360_5-b5f8490ad14a1b485d4472b3db36e9d6.png" width="3380" height="3334" class="img_ev3q"></p><p>As is shown above, this is how two-stage commit works to guarantee data consistency:</p><ol><li>Write data into local files;</li><li>Stage One: pre-commit data to Apache Doris. Save the Doris transaction ID into status;</li><li>If checkpoint fails, manually abandon the transaction; if checkpoint succeeds, commit the transaction in Stage Two;</li><li>If the commit fails after multiple retries, the transaction ID and the relevant data will be saved in HDFS, and we can restore the data via Broker Load.</li></ol><p>We make it possible to split a single checkpoint into multiple transactions, so that we can prevent one Stream Load from taking more time than a Flink checkpoint in the event of large data volumes.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="application-display">Application Display<a href="#application-display" class="hash-link" aria-label="Direct link to Application Display" title="Direct link to Application Display"></a></h3><p>This is how we implement Sink-to-Doris. The component has blocked API calls and topology assembly. With simple configuration, we can write data into Apache Doris via Stream Load. 
</p><p><img loading="lazy" alt="Sink-to-Doris" src="https://cdnd.selectdb.com/assets/images/360_6-9d94599760bc55e52be086ec6d44cc69.png" width="3289" height="1077" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="cluster-monitoring">Cluster Monitoring<a href="#cluster-monitoring" class="hash-link" aria-label="Direct link to Cluster Monitoring" title="Direct link to Cluster Monitoring"></a></h3><p>For cluster and host monitoring, we adopted the metrics templates provided by the Apache Doris community. For data monitoring, in addition to the template metrics, we added Stream Load request numbers and loading rates.</p><p><img loading="lazy" alt="stream-load-cluster-monitoring" src="https://cdnd.selectdb.com/assets/images/360_7-a8f9f0c95e96e136b287be46bdbc4add.png" width="2001" height="832" class="img_ev3q"></p><p>Other metrics of our concerns include data writing speed and task processing time. In the case of anomalies, we will receive notifications in the form of phone calls, messages, and emails.</p><p><img loading="lazy" alt="cluster-monitoring" src="https://cdnd.selectdb.com/assets/images/360_8-e02d4bf0c8cfab543e5693216fee6357.png" width="1280" height="888" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="key-takeaways">Key Takeaways<a href="#key-takeaways" class="hash-link" aria-label="Direct link to Key Takeaways" title="Direct link to Key Takeaways"></a></h2><p>The recipe for successful A/B testing is quick computation and high data integrity. For this purpose, we implement a two-step aggregation method in Apache Flink, utilize the Aggregate model, materialized view, and short indexes of Apache Doris. 
Then we develop a Sink-to-Doris component, which is realized by the idempotent writing of Apache Doris and the two-stage commit mechanism of Apache Flink.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Building-a-Data-Warehouse-for-Traditional-Industry">Building a data warehouse for traditional industry</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Herman Seah</span></span><time datetime="2023-05-12T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">May 12, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>By Herman Seah, Data Warehouse Planner &amp; Data Analyst at Midland Realty</p><p>This is a part of the digital transformation of a real estate giant. 
For the sake of confidentiality, I&#x27;m not going to reveal any business data, but you&#x27;ll get a detailed view of our data warehouse and our optimization strategies.</p><p>Now let&#x27;s get started.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="architecture">Architecture<a href="#architecture" class="hash-link" aria-label="Direct link to Architecture" title="Direct link to Architecture"></a></h2><p>Logically, our data architecture can be divided into four parts.</p><p><img loading="lazy" alt="data-processing-architecture" src="https://cdnd.selectdb.com/assets/images/Midland_1-13321d195f728638c4903bdd51e60ef0.png" width="1280" height="616" class="img_ev3q"></p><ul><li><strong>Data integration</strong>: This is supported by Flink CDC, DataX, and the Multi-Catalog feature of Apache Doris.</li><li><strong>Data management</strong>: We use Apache Dolphinscheduler for script lifecycle management, privileges in multi-tenancy management, and data quality monitoring.</li><li><strong>Alerting</strong>: We use Grafana, Prometheus, and Loki to monitor component resources and logs.</li><li><strong>Data services</strong>: This is where BI tools step in for user interaction, such as data queries and analysis.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-tables">1. <strong>Tables</strong><a href="#1-tables" class="hash-link" aria-label="Direct link to 1-tables" title="Direct link to 1-tables"></a></h3><p>We create our dimension tables and fact tables centering each operating entity in business, including customers, houses, etc. If there are a series of activities involving the same operating entity, they should be recorded by one field. (This is a lesson learned from our previous chaotic data management system.)</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-layers">2. 
<strong>Layers</strong><a href="#2-layers" class="hash-link" aria-label="Direct link to 2-layers" title="Direct link to 2-layers"></a></h3><p>Our data warehouse is divided into five conceptual layers. We use Apache Doris and Apache DolphinScheduler to schedule the DAG scripts between these layers.</p><p><img loading="lazy" alt="ODS-DWD-DWS-ADS-DIM" src="https://cdnd.selectdb.com/assets/images/Midland_2-4d94af927a13961e91486cef3512b47f.png" width="1280" height="729" class="img_ev3q"></p><p>Every day, the layers go through an overall update besides incremental updates in case of changes in historical status fields or incomplete data synchronization of ODS tables.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3-incremental-update-strategies">3. <strong>Incremental Update Strategies</strong><a href="#3-incremental-update-strategies" class="hash-link" aria-label="Direct link to 3-incremental-update-strategies" title="Direct link to 3-incremental-update-strategies"></a></h3><p>(1) Set <code>where &gt;= &quot;activity time -1 day or -1 hour&quot;</code> instead of <code>where &gt;= &quot;activity time</code></p><p>The reason for doing so is to prevent data drift caused by the time gap of scheduling scripts. Let&#x27;s say, with the execution interval set to 10 min, suppose that the script is executed at 23:58:00 and a new piece of data arrives at 23:59:00, if we set <code>where &gt;= &quot;activity time</code>, that piece of data of the day will be missed.</p><p>(2) Fetch the ID of the largest primary key of the table before every script execution, store the ID in the auxiliary table, and set <code>where &gt;= &quot;ID in auxiliary table&quot;</code></p><p>This is to avoid data duplication. Data duplication might happen if you use the Unique Key model of Apache Doris and designate a set of primary keys, because if there are any changes in the primary keys in the source table, the changes will be recorded and the relevant data will be loaded. 
This method can fix that, but it is only applicable when the source tables have auto-increment primary keys.</p><p>(3) Partition the tables</p><p>As for time-based auto-increment data such as log tables, there might be less changes in historical data and status, but the data volume is large, so there could be huge computing pressure on overall updates and snapshot creation. Hence, it is better to partition such tables, so for each incremental update, we only need to replace one partition. (You might need to watch out for data drift, too.)</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="4-overall-update-strategies">4. <strong>Overall Update Strategies</strong><a href="#4-overall-update-strategies" class="hash-link" aria-label="Direct link to 4-overall-update-strategies" title="Direct link to 4-overall-update-strategies"></a></h3><p>(1) Truncate Table</p><p>Clear out the table and then ingest all data from the source table into it. This is applicable for small tables and scenarios with no user activity in wee hours.</p><p>(2) <code>ALTER TABLE tbl1 REPLACE WITH TABLE tbl2 </code></p><p>This is an atomic operation and it is advisable for large tables. Every time before executing a script, we create a temporary table with the same schema, load all data into it, and replace the original table with it.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="application">Application<a href="#application" class="hash-link" aria-label="Direct link to Application" title="Direct link to Application"></a></h2><ul><li><strong>ETL job</strong>: every minute</li><li><strong>Configuration for first-time deployment</strong>: 8 nodes, 2 frontends, 8 backends, hybrid deployment</li><li><strong>Node configuration</strong>: 32C <em> 60GB </em> 2TB SSD</li></ul><p>This is our configuration for TBs of legacy data and GBs of incremental data. You can use it as a reference and scale your cluster on this basis. Deployment of Apache Doris is simple. 
You don&#x27;t need other components.</p><ol><li>To integrate offline data and log data, we use DataX, which supports CSV format and readers of many relational databases, and Apache Doris provides a DataX-Doris-Writer.</li></ol><p><img loading="lazy" alt="DataX-Doris-Writer" src="https://cdnd.selectdb.com/assets/images/Midland_3-d394cef81ce173d944a379f14824f5e6.png" width="992" height="636" class="img_ev3q"></p><ol start="2"><li>We use Flink CDC to synchronize data from source tables. Then we aggregate the real-time metrics utilizing the Materialized View or the Aggregate Model of Apache Doris. Since we only have to process part of the metrics in a real-time manner and we don&#x27;t want to generate too many database connections, we use one Flink job to maintain multiple CDC source tables. This is realized by the multi-source merging and full database sync features of Dinky, or you can implement a Flink DataStream multi-source merging task yourself. It is noteworthy that Flink CDC and Apache Doris support Schema Change.</li></ol><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">EXECUTE CDCSOURCE demo_doris WITH (</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;connector&#x27; = &#x27;mysql-cdc&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;hostname&#x27; = &#x27;127.0.0.1&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;port&#x27; = &#x27;3306&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;username&#x27; = &#x27;root&#x27;,</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;password&#x27; = &#x27;123456&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;checkpoint&#x27; = &#x27;10000&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;scan.startup.mode&#x27; = &#x27;initial&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;parallelism&#x27; = &#x27;1&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;table-name&#x27; = &#x27;ods.ods_*,ods.ods_*&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.connector&#x27; = &#x27;doris&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.fenodes&#x27; = &#x27;127.0.0.1:8030&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.username&#x27; = &#x27;root&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.password&#x27; = &#x27;123456&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.doris.batch.size&#x27; = &#x27;1000&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.max-retries&#x27; = &#x27;1&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.batch.interval&#x27; = &#x27;60000&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.db&#x27; = &#x27;test&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.properties.format&#x27; =&#x27;json&#x27;,</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.properties.read_json_by_line&#x27; =&#x27;true&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.table.identifier&#x27; = &#x27;${schemaName}.${tableName}&#x27;,</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> &#x27;sink.sink.label-prefix&#x27; = &#x27;${schemaName}_${tableName}_1&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">);</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ol start="3"><li>We use SQL scripts or &quot;Shell + SQL&quot; scripts, and we perform script lifecycle management. At the ODS layer, we write a general DataX job file and pass parameters for each source table ingestion, instead of writing a DataX job for each source table. In this way, we make things much easier to maintain. We manage the ETL scripts of Apache Doris on DolphinScheduler, where we also conduct version control. In case of any errors in the production environment, we can always rollback.</li></ol><p><img loading="lazy" alt="SQL-script" src="https://cdnd.selectdb.com/assets/images/Midland_4-f50219b88be08e1bdf3a7b31c21ae258.png" width="1280" height="625" class="img_ev3q"></p><ol start="4"><li>After ingesting data with ETL scripts, we create a page in our reporting tool. 
We assign different privileges to different accounts using SQL, including the privilege of modifying rows, fields, and global dictionary. Apache Doris supports privilege control over accounts, which works the same as that in MySQL. </li></ol><p><img loading="lazy" alt="privilege-control-over-accounts" src="https://cdnd.selectdb.com/assets/images/Midland_5-7b83ea92344d586f4de8cd363b7c6357.png" width="1280" height="516" class="img_ev3q"></p><p>We also use Apache Doris data backup for disaster recovery, Apache Doris audit logs to monitor SQL execution efficiency, Grafana+Loki for cluster metric alerts, and Supervisor to monitor the daemon processes of node components.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="optimization">Optimization<a href="#optimization" class="hash-link" aria-label="Direct link to Optimization" title="Direct link to Optimization"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-data-ingestion">1. Data Ingestion<a href="#1-data-ingestion" class="hash-link" aria-label="Direct link to 1. Data Ingestion" title="Direct link to 1. Data Ingestion"></a></h3><p>We use DataX to Stream Load offline data. It allows us to adjust the size of each batch. The Stream Load method returns results synchronously, which meets the needs of our architecture. If we execute asynchronous data import using DolphinScheduler, the system might assume that the script has been executed, and that can cause a messup. If you use a different method, we recommend that you execute <code>show load</code> in the shell script, and check the regex filtering status to see if the ingestion succeeds.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-data-model">2. Data Model<a href="#2-data-model" class="hash-link" aria-label="Direct link to 2. Data Model" title="Direct link to 2. Data Model"></a></h3><p>We adopt the Unique Key model of Apache Doris for most of our tables. 
The Unique Key model ensures idempotence of data scripts and effectively avoids upstream data duplication. </p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3-reading-external-data">3. Reading External Data<a href="#3-reading-external-data" class="hash-link" aria-label="Direct link to 3. Reading External Data" title="Direct link to 3. Reading External Data"></a></h3><p>We use the Multi-Catalog feature of Apache Doris to connect to external data sources. It allows us to create mappings of external data at the Catalog level.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="4-query-optimization">4. Query Optimization<a href="#4-query-optimization" class="hash-link" aria-label="Direct link to 4. Query Optimization" title="Direct link to 4. Query Optimization"></a></h3><p>We suggest that you put the most frequently used fields of non-character types (such as int and where clauses) in the first 36 bytes, so you can filter these fields within milliseconds in point queries.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="5-data-dictionary">5. Data Dictionary<a href="#5-data-dictionary" class="hash-link" aria-label="Direct link to 5. Data Dictionary" title="Direct link to 5. Data Dictionary"></a></h3><p>For us, it is important to create a data dictionary because it largely reduces personnel communication costs, which can be a headache when you have a big team. We use the <code>information_schema</code> in Apache Doris to generate a data dictionary. 
With it, we can quickly grasp the whole picture of the tables and fields and thus increase development efficiency.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="performance">Performance<a href="#performance" class="hash-link" aria-label="Direct link to Performance" title="Direct link to Performance"></a></h2><p><strong>Offline data ingestion time</strong>: Within minutes</p><p><strong>Query latency</strong>: For tables containing over 100 million rows, Apache Doris responds to ad-hoc queries within one second, and complicated queries in five seconds.</p><p><strong>Resource consumption</strong>: It only takes up a small number of servers to build this data warehouse. The 70% compression ratio of Apache Doris saves us lots of storage resources.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="experience-and-conclusion"><strong>Experience and Conclusion</strong><a href="#experience-and-conclusion" class="hash-link" aria-label="Direct link to experience-and-conclusion" title="Direct link to experience-and-conclusion"></a></h2><p>Actually, before we evolved into our current data architecture, we tried Hive, Spark and Hadoop to build an offline data warehouse. It turned out that Hadoop was overkill for a traditional company like us since we didn&#x27;t have too much data to process. It is important to find the component that suits you most.</p><p><img loading="lazy" alt="old-offline-data warehouse" src="https://cdnd.selectdb.com/assets/images/Midland_6-52e4498a6ab21c3075077b71435e2d28.png" width="832" height="703" class="img_ev3q"></p><p>(Our old off-line data warehouse)</p><p>On the other hand, to smoothen our big data transition, we need to make our data platform as simple as possible in terms of usage and maintenance. That&#x27;s why we landed on Apache Doris. It is compatible with MySQL protocol and provides a rich collection of functions so we don&#x27;t have to develop our own UDFs. 
Also, it is composed of only two types of processes: frontends and backends, so it is easy to scale and track.</p><p>Find Apache Doris developers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a>.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Zipping-up-the-Lambda-Architecture-for-40-Percent-Faster-Performance">Zipping up the lambda architecture for 40% faster performance</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Tongyang Han</span></span><time datetime="2023-05-05T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">May 5, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Author: Tongyang Han, Senior Data Engineer at Douyu</p><p>The Lambda architecture has been common practice in big data processing. The concept is to separate stream (real time data) and batch (offline data) processing, and that&#x27;s exactly what we did. 
These two types of data were processed in two isolated pipelines before being pooled together and made ready for searches and queries.
It is not a revolution of Lambda, but a minor change in the choice of components, which made our real-time data processing 40% faster.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="zipping-up-lambda-architecture"><strong>Zipping up Lambda Architecture</strong><a href="#zipping-up-lambda-architecture" class="hash-link" aria-label="Direct link to zipping-up-lambda-architecture" title="Direct link to zipping-up-lambda-architecture"></a></h2><p>I am going to elaborate on how this is done using our data tagging process as an example.</p><p>Previously, our offline tags were produced by the data warehouse, put into a flat table, and then written in <strong>HBase</strong>, while real-time tags were produced by <strong>Flink</strong>, and put into <strong>HBase</strong> directly. Then <strong>Spark</strong> would work as the computing engine.</p><p><img loading="lazy" alt="HBase-Redis-Spark" src="https://cdnd.selectdb.com/assets/images/Douyu_2-9cd11673aa896382f99ca957435efd84.png" width="1280" height="602" class="img_ev3q"></p><p>The problem with this stemmed from the low computation efficiency of <strong>Flink</strong> and <strong>Spark</strong>. </p><ul><li><strong>Real-time tag production</strong>: When computing real-time tags that involve data within a long time range, Flink did not deliver stable performance and consumed more resources than expected. 
And when a task failed, it would take a really long time for checkpoint recovery.</li><li><strong>Tag query</strong>: As a tag query engine, Spark could be slow.</li></ul><p>As a solution, we replaced <strong>HBase</strong> and <strong>Spark</strong> with <strong>Apache Doris</strong>, a real-time analytic database, and moved part of the computational logic of the foregoing wide-time-range real-time tags from <strong>Flink</strong> to <strong>Apache Doris</strong>.</p><p><img loading="lazy" alt="Apache-Doris-Redis" src="https://cdnd.selectdb.com/assets/images/Douyu_3-684e3028f23e722b9892e0afdf472e4b.png" width="1280" height="577" class="img_ev3q"></p><p>Instead of putting our flat tables in HBase, we place them in Apache Doris. These tables are divided into partitions based on time sensitivity. Offline tags will be updated daily while real-time tags will be updated in real time. We organize these tables in the Aggregate Model of Apache Doris, which allows partial update of data.</p><p>Instead of using Spark for queries, we parse the query rules into SQL for execution in Apache Doris. 
For pattern matching, we use Redis to cache the hot data from Apache Doris, so the system can respond to such queries much faster.</p><p><img loading="lazy" alt="Real-time-and-offline-data-processing-in-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/Douyu_4-afd928fc30baf4ec825e80ab3638e984.png" width="1280" height="486" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="computational-pipeline-of-wide-time-range-real-time-tags"><strong>Computational Pipeline of Wide-Time-Range Real-Time Tags</strong><a href="#computational-pipeline-of-wide-time-range-real-time-tags" class="hash-link" aria-label="Direct link to computational-pipeline-of-wide-time-range-real-time-tags" title="Direct link to computational-pipeline-of-wide-time-range-real-time-tags"></a></h2><p>In some cases, the computation of wide-time-range real-time tags entails the aggregation of historical (offline) data with real-time data. The following figure shows our old computational pipeline for these tags. </p><p><img loading="lazy" alt="offline-data-processing-link" src="https://cdnd.selectdb.com/assets/images/Douyu_5-104e16d5c9830069f513dc4c25665bcf.png" width="1280" height="695" class="img_ev3q"></p><p>As you can see, it required multiple tasks to finish computing one real-time tag. Also, in complicated aggregations that involve a collection of aggregation operations, any improper resource allocation could lead to back pressure or waste of resources. This adds to the difficulty of task scheduling. 
The maintenance and stability guarantee of such a long pipeline could be an issue, too.</p><p>To improve on that, we decided to move such aggregation workload to Apache Doris.</p><p><img loading="lazy" alt="real-time-data-processing-link" src="https://cdnd.selectdb.com/assets/images/Douyu_6-4243729274c033573acca9a2c621bf45.png" width="1280" height="717" class="img_ev3q"></p><p>We have around 400 million customer tags in our system, and each customer is attached with over 300 tags. We divide customers into more than 10,000 groups, and we have to update 5000 of them on a daily basis. The above improvement has sped up the computation of our wide-time-range real-time queries by <strong>40%</strong>.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="overwrite">Overwrite<a href="#overwrite" class="hash-link" aria-label="Direct link to Overwrite" title="Direct link to Overwrite"></a></h2><p>To atomically replace data tables and partitions in Apache Doris, we customized the <a href="https://github.com/apache/doris-spark-connector" target="_blank" rel="noopener noreferrer">Doris-Spark-Connector</a>, and added an &quot;Overwrite&quot; mode to the Connector.</p><p>When a Spark job is submitted, Apache Doris will call an interface to fetch information of the data tables and partitions.</p><ul><li>If it is a non-partitioned table, we create a temporary table for the target table, ingest data into it, and then perform atomic replacement. If the data ingestion fails, we clear the temporary table;</li><li>If it is a dynamic partitioned table, we create a temporary partition for the target partition, ingest data into it, and then perform atomic replacement. If the data ingestion fails, we clear the temporary partition;</li><li>If it is a non-dynamic partitioned table, we need to extend the Doris-Spark-Connector parameter configuration first. 
Then we create a temporary partition and take steps as above.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2><p>One prominent advantage of Lambda architecture is the stability it provides. However, in our practice, the processing of real-time data and offline data sometimes intertwines. For example, the computation of certain real-time tags requires historical (offline) data. Such interaction becomes a root cause of instability. Thus, instead of pooling real-time and offline data after they are fully ready for queries, we use an OLAP engine to share part of the pre-query computation burden and make things faster, simpler, and more cost-effective.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Step-by-step-Guide-to-Building-a-High-Performing-Risk-Data-Mart">Step-by-step guide to building a high-performing risk data mart</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Jacob Chow</span></span><time datetime="2023-04-20T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">April 20, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p>Pursuing data-driven management at a consumer financing company, we aim to serve four needs in our data platform development: monitoring and alerting, query and analysis, dashboarding, and data modeling. 
For these purposes, we built our data processing architecture based on Greenplum and CDH. The most essential part of it is the risk data mart. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="risk-data-mart--apache-hive">Risk Data Mart: Apache Hive<a href="#risk-data-mart--apache-hive" class="hash-link" aria-label="Direct link to Risk Data Mart: Apache Hive" title="Direct link to Risk Data Mart: Apache Hive"></a></h2><p>I will walk you through how the risk data mart works following the data flow: </p><ol><li>Our <strong>business data</strong> is imported into <strong>Greenplum</strong> for real-time analysis to generate BI reports. Part of this data also goes into Apache Hive for queries and modeling analysis. </li><li>Our <strong>risk control variables</strong> are updated into <strong>Elasticsearch</strong> in real time via message queues, while Elasticsearch ingests data into Hive for analysis, too.</li><li>The <strong>risk management decision data</strong> is passed from <strong>MongoDB</strong> to Hive for risk control analysis and modeling.</li></ol><p>So these are the three data sources of our risk data mart.</p><p><img loading="lazy" alt="risk-data-mart" src="https://cdnd.selectdb.com/assets/images/RDM_1-7e8b0a7061d967673ece1d403f03edd3.png" width="826" height="486" class="img_ev3q"></p><p>This whole architecture is built with CDH 6.0. The workflows in it can be divided into real-time data streaming and offline risk analysis.</p><ul><li><strong>Real-time data streaming</strong>: Real-time data from Apache Kafka will be cleaned by Apache Flink, and then written into Elasticsearch. Elasticsearch will aggregate part of the data it receives and send it for reference in risk management. </li><li><strong>Offline risk analysis</strong>: Based on the CDH solution and utilizing Sqoop, we ingest data from Greenplum in an offline manner. Then we put this data together with the third-party data from MongoDB. 
Then, after data cleaning, we pour all this data into Hive for daily batch processing and data queries.</li></ul><p>To give a brief overview, these are the components that support the four features of our data processing platform:</p><p><img loading="lazy" alt="features-of-a-data-processing-platform" src="https://cdnd.selectdb.com/assets/images/RDM_2-1880ff586d295ecd43f0731f01124965.png" width="1002" height="606" class="img_ev3q"></p><p>As you see, Apache Hive is central to this architecture. But in practice, it takes minutes for Apache Hive to execute analysis, so our next step is to increase query speed.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="what-are-slowing-down-our-queries">What are Slowing Down Our Queries?<a href="#what-are-slowing-down-our-queries" class="hash-link" aria-label="Direct link to What are Slowing Down Our Queries?" title="Direct link to What are Slowing Down Our Queries?"></a></h3><ol><li><strong>Huge data volume in external tables</strong></li></ol><p>Our Hive-based data mart is now carrying more than 300 terabytes of data. That&#x27;s about 20,000 tables and 5 million fields. To put them all in external tables is maintenance-intensive. Plus, data ingestion can be a big headache.</p><ol><li><strong>Big flat tables</strong></li></ol><p>Due to the complexity of the rule engine in risk management, our company invests a lot in the derivation of variables. In some dimensions, we have thousands of variables or even more. As a result, a few of the frequently used flat tables in Hive have over 3000 fields. So you can imagine how time consuming these queries can be.</p><ol><li><strong>Unstable interface</strong></li></ol><p>Results produced by daily offline batch processing will be regularly sent to our Elasticsearch clusters. (The data volume in these updates is huge, and the call of interface can get expired.) This process might cause high I/O and introduce garbage collection jitter, and further leads to unstable interface services. 
</p><p>In addition, since our risk control analysts and modeling engineers are using Hive with Spark, the expanding data architecture is also dragging down query performance.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="a-unified-query-gateway">A Unified Query Gateway<a href="#a-unified-query-gateway" class="hash-link" aria-label="Direct link to A Unified Query Gateway" title="Direct link to A Unified Query Gateway"></a></h2><p>We wanted a unified gateway to manage our heterogenous data sources. That&#x27;s why we introduced Apache Doris.</p><p><img loading="lazy" alt="unified-query-gateway" src="https://cdnd.selectdb.com/assets/images/RDM_3-89141f14a59c83d413d14f31fcf386f4.png" width="1716" height="1094" class="img_ev3q"></p><p>But doesn&#x27;t that make things even more complicated? Actually, no.</p><p>We can connect various data sources to Apache Doris and simply conduct queries on it. This is made possible by the <strong>Multi-Catalog</strong> feature of Apache Doris: It can interface with various data sources, including datalakes like Apache Hive, Apache Iceberg, and Apache Hudi, and databases like MySQL, Elasticsearch, and Greenplum. That happens to cover our toolkit. </p><p>We create Elasticsearch Catalog and Hive Catalog in Apache Doris. These catalogs map to the external data in Elasticsearch and Hive, so we can conduct federated queries across these data sources using Apache Doris as a unified gateway. Also, we use the <a href="https://github.com/apache/doris-spark-connector" target="_blank" rel="noopener noreferrer">Spark-Doris-Connector</a> to allow data communication between Spark and Doris. So basically, we replace Apache Hive with Apache Doris as the central hub of our data architecture. 
</p><p><img loading="lazy" alt="Apache-Doris-as-center-of-data-architecture" src="https://cdnd.selectdb.com/assets/images/RDM_4-e6af4e754989aed3aef02a357e7607ad.png" width="1002" height="608" class="img_ev3q"></p><p>How does that affect our data processing efficiency?</p><ul><li><strong>Monitoring &amp; Alerting</strong>: This is about real-time data querying. We access our real-time data in Elasticsearch clusters using Elasticsearch Catalog in Apache Doris. Then we perform queries directly in Apache Doris. It is able to return results within seconds, as opposed to the minute-level response time when we used Hive.</li><li><strong>Query &amp; Analysis</strong>: As I said, we have 20,000 tables in Hive so it wouldn&#x27;t make sense to map all of them to external tables in Hive. That would mean a hell of maintenance. Instead, we utilize the Multi Catalog feature of Apache Doris 1.2. It enables data mapping at the catalog level, so we can simply create one Hive Catalog in Doris before we can conduct queries. This separates query operations from the daily batching processing workload in Hive, so there will be less resource conflict.</li><li><strong>Dashboarding</strong>: We use Tableau and Doris to provide dashboard services. This reduces the query response time to seconds and milliseconds, compared with the several minutes back in the &quot;Tableau + Hive&quot; days.</li><li><strong>Modeling</strong>: We use Spark and Doris for aggregation modeling. 
The Spark-Doris-Connector allows mutual synchronization of data, so data from Doris can also be used in modeling for more accurate analysis.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="cluster-monitoring-in-production-environment"><strong>Cluster Monitoring in Production Environment</strong><a href="#cluster-monitoring-in-production-environment" class="hash-link" aria-label="Direct link to cluster-monitoring-in-production-environment" title="Direct link to cluster-monitoring-in-production-environment"></a></h3><p>We tested this new architecture in our production environment. We built two clusters.</p><p><strong>Configuration</strong>:</p><p>Production cluster: 4 frontends + 8 backends, m5d.16xlarge</p><p>Backup cluster: 4 frontends + 4 backends, m5d.16xlarge</p><p>This is the monitoring board: </p><p><img loading="lazy" alt="cluster-monitoring-board" src="https://cdnd.selectdb.com/assets/images/RDM_5-8a88d55e3ac69ac6be859a9c367b0c76.png" width="1280" height="523" class="img_ev3q"></p><p>As is shown, the queries are fast. We expected that it would take at least 10 nodes but in real cases, we mainly conduct queries via Catalogs, so we can handle this with a relatively small cluster size. The compatibility is good, too. 
It doesn&#x27;t rock the rest of our existing system.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="guide-to-faster-data-integration">Guide to Faster Data Integration<a href="#guide-to-faster-data-integration" class="hash-link" aria-label="Direct link to Guide to Faster Data Integration" title="Direct link to Guide to Faster Data Integration"></a></h2><p>To accelerate the regular data ingestion from Hive to Apache Doris 1.2.2, we have a solution that goes as follows:</p><p><img loading="lazy" alt="faster-data-integration" src="https://cdnd.selectdb.com/assets/images/RDM_6-946a2cf22287a5c16c7fc03d2a3e2c18.png" width="1280" height="681" class="img_ev3q"></p><p><strong>Main components:</strong></p><ul><li>DolphinScheduler 3.1.4</li><li>SeaTunnel 2.1.3</li></ul><p>With our current hardware configuration, we use the Shell script mode of DolphinScheduler and call the SeaTunnel script on a regular basis. This is the configuration file of the data synchronization tasks:</p><div class="language-undefined codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-undefined codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain"> env{</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark.app.name = &quot;hive2doris-template&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark.executor.instances = 10</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark.executor.cores = 5</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark.executor.memory = &quot;20g&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" 
style="color:#F8F8F2"><span class="token plain">spark {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> spark.sql.catalogImplementation = &quot;hive&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">source {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> hive {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> pre_sql = &quot;select * from ods.demo_tbl where dt=&#x27;2023-03-09&#x27;&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> result_table_name = &quot;ods_demo_tbl&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">transform {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">sink {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> doris {</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> fenodes = &quot;192.168.0.10:8030,192.168.0.11:8030,192.168.0.12:8030,192.168.0.13:8030&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> user = root</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> password = &quot;XXX&quot;</span><br></span><span 
class="token-line" style="color:#F8F8F2"><span class="token plain"> database = ods</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> table = ods_demo_tbl</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> batch_size = 500000</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> max_retries = 1</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> interval = 10000</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> doris.column_separator = &quot;\t&quot;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> }</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">}</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>This solution consumes less resources and memory but brings higher performance in queries and data ingestion.</p><ol><li><strong>Less storage costs</strong></li></ol><p><strong>Before</strong>: The original table in Hive had 500 fields. It was divided into partitions by day, with 150 million pieces of data per partition. It takes <strong>810G</strong> to store in HDFS.</p><p><strong>After</strong>: For data synchronization, we call Spark on YARN using SeaTunnel. 
It can be finished within 40 minutes, and the ingested data only takes up <strong>270G</strong> of storage space.</p><ol><li><strong>Less memory usage &amp; higher performance in queries</strong></li></ol><p><strong>Before</strong>: For a GROUP BY query on the foregoing table in Hive, it occupied 720 Cores and 1.44T in YARN, and took a response time of <strong>162 seconds</strong>. </p><p><strong>After</strong>: We perform an aggregate query using Hive Catalog in Doris, <code>set exec_mem_limit=16G</code>, and receive the result after <strong>58.531 seconds</strong>. We also try and put the table in Doris and conduct the same query in Doris itself, which only takes <strong>0.828 seconds</strong>.</p><p>The corresponding statements are as follows:</p><ul><li>Query in Hive, response time: 162 seconds</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">select count(*),product_no FROM ods.demo_tbl where dt=&#x27;2023-03-09&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">group by product_no;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li>Query in Doris 
using Hive Catalog, response time: 58.531 seconds</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">set exec_mem_limit=16G;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">select count(*),product_no FROM hive.ods.demo_tbl where dt=&#x27;2023-03-09&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">group by product_no;</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li>Query in Doris directly, response time: 0.828 seconds</li></ul><div class="language-SQL codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-SQL codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">select count(*),product_no FROM ods.demo_tbl where dt=&#x27;2023-03-09&#x27;</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain">group by product_no;</span><br></span></code></pre><div 
class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ol><li><strong>Faster data ingestion</strong></li></ol><p><strong>Before</strong>: The original table in Hive had 40 fields. It was divided into partitions by day, with 1.1 billion pieces of data per partition. It takes <strong>806G</strong> to store in HDFS.</p><p><strong>After</strong>: For data synchronization, we call Spark on YARN using SeaTunnel. It can be finished within 11 minutes (100 million pieces per minute ), and the ingested data only takes up <strong>378G</strong> of storage space.</p><p><img loading="lazy" alt="faster-data-ingestion" src="https://cdnd.selectdb.com/assets/images/RDM_7-aabcb97d311b9da69a1d8722339b633a.png" width="1280" height="463" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="summary">Summary<a href="#summary" class="hash-link" aria-label="Direct link to Summary" title="Direct link to Summary"></a></h2><p>The key step to building a high-performing risk data mart is to leverage the Multi Catalog feature of Apache Doris to unify the heterogenous data sources. This not only increases our query speed but also solves a lot of the problems coming with our previous data architecture.</p><ol><li>Deploying Apache Doris allows us to decouple daily batch processing workloads with ad-hoc queries, so they don&#x27;t have to compete for resources. 
This reduces the query response time from minutes to seconds.</li><li>We used to build our data ingestion interface based on Elasticsearch clusters, which could lead to garbage collection jitter when transferring large batches of offline data. When we stored the interface service dataset on Doris, no jitter was found during data writing and we were able to transfer 10 million rows within 10 minutes.</li><li>Apache Doris has been optimizing itself in many scenarios including flat tables. As far as we know, compared with ClickHouse, Apache Doris 1.2 is twice as fast in SSB-Flat-table benchmark and dozens of times faster in TPC-H benchmark.</li><li>In terms of cluster scaling and updating, we used to suffer from a big window of restoration time after configuration revision. But Doris supports hot swap and easy scaling out, so we can reboot nodes within a few seconds and minimize interruption to users caused by cluster scaling.</li></ol><p>(One last piece of advice for you: If you encounter any problems with deploying Apache Doris, don&#x27;t hesitate to contact the Doris community for help, they and a bunch of SelectDB engineers will be more than happy to make your adaption journey quick and easy.)</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Tencent-Data-Engineers-Why-We-Went-from-ClickHouse-to-Apache-Doris">Tencent data engineer: why we went from ClickHouse to Apache Doris?</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author 
text-black">Jun Zhang &amp; Kai Dai</span></span><time datetime="2023-03-07T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">March 7, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p><img loading="lazy" alt="Tencent-use-case-of-Apache-Doris" src="https://cdnd.selectdb.com/assets/images/TME-7ebdc46ff19cf90eaf92e280c1b1f0e4.png" width="900" height="383" class="img_ev3q"></p><p>This article is co-written by me and my colleague Kai Dai. We are both data platform engineers at <a href="https://www.tencentmusic.com/en-us/" target="_blank" rel="noopener noreferrer">Tencent Music</a> (NYSE: TME), a music streaming service provider with a whopping 800 million monthly active users. To drop the number here is not to brag but to give a hint of the sea of data that my poor coworkers and I have to deal with every day.</p><h1>What Do We Use ClickHouse For?</h1><p>The music library of Tencent Music contains data of all forms and types: recorded music, live music, audios, videos, etc. 
As data platform engineers, our job is to distill information from the data, based on which our teammates can make better decisions to support our users and musical partners.</p><p>Specifically, we do all-round analysis of the songs, lyrics, melodies, albums, and artists, turn all this information into data assets, and pass them to our internal data users for inventory counting, user profiling, metrics analysis, and group targeting.</p><p><img loading="lazy" alt="data-pipeline" src="https://cdnd.selectdb.com/assets/images/TME_1-73b51a1362dc4f6f1cadbee5d51aaa05.png" width="1280" height="693" class="img_ev3q"></p><p>We stored and processed most of our data in Tencent Data Warehouse (TDW), an offline data platform where we put the data into various tag and metric systems and then created flat tables centering each object (songs, artists, etc.).</p><p>Then we imported the flat tables into ClickHouse for analysis and Elasticsearch for data searching and group targeting.</p><p>After that, our data analysts used the data under the tags and metrics they needed to form datasets for different usage scenarios, during which they could create their own tags and metrics.</p><p>The data processing pipeline looked like this:</p><p><img loading="lazy" alt="data-warehouse-architecture-1.0" src="https://cdnd.selectdb.com/assets/images/TME_2-edb671e5b547ca431f4eaa61b59fd2fb.png" width="1280" height="743" class="img_ev3q"></p><h1>The Problems with ClickHouse</h1><p>When working with the above pipeline, we encountered a few difficulties:</p><ol><li><strong>Partial Update</strong>: Partial update of columns was not supported. Therefore, any latency from any one of the data sources could delay the creation of flat tables, and thus undermine data timeliness.</li><li><strong>High storage cost</strong>: Data under different tags and metrics was updated at different frequencies. 
As much as ClickHouse excelled in dealing with flat tables, it was a huge waste of storage resources to just pour all data into a flat table and partition it by day, not to mention the maintenance cost coming with it.</li><li><strong>High maintenance cost</strong>: Architecturally speaking, ClickHouse was characterized by the strong coupling of storage nodes and compute nodes. Its components were heavily interdependent, adding to the risks of cluster instability. Plus, for federated queries across ClickHouse and Elasticsearch, we had to take care of a huge amount of connection issues. That was just tedious.</li></ol><h1>Transition to Apache Doris</h1><p><a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">Apache Doris</a>, a real-time analytical database, boasts a few features that are exactly what we needed in solving our problems:</p><ol><li><strong>Partial update</strong>: Doris supports a wide variety of data models, among which the Aggregate Model supports real-time partial update of columns. Building on this, we can directly ingest raw data into Doris and create flat tables there. The ingestion goes like this: Firstly, we use Spark to load data into Kafka; then, any incremental data will be updated to Doris and Elasticsearch via Flink. Meanwhile, Flink will pre-aggregate the data so as to release burden on Doris and Elasticsearch.</li><li><strong>Storage cost</strong>: Doris supports multi-table join queries and federated queries across Hive, Iceberg, Hudi, MySQL, and Elasticsearch. This allows us to split the large flat tables into smaller ones and partition them by update frequency. The benefits of doing so include a relief of storage burden and an increase of query throughput.</li><li><strong>Maintenance cost</strong>: Doris is of simple architecture and is compatible with MySQL protocol. Deploying Doris only involves two processes (FE and BE) with no dependency on other systems, making it easy to operate and maintain. 
Also, Doris supports querying external ES data tables. It can easily interface with the metadata in ES and automatically map the table schema from ES so we can conduct queries on Elasticsearch data via Doris without grappling with complex connections.</li></ol><p>What’s more, Doris supports multiple data ingestion methods, including batch import from remote storage such as HDFS and S3, data reads from MySQL binlog and Kafka, and real-time data synchronization or batch import from MySQL, Oracle, and PostgreSQL. It ensures service availability and data reliability through a consistency protocol and is capable of auto debugging. This is great news for our operators and maintainers.</p><p>Statistically speaking, these features have cut our storage cost by 42% and development cost by 40%.</p><p>During our usage of Doris, we have received lots of support from the open source Apache Doris community and timely help from the SelectDB team, which is now running a commercial version of Apache Doris.</p><p><img loading="lazy" alt="data-warehouse-architecture-2.0" src="https://cdnd.selectdb.com/assets/images/TME_3-877f2cc02538dcf78f20d08c679df9f3.png" width="1280" height="734" class="img_ev3q"></p><h1>Further Improvement to Serve Our Needs</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="introduce-a-semantic-layer">Introduce a Semantic Layer<a href="#introduce-a-semantic-layer" class="hash-link" aria-label="Direct link to Introduce a Semantic Layer" title="Direct link to Introduce a Semantic Layer"></a></h2><p>Speaking of the datasets, on the bright side, our data analysts are given the liberty of redefining and combining the tags and metrics at their convenience. But on the dark side, high heterogeneity of the tag and metric systems leads to more difficulty in their usage and management.</p><p>Our solution is to introduce a semantic layer in our data processing pipeline. 
The semantic layer is where all the technical terms are translated into more comprehensible concepts for our internal data users. In other words, we are turning the tags and metrics into first-class citizens for data definement and management.</p><p><img loading="lazy" alt="data-warehouse-architecture-3.0" src="https://cdnd.selectdb.com/assets/images/TME_4-f78029c9a317de442e0e00aac053140a.png" width="1280" height="743" class="img_ev3q"></p><p><strong>Why would this help?</strong></p><p>For data analysts, all tags and metrics will be created and shared at the semantic layer so there will be less confusion and higher efficiency.</p><p>For data users, they no longer need to create their own datasets or figure out which one is applicable for each scenario but can simply conduct queries on their specified tagset and metricset.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="upgrade-the-semantic-layer">Upgrade the Semantic Layer<a href="#upgrade-the-semantic-layer" class="hash-link" aria-label="Direct link to Upgrade the Semantic Layer" title="Direct link to Upgrade the Semantic Layer"></a></h2><p>Explicitly defining the tags and metrics at the semantic layer was not enough. In order to build a standardized data processing system, our next goal was to ensure consistent definition of tags and metrics throughout the whole data processing pipeline.</p><p>For this sake, we made the semantic layer the heart of our data management system:</p><p><img loading="lazy" alt="data-warehouse-architecture-4.0" src="https://cdnd.selectdb.com/assets/images/TME_5-69933329bfdc217369664b15c2ec4766.png" width="1280" height="714" class="img_ev3q"></p><p><strong>How does it work?</strong></p><p>All computing logics in TDW will be defined at the semantic layer in the form of a single tag or metric.</p><p>The semantic layer receives logic queries from the application side, selects an engine accordingly, and generates SQL. Then it sends the SQL command to TDW for execution. 
Meanwhile, it might also send configuration and data ingestion tasks to Doris and decide which metrics and tags should be accelerated.</p><p>In this way, we have made the tags and metrics more manageable. A fly in the ointment is that since each tag and metric is individually defined, we are struggling with automating the generation of a valid SQL statement for the queries. If you have any idea about this, you are more than welcome to talk to us.</p><h1>Give Full Play to Apache Doris</h1><p>As you can see, Apache Doris has played a pivotal role in our solution. Optimizing the usage of Doris can largely improve our overall data processing efficiency. So in this part, we are going to share with you what we do with Doris to accelerate data ingestion and queries and reduce costs.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="what-we-want">What We Want?<a href="#what-we-want" class="hash-link" aria-label="Direct link to What We Want?" title="Direct link to What We Want?"></a></h2><p><img loading="lazy" alt="goals-of-a-data-analytic-solution" src="https://cdnd.selectdb.com/assets/images/TME_6-d0c8cb8b9a7501650f26ae3018b58b14.png" width="1280" height="444" class="img_ev3q"></p><p>Currently, we have 800+ tags and 1300+ metrics derived from the 80+ source tables in TDW.</p><p>When importing data from TDW to Doris, we hope to achieve:</p><ul><li><strong>Real-time availability:</strong> In addition to the traditional T+1 offline data ingestion, we require real-time tagging.</li><li><strong>Partial update</strong>: Each source table generates data through its own ETL task at various paces and involves only part of the tags and metrics, so we require the support for partial update of columns.</li><li><strong>High performance</strong>: We need a response time of only a few seconds in group targeting, analysis and reporting scenarios.</li><li><strong>Low costs</strong>: We hope to reduce costs as much as possible.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" 
id="what-we-do">What We Do?<a href="#what-we-do" class="hash-link" aria-label="Direct link to What We Do?" title="Direct link to What We Do?"></a></h2><ol><li><strong>Generate Flat Tables in Flink Instead of TDW</strong></li></ol><p><img loading="lazy" alt="generate-flat-tables-in-Flink" src="https://cdnd.selectdb.com/assets/images/TME_7-6ec720f226a737d5cf91c74a386319b4.png" width="1280" height="567" class="img_ev3q"></p><p>Generating flat tables in TDW has a few downsides:</p><ul><li><strong>High storage cost</strong>: TDW has to maintain an extra flat table apart from the discrete 80+ source tables. That’s huge redundancy.</li><li><strong>Low real-timeliness</strong>: Any delay in the source tables will be augmented and retard the whole data link.</li><li><strong>High development cost</strong>: To achieve real-timeliness would require extra development efforts and resources.</li></ul><p>On the contrary, generating flat tables in Doris is much easier and less expensive. The process is as follows:</p><ul><li>Use Spark to import new data into Kafka in an offline manner.</li><li>Use Flink to consume Kafka data.</li><li>Create a flat table via the primary key ID.</li><li>Import the flat table into Doris.</li></ul><p>As is shown below, Flink has aggregated the five lines of data, of which “ID”=1, into one line in Doris, reducing the data writing pressure on Doris.</p><p><img loading="lazy" alt="flat-tables-in-Flink-2" src="https://cdnd.selectdb.com/assets/images/TME_8-c5c8e4d117fb6c1157c42f6ab14829e0.png" width="1280" height="622" class="img_ev3q"></p><p>This can largely reduce storage costs since TDW no longer has to maintain two copies of data and Kafka only needs to store the new data pending for ingestion. What’s more, we can add whatever ETL logic we want into Flink and reuse lots of development logic for offline and real-time data ingestion.</p><p><strong>2. 
Name the Columns Smartly</strong></p><p>As we mentioned, the Aggregate Model of Doris allows partial update of columns. Here we provide a simple introduction to other data models in Doris for your reference:</p><p><strong>Unique Model</strong>: This is applicable for scenarios requiring primary key uniqueness. It only keeps the latest data of the same primary key ID. (As far as we know, the Apache Doris community is planning to include partial update of columns in the Unique Model, too.)</p><p><strong>Duplicate Model</strong>: This model stores all original data exactly as it is without any pre-aggregation or deduplication.</p><p>After determining the data model, we had to think about how to name the columns. Using the tags or metrics as column names was not a choice because:</p><p>I. Our internal data users might need to rename the metrics or tags, but Doris 1.1.3 does not support modification of column names.</p><p>II. Tags might be taken online and offline frequently. If that involves the adding and dropping of columns, it will be not only time-consuming but also detrimental to query performance.</p><p>Instead, we do the following:</p><ul><li><strong>For flexible renaming of tags and metrics</strong>, we use MySQL tables to store the metadata (name, globally unique ID, status, etc.). Any change to the names will only happen in the metadata but will not affect the table schema in Doris. For example, if a <code>song_name</code> is given an ID of 4, it will be stored with the column name of a4 in Doris. Then if the <code>song_name</code>is involved in a query, it will be converted to a4 in SQL.</li><li><strong>For the onlining and offlining of tags</strong>, we sort out the tags based on how frequently they are being used. The least used ones will be given an offline mark in their metadata. 
No new data will be put under the offline tags but the existing data under those tags will still be available.</li><li><strong>For real-time availability of newly added tags and metrics</strong>, we prebuild a few ID columns in Doris tables based on the mapping of name IDs. These reserved ID columns will be allocated to the newly added tags and metrics. Thus, we can avoid table schema change and the consequent overheads. Our experience shows that only 10 minutes after the tags and metrics are added, the data under them can be available.</li></ul><p>Noteworthily, the recently released Doris 1.2.0 supports Light Schema Change, which means that to add or remove columns, you only need to modify the metadata in FE. Also, you can rename the columns in data tables as long as you have enabled Light Schema Change for the tables. This is a big trouble saver for us.</p><p><strong>3. Optimize Data Writing</strong></p><p>Here are a few practices that have reduced our daily offline data ingestion time by 75% and our CUMU compaction score from 600+ to 100.</p><ul><li>Flink pre-aggregation: as is mentioned above.</li><li>Auto-sizing of writing batch: To reduce Flink resource usage, we enable the data in one Kafka Topic to be written into various Doris tables and realize the automatic alteration of batch size based on the data amount.</li><li>Optimization of Doris data writing: fine-tune the sizes of tablets and buckets as well as the compaction parameters for each scenario:</li></ul><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">max_XXXX_compaction_thread</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token 
plain">max_cumulative_compaction_num_singleton_deltas</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><ul><li>Optimization of the BE commit logic: conduct regular caching of BE lists, commit them to the BE nodes batch by batch, and use finer load balancing granularity.</li></ul><p><img loading="lazy" alt="stable-compaction-score" src="https://cdnd.selectdb.com/assets/images/TME_9-f599364617a05d42a19e5430e500d6f7.png" width="1280" height="511" class="img_ev3q"></p><p><strong>4. Use Doris-on-ES in Queries</strong></p><p>About 60% of our data queries involve group targeting. Group targeting is to find our target data by using a set of tags as filters. It poses a few requirements for our data processing architecture:</p><ul><li>Group targeting related to APP users can involve very complicated logic. That means the system must support hundreds of tags as filters simultaneously.</li><li>Most group targeting scenarios only require the latest tag data. However, metric queries need to support historical data.</li><li>Data users might need to perform further aggregated analysis of metric data after group targeting.</li><li>Data users might also need to perform detailed queries on tags and metrics after group targeting.</li></ul><p>After consideration, we decided to adopt Doris-on-ES. Doris is where we store the metric data for each scenario as a partition table, while Elasticsearch stores all tag data. 
The Doris-on-ES solution combines the distributed query planning capability of Doris and the full-text search capability of Elasticsearch. The query pattern is as follows:</p><div class="codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">SELECT tag, agg(metric) </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> FROM Doris </span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> WHERE id in (select id from Es where tagFilter)</span><br></span><span class="token-line" style="color:#F8F8F2"><span class="token plain"> GROUP BY tag</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>As is shown, the ID data located in Elasticsearch will be used in the sub-query in Doris for metric analysis.</p><p>In practice, we find that the query response time is related to the size of the target group. If the target group contains over one million objects, the query will take up to 60 seconds. If it is even larger, a timeout error might occur.</p><p>After investigation, we identified our two biggest time wasters:</p><p>I. 
When Doris BE pulls data from Elasticsearch (1024 lines at a time by default), for a target group of over one million objects, the network I/O overhead can be huge.</p><p>II. After the data pulling, Doris BE needs to conduct Join operations with local metric tables via SHUFFLE/BROADCAST, which can cost a lot.</p><p><img loading="lazy" alt="Doris-on-Elasticsearch" src="https://cdnd.selectdb.com/assets/images/TME_10-b177da2c3e9ab23ad3fb8e1784012442.png" width="1280" height="883" class="img_ev3q"></p><p>Thus, we make the following optimizations:</p><ul><li>Add a query session variable <code>es_optimize</code> that specifies whether to enable optimization.</li><li>In data writing into ES, add a BK column to store the bucket number after the primary key ID is hashed. The algorithm is the same as the bucketing algorithm in Doris (CRC32).</li><li>Use Doris BE to generate a Bucket Join execution plan, dispatch the bucket number to BE ScanNode and push it down to ES.</li><li>Use ES to compress the queried data; turn multiple data fetch into one and reduce network I/O overhead.</li><li>Make sure that Doris BE only pulls the data of buckets related to the local metric tables and conducts local Join operations directly to avoid data shuffling between Doris BEs.</li></ul><p><img loading="lazy" alt="Doris-on-Elasticsearch-2" src="https://cdnd.selectdb.com/assets/images/TME_11-5ac5f455cdcab0a0b8b1207d61b24afb.png" width="1280" height="924" class="img_ev3q"></p><p>As a result, we reduce the query response time for large group targeting from 60 seconds to a surprising 3.7 seconds.</p><p>Community information shows that Doris is going to support inverted indexing since version 2.0.0, which is soon to be released. With this new version, we will be able to conduct full-text search on text types, equivalence or range filtering of texts, numbers, and datetime, and conveniently combine AND, OR, NOT logic in filtering since the inverted indexing supports array types. 
This new feature of Doris is expected to deliver 3~5 times better performance than Elasticsearch on the same task.</p><p><strong>5. Refine the Management of Data</strong></p><p>Doris’ capability of cold and hot data separation provides the foundation of our cost reduction strategies in data processing.</p><ul><li>Based on the TTL mechanism of Doris, we only store data of the current year in Doris and put the historical data before that in TDW for lower storage cost.</li><li>We vary the numbers of copies for different data partitions. For example, we set three copies for data of the recent three months, which is used frequently, one copy for data older than six months, and two copies for data in between.</li><li>Doris supports turning hot data into cold data so we only store data of the past seven days in SSD and transfer data older than that to HDD for less expensive storage.</li></ul><h1>Conclusion</h1><p>Thank you for scrolling all the way down here and finishing this long read. We’ve shared our cheers and tears, lessons learned, and a few practices that might be of some value to you during our transition from ClickHouse to Doris. 
We really appreciate the help from the Apache Doris community, but we might still be chasing them around for a while since we attempt to realize auto-identification of cold and hot data, pre-computation of frequently used tags/metrics, simplification of code logic using Materialized Views, and so on and so forth.</p><p><strong># Links</strong></p><p><strong>Apache Doris</strong>:</p><p><a href="http://doris.apache.org" target="_blank" rel="noopener noreferrer">http://doris.apache.org</a></p><p><strong>Apache Doris Github</strong>:</p><p><a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">https://github.com/apache/doris</a></p><p>Find Apache Doris developers on <a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" target="_blank" rel="noopener noreferrer">Slack</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Improving-Query-Speed-to-Make-the-Most-out-of-Your-Data">Best practice in Duyansoft, improving query speed to make the most out of your data</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Junfei Liu</span></span><time datetime="2023-02-27T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">February 27, 2023</time></div></header><div class="markdown" itemprop="articleBody"><blockquote><p>Author: Junfei Liu, Senior Architect of Duyansoft</p></blockquote><p><img loading="lazy" alt="Duyansoft-use-case-of-Apache-Doris" 
src="https://cdnd.selectdb.com/assets/images/Duyansoft-338cbc4c47491d4110145175cfa2d0ba.png" width="900" height="383" class="img_ev3q"></p><p>The world is getting more and more value out of data, as exemplified by the currently much-talked-about ChatGPT, which I believe is a robotic data analyst. However, in today’s era, what’s more important than the data itself is the ability to locate your wanted information among all the overflowing data quickly. So in this article, I will talk about how I improved overall data processing efficiency by optimizing the choice and usage of data warehouses.</p><h1>Too Much Data on My Plate</h1><p>The choice of data warehouses was never high on my worry list until 2021. I have been working as a data engineer for a Fintech SaaS provider since its incorporation in 2014. In the company’s infancy, we didn’t have too much data to juggle. We only needed a simple tool for OLTP and business reporting, and the traditional databases would cut the mustard.</p><p><img loading="lazy" alt="data-processing-pipeline-Duyansoft" src="https://cdnd.selectdb.com/assets/images/Duyan_1-be681a0c4e3b94cdca6f6476698be732.png" width="1466" height="590" class="img_ev3q"></p><p>But as the company grew, the data we received became overwhelmingly large in volume and increasingly diversified in sources. Every day, we had tons of user accounts logging in and sending myriads of requests. It was like collecting water from a thousand taps to put out a million scattered pieces of fire in a building, except that you must bring the exact amount of water needed for each fire spot. Also, we got more and more emails from our colleagues asking if we could make data analysis easier for them. That’s when the company assembled a big data team to tackle the beast.</p><p>The first thing we did was to revolutionize our data processing architecture. 
We used DataHub to collect all our transactional or log data and ingest it into an offline data warehouse for data processing (analyzing, computing, etc.). Then the results would be exported to MySQL and then forwarded to QuickBI to display the reports visually. We also replaced MongoDB with a real-time data warehouse for business queries.</p><p><img loading="lazy" alt="Data-ingestion-ETL-ELT-application" src="https://cdnd.selectdb.com/assets/images/Duyan_2-e87f8780d1f9b74df15d81a94718b378.png" width="1564" height="704" class="img_ev3q"></p><p>This new architecture worked, but there remained a few pebbles in our shoes:</p><ul><li><strong>We wanted faster responses.</strong> MySQL could be slow in aggregating large tables, but our product guys requested a query response time of fewer than five seconds. So first, we tried to optimize MySQL. Then we also tried to skip MySQL and directly connect the offline data warehouse with QuickBI, hoping that the combination of query acceleration capability of the former and caching of the latter would do the magic. Still, that five-second goal seemed to be unreachable. There was a time when I believed the only perfect solution was for the product team to hire people with more patience.</li><li><strong>We wanted less pain in maintaining dimension tables.</strong> The offline data warehouse conducted data synchronization every five minutes, making it not applicable for frequent data update or deletion scenarios. If we needed to maintain dimension tables in it, we would have to filter and deduplicate the data regularly to ensure data consistency. Out of our trouble-averse instinct, we chose not to do so.</li><li><strong>We wanted support for point queries of high concurrency.</strong> The real-time database that we previously used required up to 500ms to respond to highly concurrent point queries in both columnar storage and row storage, even after optimization. 
That was not good enough.</li></ul><h1>Hit It Where It Hurts Most</h1><p>In March 2022, we started our hunt for a better data warehouse. To our disappointment, there was no one-size-fits-all solution. Most of the tools we looked into were only good at one or a few of the tasks, but if we gathered the best performer for each usage scenario, that would add up to a heavy and messy toolkit, which was against instinct.</p><p>So we decided to solve our biggest headache first: slow response, as it was hurting both the experience of our users and our internal work efficiency.</p><p>To begin with, we tried to move the largest tables from MySQL to <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">Apache Doris</a>, a real-time analytical database that supports MySQL protocol. That reduced the query execution time by a factor of eight. Then we tried and used Doris to accommodate more data.</p><p>As for now, we are using two Doris clusters: one to handle point queries (high QPS) from our users and the other for internal ad-hoc queries and reporting. As a result, users have reported smoother experience and we can provide more features that used to be bottlenecked by slow query execution. Moving our dimension tables to Doris also brought fewer data errors and higher development efficiency.</p><p><img loading="lazy" alt="Data-ingestion-ETL-ELT-Doris-application" src="https://cdnd.selectdb.com/assets/images/Duyan_3-0abe8037381914932a2d763843a2ed34.png" width="1356" height="864" class="img_ev3q"></p><p>Both the FE and BE processes of Doris can be scaled out, so tens of PBs of data stored in hundreds of devices can be put into one single cluster. In addition, the two types of processes implement a consistency protocol to ensure service availability and data reliability. 
This removes dependency on Hadoop and thus saves us the cost of deploying Hadoop clusters.</p><h1>Tips</h1><p>Here are a few of our practices to share with you:</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-model"><strong>Data Model:</strong><a href="#data-model" class="hash-link" aria-label="Direct link to data-model" title="Direct link to data-model"></a></h2><p>Out of the three Doris data models, we find the Unique Model and the Aggregate Model suit our needs most. For example, we use the Unique Model to ensure data consistency while ingesting dimension tables and original tables and the Aggregate Model to import report data.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-ingestion"><strong>Data Ingestion:</strong><a href="#data-ingestion" class="hash-link" aria-label="Direct link to data-ingestion" title="Direct link to data-ingestion"></a></h2><p>For real-time data ingestion, we use the Flink-Doris-Connector: After our business data, the MySQL-based binlogs, is written into Kafka, it will be parsed by Flink and then loaded into Doris in a real-time manner.</p><p>For offline data ingestion, we use DataX: This mainly involves the computed report data in our offline data warehouse.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-management"><strong>Data Management:</strong><a href="#data-management" class="hash-link" aria-label="Direct link to data-management" title="Direct link to data-management"></a></h2><p>We back up our cluster data in a remote storage system via Broker. 
Then, it can restore the data from the backups to any Doris cluster if needed via the restore command.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="monitoring-and-alerting"><strong>Monitoring and Alerting:</strong><a href="#monitoring-and-alerting" class="hash-link" aria-label="Direct link to monitoring-and-alerting" title="Direct link to monitoring-and-alerting"></a></h2><p>In addition to the various monitoring metrics of Doris, we deployed an audit log plugin to keep a closer eye on certain slow SQL of certain users for optimization.</p><p>Slow SQL queries:</p><p><img loading="lazy" alt="slow-SQL-queries-monitoring" src="https://cdnd.selectdb.com/assets/images/Duyan_4-4c0a444296e8e0489a68597c56a23c51.png" width="1080" height="437" class="img_ev3q"></p><p>Some of our often-used monitoring metrics:</p><p><img loading="lazy" alt="monitoring-metrics" src="https://cdnd.selectdb.com/assets/images/Duyan_5-fab738ac780df0ae000a0a7238093e35.png" width="1080" height="451" class="img_ev3q"></p><p><strong>Tradeoff Between Resource Usage and Real-Time Availability:</strong></p><p>It turned out that using Flink-Doris-Connector for data ingestion can result in high cluster resource usage, so we increased the interval between each data writing from 3s to 10 or 20s, compromising a little bit on the real-time availability of data in exchange for much less resource usage.</p><h1>Communication with Developers</h1><p>We have been in close contact with the open source Doris community all the way from our investigation to our adoption of the data warehouse, and we’ve provided a few suggestions to the developers:</p><ul><li>Enable Flink-Doris-Connector to support simultaneous writing of multiple tables in a single sink.</li><li>Enable Materialized Views to support Join of multiple tables.</li><li>Optimize the underlying compaction of data and reduce resource usage as much as possible.</li><li>Provide optimization suggestions for slow SQL and warnings for abnormal table creation 
behaviors.</li></ul><p>If the perfect data warehouse is not there to be found, I think providing feedback for the second best is a way to help make one. We are also looking into its commercialized version called SelectDB to see if more custom-tailored advanced features can grease the wheels.</p><h1>Conclusion</h1><p>As we set out to find a single data warehouse that could serve all our needs, we ended up finding something less than perfect but good enough to improve our query speed by a wide margin and discovered some surprising features of it along the way. So if you wiggle between different choices, you may bet on the one with the thing you want most badly, and taking care of the rest wouldn’t be so hard.</p><p><strong>Try</strong> <a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer"><strong>Apache Doris</strong></a> <strong>out!</strong></p><p>It is an open source real-time analytical database based on MPP architecture. It supports both high-concurrency point queries and high-throughput complex analysis.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/linkedcare">ClickHouse &amp; Kudu to Doris: 10X concurrency increased, 70% latency down</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Yi Yang</span></span><time datetime="2023-01-28T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">January 28, 2023</time></div></header><div class="markdown" itemprop="articleBody"><p><img 
loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-c9c4b972a14903911ba1674b76f5edca.png" width="900" height="383" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="author">Author:<a href="#author" class="hash-link" aria-label="Direct link to Author:" title="Direct link to Author:"></a></h2><p>YiYang, Senior Big Data Developer, Linkedcare</p><h1>About Linkedcare</h1><p>Linkedcare is a leading SaaS software company in the health technology industry, focusing on the medical dental and cosmetic plastic surgery. In 2021, it was selected as one of the top 150 digital healthcare companies in the world by CB Insights. Linkedcare has served thousands of plastic surgery institutions in Los Angeles, Taiwan, and Hong Kong. Linkedcare also provides integrated management system services for dental clinics, covering electronic medical records, customer relationship management, intelligent marketing, B2B trading platform, insurance payment, BI tools, etc.</p><h1>Doris&#x27; Evolution in Linkedcare</h1><p>Let me briefly introduce Doris&#x27;s development in Linkedcare first. In general, the application of Doris in Linkedcare can be divided into two stages:</p><ol><li>The value-added report provided by Linkedcare to customers was initially provided by ClickHouse, which was later replaced by Apache Doris;</li><li>Due to the continuous improvement of real-time data analysis requirements, T+1&#x27;s data reporting gradually cannot meet business needs. Linkedcare needs a data warehouse that can handle real-time processing, and Doris has been introduced into the company&#x27;s data warehouse since then. 
With the support of the Apache Doris community and the SelectDB professional technical team, our business data has been gradually migrated from Kudu to Doris.</li></ol><p><img loading="lazy" alt="1" src="https://cdnd.selectdb.com/assets/images/1-39a723280720a07dc2ed0a7de5c99c9b.png" width="1696" height="866" class="img_ev3q"></p><h1>Data Service Architecture: From ClickHouse to Doris</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-service-architecture-requirements">Data Service Architecture Requirements<a href="#data-service-architecture-requirements" class="hash-link" aria-label="Direct link to Data Service Architecture Requirements" title="Direct link to Data Service Architecture Requirements"></a></h2><ul><li>Support complex queries: When customers do self-service on the dashboard, a complex SQL query statement will be generated to directly query the database, and the complexity of the statement is unknown, which adds a lot of pressure on the database and affects query performance.</li><li>High concurrency and low latency: At least 100 concurrent queries can be supported, and query results can be returned within 1 second;</li><li>Real-time data update: The report data comes from the SaaS system. When the customer modifies the historical data in the system, the report data must be changed accordingly to ensure consistency, which requires real-time processing.</li><li>Low cost and easy deployment: There are a lot of private cloud customers in our SaaS business. 
In order to reduce labor costs, the business requires that the architecture deployment and operation and maintenance be simple enough.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="early-problems-found-clickhouse-shuts-down-when-high-concurrency-occurs">Early Problems Found: ClickHouse Shuts Down When High-concurrency Occurs<a href="#early-problems-found-clickhouse-shuts-down-when-high-concurrency-occurs" class="hash-link" aria-label="Direct link to Early Problems Found: ClickHouse Shuts Down When High-concurrency Occurs" title="Direct link to Early Problems Found: ClickHouse Shuts Down When High-concurrency Occurs"></a></h2><p>The previous project chose ClickHouse to provide data query services, but serious concurrency problems occurred during use:
10 concurrent queries will cause ClickHouse to shut down, resulting in the inability to provide services to customers normally, which is the direct reason for us to replace ClickHouse.</p><p>In addition, there are several severe problems:</p><ol><li>The cost of ClickHouse services on the cloud is very high, and the dependency on ClickHouse components is relatively high. The frequent interaction between ClickHouse and Zookeeper during data ingestion will put greater pressure on stability.</li><li>How to seamlessly migrate data without affecting the normal use of customers is another problem.</li></ol><h2 class="anchor anchorWithStickyNavbar_LWe7" id="selection-between-doris-clickhouse-and-kudu">Selection between Doris, Clickhouse and Kudu<a href="#selection-between-doris-clickhouse-and-kudu" class="hash-link" aria-label="Direct link to Selection between Doris, Clickhouse and Kudu" title="Direct link to Selection between Doris, Clickhouse and Kudu"></a></h2><p>To deal with the existing problems and meet the business requirements, we decided to conduct research on Doris (0.14), Clickhouse, and Kudu respectively.</p><p><img loading="lazy" alt="2" src="https://cdnd.selectdb.com/assets/images/2-bd04a72816c9ff95512e08d3f6e8e05f.png" width="1600" height="454" class="img_ev3q"></p><p>As shown in the table above, we made a deep comparison of these 3 databases. And we can see that Doris has excellent performance in many aspects:</p><ul><li>High concurrency: Doris can handle high-concurrency of 1,000 and more. So it will easily solve the problem of 10 concurrent queries which led ClickHouse to shut down.</li><li>Query performance: Doris can achieve millisecond-level query response. In single-table query, although Doris and ClickHouse are almost equivalent in query performance, in multi-table query, Doris is far better than ClickHouse. 
Doris can make sure that the QPS won&#x27;t drop when high-concurrency happens.</li><li>Data update: Doris&#x27; data model can meet our needs for data update to ensure the consistency of system data and business data, which will be described in detail below.</li><li>Ease of use: Doris has a flat architecture, simple and fast deployment, fully-completed data ingest functions, and good at scaling out; At the same time, Doris can automatically perform replica balancing internally, and the operation and maintenance cost is extremely low. However, ClickHouse and Kudu rely heavily on components and require a lot of preparatory work for use. This requires a professional team to handle a large number of daily operation and maintenance tasks.</li><li>Standard SQL: Doris is compatible with the MySQL protocol and uses standard SQL. It is easy for developers to get started and does not require additional learning costs.</li><li>Distributed JOINs: Doris supports distributed JOINs, but ClickHouse has limitations in JOIN queries and functions as well as poor maintainability.</li><li>Active community: The Apache Doris open source community is active with passion. At the same time, SelectDB provides a professional and full-time team for technical support for the Doris community. If you encounter problems, you can directly contact the community and find out a solution in time.</li></ul><p>From the above research, we can find that Doris has excellent capabilities in all aspects and is very in line with our needs. 
Therefore, we adopt Doris instead of ClickHouse, which solves the problems of poor concurrency and the shutdown of ClickHouse.</p><h1>Data Warehouse Architecture: From Kudu+Impala to Doris</h1><p>In the process of using data reports, we have gradually discovered many advantages of Doris, so we decided to introduce Doris to the company&#x27;s data warehouse.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-warehouse-architecture-requirements">Data Warehouse Architecture Requirements<a href="#data-warehouse-architecture-requirements" class="hash-link" aria-label="Direct link to Data Warehouse Architecture Requirements" title="Direct link to Data Warehouse Architecture Requirements"></a></h2><ul><li>When the customer modifies the historical data in the system, the report data should also be changed accordingly. At the same time, there should be a feature that can help customers to change the value of a single column;</li><li>When Flink extracts the full amount of data from the business database and writes it into the data warehouse frequently, the version compaction must keep up with the speed of new version generation, and will not cause version accumulation;</li><li>Through resource isolation and other functions, Doris reduces the possibility of resource preemption, improves resource utilization, and makes full use of resources on the core computing nodes;</li><li>Due to the limited memory resources in the company, overloaded tasks must be completed without increasing the number of clusters.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="early-problems-found-kuduimpala-underperforms">Early Problems Found: Kudu+Impala Underperforms<a href="#early-problems-found-kuduimpala-underperforms" class="hash-link" aria-label="Direct link to Early Problems Found: Kudu+Impala Underperforms" title="Direct link to Early Problems Found: Kudu+Impala Underperforms"></a></h2><p>The early company data warehouse architecture used Kudu and Impala for computing and 
storage. But we found the following problems during use:</p><ol><li>When the number of concurrent queries (QPS) is large, the simple query response time of Kudu+Impala is always more than a few seconds, which cannot reach the millisecond-level required by the business. The long waiting time has brought bad user experience to customers. </li><li>The Kudu+Impala engine cannot perform incremental aggregation of factual data, and can barely support real-time data analysis.</li><li>Kudu relies on a large number of primary key lookups when ingesting data. The batch processing efficiency is low and Kudu consumes a lot of CPU, which is not friendly to resource utilization.</li></ol><h2 class="anchor anchorWithStickyNavbar_LWe7" id="new-data-warehouse-architecture-design-based-on-doris">New Data Warehouse Architecture Design Based on Doris<a href="#new-data-warehouse-architecture-design-based-on-doris" class="hash-link" aria-label="Direct link to New Data Warehouse Architecture Design Based on Doris" title="Direct link to New Data Warehouse Architecture Design Based on Doris"></a></h2><p><img loading="lazy" alt="3" src="https://cdnd.selectdb.com/assets/images/3-e7990ac868e7345d5fda0512b0ec6b8c.png" width="1280" height="690" class="img_ev3q"></p><p>As shown in the figure above, Apache Doris is used in the new architecture and is responsible for data warehouse storage and computing; Data ingestion of real-time data and ODS data through Kafka has been replaced with Flink; We use Duckula as our stream computing platform; While we introduce DolphinSchedular for our task scheduling.</p><h1>Benefits of the new architecture based on Apache Doris:</h1><ul><li>The new data warehouse architecture based on Doris no longer depends on Hadoop related components, and the operation and maintenance cost is low.</li><li>Higher performance. 
Doris uses less server resources but provides stronger data processing capabilities;</li><li>Doris supports high concurrency and can directly support WebApp query services;</li><li>Doris supports the access to external tables, which enable easy data publishing and data ingestion;</li><li>Doris supports dynamic scaling out and automatic data balance;</li><li>Doris supports multiple federated queries, including Hive, ES, MySQL, etc.;</li><li>Doris&#x27; Aggregate Model supports users updating a single column;</li><li>By adjusting BE parameters and cluster size, the problem of version accumulation can be effectively solved;</li><li>Through the Resource Tag and Query Block function, cluster resource isolation can be realized, resource usage rate can be reduced, and query performance can be improved.</li></ul><p>Thanks to the excellent capabilities of the new architecture, the cluster we use has been reduced from 18 pieces of 16Cores 128G to 12 pieces of 16Cores 128G, saving up to 33% of resources compared to before; Further, the computing performance has been greatly improved. Doris can complete an ETL task that was completed in 3 hours on Kudu in only 1 hour. In addition, in frequent updates, Kudu&#x27;s internal data fragmentation files cannot be automatically merged so that the performance will become worse and worse, requiring regular rebuilding; While the compaction function of Doris can effectively solve this problem.</p><h1>Highly Recommended</h1><p>The cost of using Doris is very low. Only 3 low-end servers or even desktops can be used to deploy easily a data warehouse based on Apache Doris; For enterprises with limited investment and do not want to be left behind by the market, it is highly recommended to try Apache Doris.</p><p>Doris is also a mature analytical database with MPP architecture. At the same time, its community is very active and easy to communicate with. 
SelectDB, the commercial company behind Doris, has set up a full-time technical team for the community. Any questions can be answered within 1 hour. In the last year, the community has been continuously promoted by SelectDB and introduced a series of industry-leading new features. In addition, the community will seriously consider the user habits when iterating, which will bring a lot of convenience.</p><p>I really appreciate the full support from the Doris community and the SelectDB team. And I sincerely recommend developers and enterprises to start with Apache Doris today.</p><h1>Apache Doris</h1><p>Apache Doris is a real-time analytical database based on MPP architecture, known for its high performance and ease of use. It supports both high-concurrency point queries and high-throughput complex analysis. (<a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">https://github.com/apache/doris</a>)</p><h1>Links</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="github">GitHub:<a href="#github" class="hash-link" aria-label="Direct link to GitHub:" title="Direct link to GitHub:"></a></h2><p><a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">https://github.com/apache/doris</a></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="apache-doris-website">Apache Doris Website:<a href="#apache-doris-website" class="hash-link" aria-label="Direct link to Apache Doris Website:" title="Direct link to Apache Doris Website:"></a></h2><p><a href="https://doris.apache.org" target="_blank" rel="noopener noreferrer">https://doris.apache.org</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 
class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/LY">The Efficiency of the data warehouse greatly improved in LY Digital</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Xing Wang</span></span><time datetime="2022-12-19T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 19, 2022</time></div></header><div class="markdown" itemprop="articleBody"><blockquote><p>Guide: Established in 2015, LY Digital is a financial service platform for tourism industry under LY. Com. In 2020, LY Digital introduced Apache Doris to build a data warehouse because of its rich data import methods, excellent parallel computing capabilities, and low maintenance costs. This article describes the evolution of data warehouse in LY Digital and why we switch to Apache Doris. I hope you like it.</p></blockquote><blockquote><p>Author: XingWang, Lead Developer of LY Digital</p></blockquote><p><img loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-fb77e142257a98bea6656a33a626b310.png" width="900" height="383" class="img_ev3q"></p><h1>1. Background</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="11-about-ly-digital">1.1 About LY Digital<a href="#11-about-ly-digital" class="hash-link" aria-label="Direct link to 1.1 About LY Digital" title="Direct link to 1.1 About LY Digital"></a></h2><p>LY Digital is a tourism financial service platform under LY. Com. Formally established in 2015, LY Digital takes &quot;Digital technology empowers the tourism industry.&quot; as its vision.
At present, LY Digital&#x27;s business covers financial services, consumer financial services, financial technology and digital technology. So far, more than 10 million users and 76 cities have enjoyed our services.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="12-requirements-for-data-warehouse">1.2 Requirements for Data Warehouse<a href="#12-requirements-for-data-warehouse" class="hash-link" aria-label="Direct link to 1.2 Requirements for Data Warehouse" title="Direct link to 1.2 Requirements for Data Warehouse"></a></h2><ul><li>Dashboard: Needs dashboard for T+1 business, etc.</li><li>Early Warning System: Needs risk control, anomaly capital management and traffic monitoring, etc.</li><li>Business Analysis: Needs timely data query analysis and temporary data retrieval, etc.</li><li>Finance: Needs liquidation and payment reconciliation.</li></ul><h1>2. Previous Data Warehouse</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="21-architecture">2.1 Architecture<a href="#21-architecture" class="hash-link" aria-label="Direct link to 2.1 Architecture" title="Direct link to 2.1 Architecture"></a></h2><p><img loading="lazy" alt="page_1" src="https://cdnd.selectdb.com/assets/images/page_1-42732f62f592f158a33670ae04987e75.png" width="1152" height="679" class="img_ev3q"></p><p>Our previous data warehouse adopted the combination of StreamSets and Apache Kudu, which was very popular in the past few years. 
In this architecture, Binlog is ingested into Apache Kudu after passing through StreamSets in real-time, and is finally queried and used through Apache Impala and visualization tools.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="212-downside">2.1.2 Downside<a href="#212-downside" class="hash-link" aria-label="Direct link to 2.1.2 Downside" title="Direct link to 2.1.2 Downside"></a></h3><ul><li>The previous data warehouse has a sophisticated structure that consists of many components that interact with one another, which requires huge operation and maintenance costs.</li><li>Apache Kudu&#x27;s performance in wide tables Join is not so good.</li><li>SLA is not fully guaranteed because tenant isolation is not provided.</li><li>Although StreamSets are equipped with early warning capabilities, job recovery capabilities are still poor. When configuring multiple tasks, the JVM consumes a lot, resulting in slow recovery.</li></ul><h1>3. New Data Warehouse</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="31-research-of-popular-data-warehouses">3.1 Research of Popular Data Warehouses<a href="#31-research-of-popular-data-warehouses" class="hash-link" aria-label="Direct link to 3.1 Research of Popular Data Warehouses" title="Direct link to 3.1 Research of Popular Data Warehouses"></a></h2><p>Due to so many shortcomings, we had to give up the previous data warehouse. In 2020, we conducted an in-depth research on the popular data warehouses in the market.</p><p>During the research, we focused on comparing Clickhouse and Apache Doris. ClickHouse has a high utilization rate of CPU, so it performs well in single-table query. But it does not perform well in multi-table Joins and high QPS. On the other hand, Doris can not only support thousands of QPS per node. 
Thanks to the function of partitioning, it can also support high-concurrency queries at the QPS level of 10,000. Moreover, the horizontal scaling in and out of ClickHouse are complex, which cannot be done automatically at present. Doris supports online dynamic scaling, and can be expanded horizontally according to the development of the business.</p><p>In the research, Apache Doris stood out. Doris&#x27;s high-concurrency query capability is very attractive. Its dynamic scaling capabilities are also suitable for our flexible advertising business. So we chose Apache Doris for sure.</p><p><img loading="lazy" alt="page_2" src="https://cdnd.selectdb.com/assets/images/page_2-414a885ce6917a5bfddb76d64d882ea4.png" width="1145" height="676" class="img_ev3q"></p><p>After introducing Apache Doris, we upgraded the entire data warehouse:</p><ul><li>We collect MySQL Binlog through Canal and then it is ingested into Kafka. Because Apache Doris is highly compatible with Kafka, we can easily use Routine Load to load and import data.</li><li>We have made minor adjustments to the batch processing. For data stored in Hive, Apache Doris can ingest data from Hive through Broker Load. In this way, the data in batch processing can be directly ingested into Doris.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="32-why-we-choose-doris">3.2 Why We Choose Doris<a href="#32-why-we-choose-doris" class="hash-link" aria-label="Direct link to 3.2 Why We Choose Doris" title="Direct link to 3.2 Why We Choose Doris"></a></h2><p><img loading="lazy" alt="page_3" src="https://cdnd.selectdb.com/assets/images/page_3-ec6524eea65a399078e60bff590cb3ab.png" width="1137" height="676" class="img_ev3q"></p><p>The overall performance of Apache Doris is impressive:</p><ul><li>Data access: It provides rich data import methods and can support the access of many types of data sources;</li><li>Data connection: Doris supports JDBC and ODBC connections. And it can easily connect with BI tools. 
In addition, Doris uses the MySQL protocol for communication. Users can directly access Doris through various Client tools;</li><li>SQL syntax: Doris adopts MySQL protocol and it is highly compatible with MySQL syntax, supporting standard SQL, and is low in learning costs for developers;</li><li>MPP parallel computing: Doris provides excellent parallel computing capabilities and has obvious advantages in complex Join and wide table Join;</li><li>Fully-completed documentation: Doris official documentation is very profound, which is friendly for new users. </li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="33--architecture-of-real-time-processing">3.3 Architecture of Real-time Processing<a href="#33--architecture-of-real-time-processing" class="hash-link" aria-label="Direct link to 3.3 Architecture of Real-time Processing" title="Direct link to 3.3 Architecture of Real-time Processing"></a></h2><p><img loading="lazy" alt="page_4" src="https://cdnd.selectdb.com/assets/images/page_4-b6f04242c2a85d92cfd1814319127b20.png" width="1132" height="668" class="img_ev3q"></p><ul><li>Data source: In real-time processing, data sources come from business branches such as industrial finance, consumer finance, and risk control. They are all collected through Canal and API.</li><li>Data collection: After data collection through Canal-Admin, Canal sends the data to Kafka message queue. 
After that, the data is ingested into Doris through Routine Load.</li><li>Inside Doris: The Doris cluster constitutes a three-level layer of the data warehouse, namely: the DWD layer with the Unique model, the DWS layer with the Aggregation model, and the ADS application layer.</li><li>Data application: The data is applied in three aspects: real-time dashboard, data timeliness analysis and data service.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="34-new-features">3.4 New Features<a href="#34-new-features" class="hash-link" aria-label="Direct link to 3.4 New Features" title="Direct link to 3.4 New Features"></a></h2><p>The data import method is simple and adopts 3 different import methods according to different scenarios:</p><ul><li>Routine Load: When we submit the Routine Load task, there will be a process within Doris that consumes Kafka in real time, continuously reads data from Kafka and ingests it into Doris.</li><li>Broker Load: Offline data such as dim-tables and historical data are ingested into Doris in an orderly manner.</li><li>Insert Into: Used for batch processing tasks, Insert into is responsible for processing data in the DWD layer</li></ul><p>Doris&#x27; data model improves our development efficiency:</p><ul><li>The Unique model is used when accessing the DWD layer, which can effectively prevent repeated consumption of data.</li><li>In Doris, aggregation supports 4 models, such as Sum, Replace, Min, and Max. In this way, it may reduce a large amount of SQL code, and no longer allow us to manually write Sum, Min, Max and other codes.</li></ul><p>Doris query is efficient:</p><ul><li>It supports materialized view and Rollup materialized index. The bottom layer of the materialized view is similar to the concept of Cube and the precomputation process. As a way of exchanging space for time, special tables are generated at the bottom layer. In the query, materialized view maps to the tables and responds quickly.</li></ul><h1>4. 
Benefits of the New Data Warehouse</h1><ul><li>Data access: In the previous architecture, the Kudu table needs to be created manually during the imports through StreamSets. Lack of tools, the entire process of creating tables and tasks takes 20-30 minutes. Nowadays, fast data access can be realized through the platform. The access process of each table has been shortened from the previous 20-30 minutes to the current 3-5 minutes, which is to say that the performance has been improved by 5-6 times.</li><li>Data development: After using Doris, we can directly use the data models, such as Unique and Aggregation. The Duplicate model can well support logs, greatly speeding up the development process in ETL.</li><li>Query analysis: The bottom layer of Doris has functions such as materialized view and Rollup materialized index. Moreover, Doris has made many optimizations for wide table associations, such as Runtime Filter and other Joins. Compared with Doris, Apache Kudu requires more complex optimization to be better used.</li><li>Data report: It took 1-2 minutes to complete the rendering when we used Kudu to query before, but Doris responded in seconds or even milliseconds.</li><li>Easy maintenance: Doris is not as complex as Hadoop. In March, our IDC was relocated, and 12 Doris virtual machines were all migrated within three days. The overall operation is relatively simple. In addition to physically moving the machine, FE&#x27;s scaling only requires simple commands such as Add and Drop, which do not take a long time to do.</li></ul><h1>5. Look ahead</h1><ul><li>Realize data access based on Flink CDC: At present, Flink CDC is not introduced, but Kafka through Canal instead. The development efficiency can be even faster if we use Flink CDC. Flink CDC still needs us to write a certain amount of code, which is not friendly for data analysts to use directly. We hope that data analysts only need to write simple SQL or directly operate. 
In the future planning, we plan to introduce Flink CDC.</li><li>Keep up with the latest release: Now the latest version Apache Doris V1.2.0 has made great achievements in vectorization, multi-catalog, and light schema change. We will keep up with the community to upgrade the cluster and make full use of new features.</li><li>Strengthen the construction of related systems: Our current index system management, such as report metadata, business metadata, and other management levels still need to be improved. Although we have data quality monitoring functions, it still needs to be strengthened and improved in automation.</li></ul></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/BestPractice_Kwai">Best practice in Kwai: Apache Doris on Elasticsearch</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Xiang He</span></span><time datetime="2022-12-14T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 14, 2022</time></div></header><div class="markdown" itemprop="articleBody"><blockquote><p>Author: Xiang He, Head Developer of Big Data, Commercialization Team of Kwai</p></blockquote><p><img loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-846e4e39fd88e1e34d2474b23690d9b2.png" width="900" height="383" class="img_ev3q"></p><h1>1 About Kwai</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="11-kwai">1.1 Kwai<a href="#11-kwai" class="hash-link" aria-label="Direct link to 1.1 Kwai" title="Direct 
link to 1.1 Kwai"></a></h2><p>Kwai (HKG:1024) is a social network for short videos and trends. Discover funny short videos, contribute to the virtual community with recordings, videos of your life, playing daily challenges or likes the best memes and videos. Share your life with short videos and choose from dozens of magical effects and filters for them.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="12-kwais-commercial-report-engine">1.2 Kwai&#x27;s Commercial Report Engine<a href="#12-kwais-commercial-report-engine" class="hash-link" aria-label="Direct link to 1.2 Kwai&#x27;s Commercial Report Engine" title="Direct link to 1.2 Kwai&#x27;s Commercial Report Engine"></a></h2><p>Kwai’s commercial report engine provides advertisers with real-time query service for multi-dimensional analysis reports. And it also provides query service for multi-dimensional analysis reports for internal users. The engine is committed to dealing with high-performance, high-concurrency, and high-stability query problems in multi-dimensional analysis report cases.</p><h1>2 Previous Architecture</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="21-background">2.1 Background<a href="#21-background" class="hash-link" aria-label="Direct link to 2.1 Background" title="Direct link to 2.1 Background"></a></h2><p>Traditional OLAP engines deal with multi-dimensional analysis in a more pre-modeled way, by building a data cube (Cube) to perform operations such as Drill-down, Roll-up, Slice, and Dice and Pivot. Modern OLAP analysis introduces the idea of a relational model, representing data in two-dimensional relational tables. In the modeling process, usually there are two modeling methods. One is to ingest the data of multiple tables into one wide table through Join; the other is to use the star schema, divide the data into fact table and dim-table. And then Join them when querying.
Both options have some pros and cons:</p><p>Wide table:</p><p>Taking the idea of ​​exchanging space for time. The primary key of the dim-table is the unique ID to fill all dimensions, and multiple dimension data is stored in redundant storage. Its advantage is that it is convenient to query, unnecessary to associate additional dim-tables, which is way better. The disadvantage is that if there is a change in dimension data, the entire table needs to be refreshed, which is bad for high-frequency Update.</p><p>Star Schema:</p><p>Dimension data is completely separated from fact data. Dimension data is often stored in a dedicated engine (such as MySQL, Elasticsearch, etc.). When querying, dimension data is associated with the primary key. The advantage is that changes in dimension data do not affect fact data, which can support high-frequency Update operations. The disadvantage is that the query logic is relatively more complex, and multi-table Join may lead to performance loss.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="22-requirement-for-an-olap-engine">2.2 Requirement for an OLAP Engine<a href="#22-requirement-for-an-olap-engine" class="hash-link" aria-label="Direct link to 2.2 Requirement for an OLAP Engine" title="Direct link to 2.2 Requirement for an OLAP Engine"></a></h2><p>In Kwai’s business, the commercial reports engine supports the real-time query of the advertising effect for advertisers. When building the report engine, we expect to meet the following requirements:</p><ul><li>Immersive data: the original data of a single table increases by ten billion every day</li><li>High QPS in Query: thousand-level QPS on average</li><li>High stability requirements: SLA level of 99.9999 %</li></ul><p>Most importantly, due to frequent changes in dimension data, dim-tables need to support Update operations up to thousand-level QPS and further support requirements such as fuzzy matching and word segmentation retrieval.
Based on the above requirements, we chose star schema and built a report engine architecture with Apache Druid and Elasticsearch.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="23-previous-architecture-based-on-apache-druid">2.3 Previous Architecture: Based on Apache Druid<a href="#23-previous-architecture-based-on-apache-druid" class="hash-link" aria-label="Direct link to 2.3 Previous Architecture: Based on Apache Druid" title="Direct link to 2.3 Previous Architecture: Based on Apache Druid"></a></h2><p>We chose the combination of Elasticsearch and Apache Druid. In data import, we use Flink to pre-aggregate the data at minute-level, and use Kafka to pre-aggregate the data at hour-level. In data query, the application initiates a query request through RE Front API, and Re Query initiates queries to the dim-table engine (Elasticsearch and MySQL) and the extension engine respectively.</p><p>Druid is a timing-based query engine that supports real-time data ingestion and is used to store and query large amounts of fact data. We adopt Elasticsearch based on those concerns:</p><ul><li>High update frequency, QPS is around 1000</li><li>Support word segmentation and fuzzy search, which is suitable for Kwai</li><li>Supports high-level dim-table data, which can be directly qualified without adopting sub-database and sub-table just like MySQL database</li><li>Supports data synchronization monitoring, and has check and recovery services as well</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="24-engine-of-the-reports">2.4 Engine of the Reports<a href="#24-engine-of-the-reports" class="hash-link" aria-label="Direct link to 2.4 Engine of the Reports" title="Direct link to 2.4 Engine of the Reports"></a></h2><p>The report engine can be divided into two layers: REFront and REQuery. REMeta is an independent metadata management module. The report engine implements MEMJoin inside REQuery. 
It supports associative query between fact data in Druid and dimension data in Elasticsearch. And it also provides virtual cube query for upper-layer business, avoiding the exposure of complex cross-engine management and query logic.</p><p><img loading="lazy" alt="page_1" src="https://cdnd.selectdb.com/assets/images/page_1-9e4af3275a17b4c1c893caa7c6f7290b.png" width="709" height="698" class="img_ev3q"></p><h1>3 New Architecture Based on Apache Doris</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="31-problems-remained">3.1 Problems Remained<a href="#31-problems-remained" class="hash-link" aria-label="Direct link to 3.1 Problems Remained" title="Direct link to 3.1 Problems Remained"></a></h2><p>First, we came across a problem when we built the report engine. Mem Join is single-machine with serial execution. When the amount of data pulled from Elasticsearch exceeds 100,000 at a single time, the response time is close to 10s, and the user experience is poor. Moreover, using a single node to execute large-scale data Join will consume a lot of memory, causing Full GC.</p><p>Second, Druid&#x27;s Lookup Join function is not so perfect, which is a big problem, and it cannot fully meet our business needs.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="32-database-research">3.2 Database Research<a href="#32-database-research" class="hash-link" aria-label="Direct link to 3.2 Database Research" title="Direct link to 3.2 Database Research"></a></h2><p>So we conducted a survey on popular OLAP databases in the industry, the most representative of which are Apache Doris and Clickhouse. We found out that Apache Doris is more capable of Join between large and wide tables. ClickHouse can support Broadcast memory-based Join, but the performance is not good for the Join between large and wide tables with a large data volume. Both Doris and Clickhouse support detailed data storage, but the capability for concurrency of Clickhouse is low. 
On the contrary, Doris supports high-concurrency and low-latency query services, and a single machine supports up to thousands of QPS. When the concurrency increases, horizontal expansion of FE and BE can be supported. However, Clickhouse&#x27;s data import is not able to support Transaction SQL, which cannot realize Exactly-once semantics and has limited ability for standard SQL. In contrast, Doris provides Transaction SQL and atomicity for data import. Doris itself can ensure that messages in Kafka are not lost or re-subscribed, which is to say, Exactly-Once semantics is supported. ClickHouse has high learning cost, high operation and maintenance costs, and is weak in distribution. The fact that it requires more customization and deeper technical strength is another problem. Doris is different. There are only two core components, FE and BE, and there are fewer external dependencies. We also found that because Doris is closer to the MySQL protocol, it is more convenient than Clickhouse and the cost of migration is not so large. In terms of horizontal expansion, Doris&#x27; expansion and contraction can also achieve self-balancing, which is much better than that of Clickhouse.</p><p>From this point of view, Doris can better improve the performance of Join and is much better in other aspects such as migration cost, horizontal expansion, and concurrency. 
However, Elasticsearch has inherent advantages in high-frequency Update.</p><p>It would be an ideal solution to deal with high-frequency Update and Join performance at the same time by building engines through Doris on Elasticsearch.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="33-good-choice-doris-on-elasticsearch">3.3 Good Choice: Doris on Elasticsearch<a href="#33-good-choice-doris-on-elasticsearch" class="hash-link" aria-label="Direct link to 3.3 Good Choice: Doris on Elasticsearch" title="Direct link to 3.3 Good Choice: Doris on Elasticsearch"></a></h2><p>What is the query performance of Doris on Elasticsearch?</p><p>First of all, Apache Doris is a real-time analytical database based on MPP architecture, with strong performance and strong horizontal expansion capability. Doris on Elasticsearch takes advantage of this capability and does a lot of query optimization. Secondly, after integrating Elasticsearch, we have also made a lot of optimizations to the query:</p><ul><li>Shard-level concurrency</li><li>Automatic adaptation of row and column scanning, priority to column scanning</li><li>Sequential read, terminated early</li><li>Two-phase query becomes one-phase query</li><li>Broadcast Join is especially friendly for small batch data</li></ul><p><img loading="lazy" alt="page_2" src="https://cdnd.selectdb.com/assets/images/page_2-a916fe2ffe5eeae0b166d30cfe8d8e42.png" width="890" height="1032" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="34-doris-on-elasticsearch">3.4 Doris on Elasticsearch<a href="#34-doris-on-elasticsearch" class="hash-link" aria-label="Direct link to 3.4 Doris on Elasticsearch" title="Direct link to 3.4 Doris on Elasticsearch"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="341-data-link-upgrade">3.4.1 Data Link Upgrade<a href="#341-data-link-upgrade" class="hash-link" aria-label="Direct link to 3.4.1 Data Link Upgrade" title="Direct link to 3.4.1 Data Link Upgrade"></a></h3><p>The upgrade of the 
data link is relatively simple. In the first step, in Doris we build a new Olap table and configure the materialized view. Second, the routine load is initiated based on the Kafka topic of the previous fact data, and then real-time data is ingested. The third step is to ingest offline data from Hive&#x27;s broker load. The last step is to create an Elasticsearch external table through Doris.</p><p><img loading="lazy" alt="page_3" src="https://cdnd.selectdb.com/assets/images/page_3-2f23fe1184980f690da326e4446fd7f7.png" width="1313" height="1265" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="342-upgrades-of-the-report-engine">3.4.2 Upgrades of the Report Engine<a href="#342-upgrades-of-the-report-engine" class="hash-link" aria-label="Direct link to 3.4.2 Upgrades of the Report Engine" title="Direct link to 3.4.2 Upgrades of the Report Engine"></a></h3><p><img loading="lazy" alt="page_4" src="https://cdnd.selectdb.com/assets/images/page_4-f9c9b95ac997f1d8f09fb5fe182c368f.png" width="1274" height="895" class="img_ev3q"></p><p>Note: The MySQL dim-table associated above is based on future planning. 
Currently, Elasticsearch is mainly used as the dim-table engine</p><p>Report Engine Adaptation</p><ul><li>Generate virtual cube table based on Doris&#x27;s star schema</li><li>Adapt to cube table query analysis, intelligent Push-down</li><li>Gray Release</li></ul><h1>4 Online Performance</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="41-fact-table-query-performance-comparison">4.1 Fact Table Query Performance Comparison<a href="#41-fact-table-query-performance-comparison" class="hash-link" aria-label="Direct link to 4.1 Fact Table Query Performance Comparison" title="Direct link to 4.1 Fact Table Query Performance Comparison"></a></h2><p>Druid</p><p><img loading="lazy" alt="page_5" src="https://cdnd.selectdb.com/assets/images/page_5-8e598f4abd11de7482c1a9dcc0747641.png" width="935" height="276" class="img_ev3q"></p><p>Doris</p><p><img loading="lazy" alt="page_6" src="https://cdnd.selectdb.com/assets/images/page_6-7747547b14b4dbce6b2ee99fde03ab16.png" width="959" height="291" class="img_ev3q"></p><p>99th percentile of response time:
Druid: 270ms, Doris: 150ms, which is reduced by 45%</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="42-comparison-of-cube-table-query-performance-in-join">4.2 Comparison of Cube Table Query Performance in Join<a href="#42-comparison-of-cube-table-query-performance-in-join" class="hash-link" aria-label="Direct link to 4.2 Comparison of Cube Table Query Performance in Join" title="Direct link to 4.2 Comparison of Cube Table Query Performance in Join"></a></h2><p>Druid</p><p><img loading="lazy" alt="page_7" src="https://cdnd.selectdb.com/assets/images/page_7-46c2a88aabf031ee764884d78837880f.png" width="987" height="316" class="img_ev3q"></p><p>Doris</p><p><img loading="lazy" alt="page_8" src="https://cdnd.selectdb.com/assets/images/page_8-cc75cc3a5ced01182cac415175d4048a.png" width="950" height="291" class="img_ev3q"></p><p>99th percentile of response time:
Druid: 660ms, Doris: 440ms, which is reduced by 33%</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="43-benefits">4.3 Benefits<a href="#43-benefits" class="hash-link" aria-label="Direct link to 4.3 Benefits" title="Direct link to 4.3 Benefits"></a></h2><ul><li>The overall time consumption of 99 percentile is reduced by about 35%</li><li>Resource saving about 50%</li><li>Remove the complex logic of MemJoin from the report engine; Realize through DO(in the case of large query: dim-table results exceed 100,000, performance improvement exceeds 10 times, 10s to 1s)</li><li>Richer query semantics (Mem Join is relatively simple and does not support complex queries)</li></ul><h1>5 Summary and Plans</h1><p>In Kwai&#x27;s commercial business, Join queries between dimension data and fact data are very common. After using Doris, query becomes simple. We only need to synchronize the fact table and dim-table on a daily basis and Join while querying. By replacing Druid and Clickhouse with Doris, Doris basically covers all scenarios when we use Druid. In this way, Kwai&#x27;s commercial report engine greatly improves the aggregation and analysis capabilities of massive data. During the use of Apache Doris, we also found some unexpected benefits: For example, the import method of Routine Load and Broker Load is relatively simple, which improves the query speed; The data occupation is greatly reduced; Doris supports the MySQL protocol, which is much easier for data analysts to fetch data and make charts.</p><p>Although Doris on Elasticsearch has fully met our requirements, Elasticsearch external table still requires manual creation. However, Apache Doris recently released the latest version V1.2.0. The new version has added Multi-Catalog, which provides the ability to seamlessly access external table sources such as Hive, Elasticsearch, Hudi, and Iceberg. 
Users can connect to external tables through the CREATE CATALOG command, and Doris will automatically map the library and table information of the external table. In this way, we don&#x27;t need to manually create the Elasticsearch external tables to complete the mapping in the future, which greatly saves us time and cost of development and improves the efficiency of research and development. The power of other new functions such as Vectorization and Light Schema Change also gives us new expectations for Apache Doris. Bless Apache Doris!</p><h1>Contact Us</h1><p>Apache Doris Website:<a href="http://doris.apache.org" target="_blank" rel="noopener noreferrer">http://doris.apache.org</a></p><p>Github:<a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">https://github.com/apache/doris</a></p><p>Dev Email:<a href="mailto:dev@doris.apache.org" target="_blank" rel="noopener noreferrer">dev@doris.apache.org</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/xiaomi_vector">Practice and optimization of Apache Doris in Xiaomi</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">ZuoWei</span></span><time datetime="2022-12-08T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 8, 2022</time></div></header><div class="markdown" itemprop="articleBody"><blockquote><p>Guide: Xiaomi Group introduced Apache Doris in 2019. 
At present, Apache Doris has been widely used in dozens of business departments within Xiaomi. A set of data ecology with Apache Doris has been formed. This article is transcribed from an online meetup speech of the Doris community, aiming to share the practice of Apache Doris in Xiaomi.</p></blockquote><blockquote><p>Author: ZuoWei, OLAP Engineer, Xiaomi</p></blockquote><p><img loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-b27d71e34981d9850785329cea2cb610.png" width="900" height="383" class="img_ev3q"></p><h1>About Xiaomi</h1><p><a href="https://www.mi.com/global" target="_blank" rel="noopener noreferrer">Xiaomi Corporation</a> (“Xiaomi” or the “Group”; HKG:1810), a consumer electronics and smart manufacturing company with smartphones and smart hardware connected by an Internet of Things (IoT) platform. In 2021, Xiaomi&#x27;s total revenue amounted to RMB328.3 billion(USD472,231,316,200), an increase of 33.5% year-over-year; Adjusted net profit was RMB22.0 billion(USD3,164,510,800), an increase of 69.5% year-over-year.</p><p>Due to the growing need of data analysis, Xiaomi Group introduced Apache Doris in 2019. As one of the earliest users of Apache Doris, Xiaomi Group has been deeply involved in the open-source community. After three years of development, Apache Doris has been widely used in dozens of business departments within Xiaomi, such as Advertising, New Retail, Growth Analysis, Dashboards, UserPortraits, <a href="https://airstar.com/home" target="_blank" rel="noopener noreferrer">AISTAR</a>, <a href="https://www.xiaomiyoupin.com" target="_blank" rel="noopener noreferrer">Xiaomi Youpin</a>. Within Xiaomi, a data ecosystem has been built around Apache Doris. 
</p><p><img loading="lazy" alt="page_1" src="https://cdnd.selectdb.com/assets/images/page_1-93afbd2f90769776af3083bc49fbf8dd.jpg" width="1135" height="661" class="img_ev3q"></p><p>At present, Apache Doris already has dozens of clusters in Xiaomi, with an overall scale of hundreds of virtual machines. Among them, the largest single cluster reaches nearly 100 nodes, with dozens of real-time data synchronization tasks. And the largest daily increment of a single table rockets to 12 billion, supporting PB-level storage. And a single cluster can support more than 20,000 multi-dimensional analysis queries per day.</p><h1>Architecture Evolution</h1><p>The original intention of Xiaomi to introduce Apache Doris is to solve the problems encountered in user behavior analysis. With the development of Xiaomi&#x27;s Internet business, the demand for growth analysis using user behavior data is becoming stronger and stronger. If each business branch builds its own growth analysis system, it will not only be costly, but also inefficient. Therefore, if there is a product that can help them stop worrying about underlying complex technical details, it would be great to have relevant business personnel focus on their own technical work. In this way, it can greatly improve work efficiency. Therefore, Xiaomi Big Data and the cloud platform jointly developed the growth analysis system called Growing Analytics (referred to as GA), which aims to provide a flexible multi-dimensional real-time query and analysis platform, which can manage data access and query solutions in a unified way, and help business branches to refine operation.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="previous-architecture">Previous Architecture<a href="#previous-architecture" class="hash-link" aria-label="Direct link to Previous Architecture" title="Direct link to Previous Architecture"></a></h2><p>The growth analysis platform project was established in mid-2018. 
At that time, based on the consideration of development time and cost, Xiaomi reused various existing big data basic components (HDFS, Kudu, SparkSQL, etc.) to build a growth analysis query system based on Lambda architecture. The architecture of the first version of the GA system is shown in the figure below, including the following aspects:</p><ul><li>Data Source: The data source is the front-end embedded data and user behavior data.</li><li>Data Access: The event tracking data is uniformly cleaned and ingested into Xiaomi&#x27;s internal self-developed message queue, and the data is imported into Kudu through Spark Streaming.</li><li>Storage: Separate hot and cold data in the storage layer. Hot data is stored in Kudu, and cold data is stored in HDFS. At the same time, partitioning is carried out in the storage layer. When the partition unit is day, part of the data will be cooled and stored on HDFS every night.</li><li>Compute and Query: In the query layer, use SparkSQL to perform federated queries on the data on Kudu and HDFS, and finally display the query results on the front-end page.</li></ul><p><img loading="lazy" alt="page_2" src="https://cdnd.selectdb.com/assets/images/page_2-db57a1a2eadb0f1c787f440a26358339.jpg" width="1159" height="683" class="img_ev3q"></p><p>At that time, the first version of the growth analysis platform helped us solve a series of problems in the user operation process, but there were also two problems:</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="problem-no1-scattered-components">Problem No.1: Scattered components<a href="#problem-no1-scattered-components" class="hash-link" aria-label="Direct link to Problem No.1: Scattered components" title="Direct link to Problem No.1: Scattered components"></a></h3><p>Since the historical architecture is based on the combination of SparkSQL + Kudu + HDFS, too many dependent components lead to high operation and maintenance costs. 
The original design is that each component uses the resources of the public cluster, but in practice, it is found that during the execution of the query job, the query performance is easily affected by other jobs in the public cluster, and query jitter is prone to occur, especially when reading data from the HDFS public cluster, which is sometimes slower.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="problem-no2-high-resource-consumption">Problem No.2: High resource consumption<a href="#problem-no2-high-resource-consumption" class="hash-link" aria-label="Direct link to Problem No.2: High resource consumption" title="Direct link to Problem No.2: High resource consumption"></a></h3><p>When querying through SparkSQL, the latency is relatively high. SparkSQL is a query engine designed based on a batch processing system. In the process of exchanging data shuffle between each stage, it still needs to be placed on the disk, and the delay in completing the SQL query is relatively high. In order to ensure that SQL queries are not affected by resources, we ensure query performance by adding machines. However, in practice, we find that there is limited room for performance improvement. This solution cannot make full use of machine resources to achieve efficient queries, resulting in a certain waste of resources.</p><p>In response to the above two problems, our goal is to seek an MPP database that integrates computing and storage to replace our current storage and computing layer components. 
After technical selection, we finally decided to use Apache Doris to replace the older generation of historical architecture.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="new-choice">New Choice<a href="#new-choice" class="hash-link" aria-label="Direct link to New Choice" title="Direct link to New Choice"></a></h2><p>Popular MPP-based query engines such as Impala and Presto, can efficiently support SQL queries, but they still need to rely on Kudu, HDFS, Hive Metastore and other storage system, which increase the operation and maintenance costs. At the same time, due to the separation of storage and compute, the query engine cannot easily find the data changes in the storage layer, resulting in bad performance in detailed query optimization. If you want to cache at the SQL layer, you cannot guarantee that the query results are up-to-date.</p><p>Apache Doris is a top-level project of the Apache Foundation. It is mainly positioned as a high-performance, real-time analytical database, and is mainly used to solve reports and multi-dimensional analysis. It integrates Google Mesa and Cloudera Impala technologies. We conducted an in-depth performance tests on Doris and communicated with the community many times. And finally, we determined to replace the previous computing and storage components with Doris. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="new-architecture-based-on-apache-doris">New Architecture Based on Apache Doris<a href="#new-architecture-based-on-apache-doris" class="hash-link" aria-label="Direct link to New Architecture Based on Apache Doris" title="Direct link to New Architecture Based on Apache Doris"></a></h2><p>The new architecture obtains event tracking data from the data source. Then data is ingested into Apache Doris. Query results can be directly displayed in the applications. 
In this way, Doris has truly realized the unification of computing, storage, and resource management tools.</p><p><img loading="lazy" alt="page_3" src="https://cdnd.selectdb.com/assets/images/page_3-30c8cb46f4d289fa768e9a364779bc69.jpg" width="1149" height="674" class="img_ev3q"></p><p>We chose Doris because:</p><ul><li>Doris has excellent query performance and can meet our business needs.</li><li>Doris supports standard SQL, and the learning cost is low.</li><li>Doris does not depend on other external components and is easy to operate and maintain.</li><li>The Apache Doris community is very active and friendly, crowded with contributors. It is easier for further versions upgrades and convenient for maintenance.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="query-performance-comparision-between-apache-doris--spark-sql">Query Performance Comparision between Apache Doris &amp; Spark SQL<a href="#query-performance-comparision-between-apache-doris--spark-sql" class="hash-link" aria-label="Direct link to Query Performance Comparision between Apache Doris &amp; Spark SQL" title="Direct link to Query Performance Comparision between Apache Doris &amp; Spark SQL"></a></h2><p>Note: The comparison is based on Apache Doris V0.13</p><p><img loading="lazy" alt="page_4" src="https://cdnd.selectdb.com/assets/images/page_4-3e71f2a8753e49f5a73bea4bb628fbbf.jpg" width="1242" height="1000" class="img_ev3q"></p><p>We selected a business model with an average daily data volume of about 1 billion, and conducted performance tests on Doris in different scenarios, including 6 event analysis scenarios, 3 retention analysis scenarios, and 3 funnel analysis scenarios. 
After comparing it with the previous architecture(SparkSQL+Kudu+HDFS), we found out:</p><ul><li>In the event analysis scenario, the average query time was reduced by 85%.</li><li>In the scenarios of retention analysis and funnel analysis, the average query time was reduced by 50%.</li></ul><h1>Real Practice</h1><p>Below we will introduce our experience of data import, data query, A/B test in the business application of Apache Doris.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-import">Data Import<a href="#data-import" class="hash-link" aria-label="Direct link to Data Import" title="Direct link to Data Import"></a></h2><p><img loading="lazy" alt="page_5" src="https://cdnd.selectdb.com/assets/images/page_5-010f8edce4b736817d68815f31e52fd7.jpg" width="1130" height="667" class="img_ev3q"></p><p>Xiaomi writes data into Doris mainly through Stream Load, Broker Load and a small amount of data by Insert. Usually data is generally ingested into the message queue first, which is divided into real-time and offline data.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="how-to-write-real-time-data-into-apache-doris">How to write real-time data into Apache Doris:<a href="#how-to-write-real-time-data-into-apache-doris" class="hash-link" aria-label="Direct link to How to write real-time data into Apache Doris:" title="Direct link to How to write real-time data into Apache Doris:"></a></h3><p>After part of real-time data processed by Flink, they will be ingested into Doris through Flink-Doris-Connector provided by Apache Doris. The rest of the data is ingested through Spark Streaming. 
The bottom layer of these two writing approaches both rely on the Stream Load provided by Apache Doris.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="how-to-write-offline-data-into-apache-doris">How to write offline data into Apache Doris:<a href="#how-to-write-offline-data-into-apache-doris" class="hash-link" aria-label="Direct link to How to write offline data into Apache Doris:" title="Direct link to How to write offline data into Apache Doris:"></a></h3><p>After offline data is partially ingested into Hive, they will be ingested into Doris through Xiaomi&#x27;s data import tool. Users can directly submit Broker Load tasks to the Xiaomi&#x27;s data import tool and import data directly into Doris, or import data through Spark SQL, which relies on the Spark-Doris-Connector provided by Apache Doris. Spark Doris Connector is actually the encapsulation of Stream Load.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="data-qurey">Data Query<a href="#data-qurey" class="hash-link" aria-label="Direct link to Data Query" title="Direct link to Data Query"></a></h2><p><img loading="lazy" alt="page_6" src="https://cdnd.selectdb.com/assets/images/page_6-14cf1592d25e4b6e4cc275e06c2e6673.jpg" width="1120" height="638" class="img_ev3q"></p><p>Users can query after data import is done. Inside Xiaomi, we query through our data platform. Users can perform visual queries on Doris through Xiaomi&#x27;s data platform, and conduct user behavior analysis and user portrait analysis. In order to help our teams conduct event analysis, retention analysis, funnel analysis, path analysis and other behavioral analysis, we have added corresponding UDF (User Defined Function) and UDAF (User Defined Aggregate Function) to Doris.</p><p>In the upcoming version 1.2, Apache Doris adds the function of synchronizing metadata through external table, such as Hive/Hudi/Iceberg and Multi Catalog tool. 
External table query improves performance, and the ability to access external tables greatly increases ease of use. In the future, we will consider querying Hive and Iceberg data directly through Doris, which builds an architecture of datalake.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ab-test">A/B Test<a href="#ab-test" class="hash-link" aria-label="Direct link to A/B Test" title="Direct link to A/B Test"></a></h2><p>In real business, the A/B test is a method of comparing two versions of strategies against each other to determine which one performs better. A/B test is essentially an experiment where two or more variants of a page are shown to users at random, followed by statistical analysis. It is a popular approach used to determine which variation performs better for a given conversion goal. Xiaomi&#x27;s A/B test platform is an operation tool product that conducts the A/B test with experimental grouping, traffic splitting, and scientific evaluation to assist in decision making. Xiaomi&#x27;s A/B test platform has several query applications: user deduplication, indicator summation, covariance calculation, etc. The query types will involve Count (distinct), Bitmap, Like, etc.</p><p>Apache Doris also provides services to Xiaomi&#x27;s A/B test platform. Every day, Xiaomi&#x27;s A/B test platform needs to process a tremendous amount of data with billions of queries. That&#x27;s why Xiaomi&#x27;s A/B test platform is eager to improve the query performance. </p><p>Apache Doris V1.1 released just in time and has fully supported vectorization in the processing and storage. Compared with the non-vectorized version, the query performance has been significantly improved. It is time to update Xiaomi&#x27;s Doris cluster to the latest version. 
That&#x27;s why we first launched the latest vectorized version of Doris on Xiaomi&#x27;s A/B test platform.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="test-before-launch">Test before Launch<a href="#test-before-launch" class="hash-link" aria-label="Direct link to Test before Launch" title="Direct link to Test before Launch"></a></h2><p>Note: The following tests are based on Apache Doris V1.1.2</p><p>We built a test cluster for Apache Doris V1.1.2, which is as big as that of the Xiaomi online Apache Doris V0.13 version, to test before the vectorization version goes online. The test is divided into two aspects: single SQL parrellel query test and batch SQL concurrent query test.</p><p>The configurations of the two clusters are exactly the same, and the specific configuration information is as follows:</p><ul><li>Scale: 3 FEs + 89 virtual machines</li><li>CPU: Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz 16 cores 32 threads × 2</li><li>Memory: 256GB</li><li>Disk: 7.3TB × 12 HDD</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="single-sql-parrellel-query-test">Single SQL Parrellel Query Test<a href="#single-sql-parrellel-query-test" class="hash-link" aria-label="Direct link to Single SQL Parrellel Query Test" title="Direct link to Single SQL Parrellel Query Test"></a></h3><p>We choose 7 classic queries in the Xiaomi A/B test. For each query, we limited the time range to 1 day, 7 days, and 20 days for testing, where the daily partition data size is about 3.1 billion (the data volume is about 2 TB). 
The test results are shown in the figures:</p><p><img loading="lazy" alt="page_7" src="https://cdnd.selectdb.com/assets/images/page_7-b41817232fb711c583332d813de7f684.jpg" width="750" height="450" class="img_ev3q"></p><p><img loading="lazy" alt="page_8" src="https://cdnd.selectdb.com/assets/images/page_8-c8e10196ce6917449e8372205333f12c.jpg" width="750" height="450" class="img_ev3q"></p><p><img loading="lazy" alt="page_9" src="https://cdnd.selectdb.com/assets/images/page_9-cfbcd21a8b00a3b50508251b78ebd163.jpg" width="750" height="450" class="img_ev3q"></p><p>The Apache Doris V1.1.2 has at least 3~5 times performance improvement compared to the Xiaomi online Doris V0.13, which is remarkable.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="optimization">Optimization<a href="#optimization" class="hash-link" aria-label="Direct link to Optimization" title="Direct link to Optimization"></a></h2><p>Note: The following tests are based on Apache Doris V1.1.2</p><p>Based on Xiaomi&#x27;s A/B test business data, we tuned Apache Doris V1.1.2 and conducted concurrent query tests on the tuned Doris V1.1.2 and Xiaomi&#x27;s online Doris V0.13. The test results are as follows.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="optimization-in-test-1">Optimization in Test 1<a href="#optimization-in-test-1" class="hash-link" aria-label="Direct link to Optimization in Test 1" title="Direct link to Optimization in Test 1"></a></h3><p>We choose user deduplication, index summation, and covariance calculation query(the total number of SQL is 3245) in the A/B test to conduct concurrent query tests on the two versions. The single-day partition data of the table is about 3.1 billion (the amount of data is about 2 TB) and the query will be based on the latest week&#x27;s data. 
The test results are shown in the figures:</p><p><img loading="lazy" alt="page_10" src="https://cdnd.selectdb.com/assets/images/page_10-98057ca75a1689b6c6eb9932cdd5e841.jpg" width="1080" height="338" class="img_ev3q"></p><p>Compared with Apache Doris V0.13, the overall average latency of Doris V1.1.2 is reduced by about 48%, and the P95 latency is reduced by about 49%. In this test, the query performance of Doris V1.1.2 was nearly doubled compared to Doris V0.13.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="optimization-in-test-2">Optimization in Test 2<a href="#optimization-in-test-2" class="hash-link" aria-label="Direct link to Optimization in Test 2" title="Direct link to Optimization in Test 2"></a></h3><p>We choose 7 A/B test reports to test the two versions. Each A/B test report is corresponded to two modules in Xiaomi A/B test platform and each module represents thousands of SQL query. Each report submits query tasks to the cluster where the two versions reside at the same concurrency. The test results are shown in the figure:</p><p><img loading="lazy" alt="page_11" src="https://cdnd.selectdb.com/assets/images/page_11-bbf60c474aaea1a007b5b413d6bad77a.jpg" width="750" height="450" class="img_ev3q"></p><p>Compared with Doris V0.13, Doris V1.1.2 reduces the overall average latency by around 52%. In the test, the query performance of Doris V1.1.2 version was more than 1 time higher than that of Doris V0.13. </p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="optimization-in-test-3">Optimization in Test 3<a href="#optimization-in-test-3" class="hash-link" aria-label="Direct link to Optimization in Test 3" title="Direct link to Optimization in Test 3"></a></h3><p>To verify the performance of the tuned Apache Doris V1.1.2 in other cases, we choose the Xiaomi user behavior analysis to conduct concurrent query performance tests of Doris V1.1.2 and Doris V0.13. We choose behavior analysis query for 4 days on October 24, 25, 26 and 27, 2022. 
The test results are shown in the figures:</p><p><img loading="lazy" alt="page_12" src="https://cdnd.selectdb.com/assets/images/page_12-58242671fba5bbf25225b4d9d9f6d87c.jpg" width="1080" height="338" class="img_ev3q"></p><p>Compared with Doris V0.13, the overall average latency of Doris V1.1.2 has been reduced by about 77%, and the P95 latency has been reduced by about 83%. In this test, the query performance of Doris V1.1.2 version is 4~6 times higher than that of Doris V0.13.</p><h1>Conclusion</h1><p>Since we adopted Apache Doris in 2019, Apache Doris has currently served dozens of businesses and sub-brands within Xiaomi, with dozens of clusters and hundreds of nodes. It completes more than 10,000 user online analysis queries every day and is responsible for most of the online analysis in Xiaomi.</p><p>After performance test and tuning, Apache Doris V1.1.2 has met the launch requirements of the Xiaomi A/B test platform and does well in query performance and stability. In some cases, it even exceeds our expectations, such as the overall average latency being reduced by about 77% in our tuned version.</p><p>Meanwhile, some of the functions mentioned above have been released in Apache Doris V1.0 or V1.1, and some PRs have been merged into the community Master Fork and should be released soon. Recently the activity of the community has been greatly enhanced. We are glad to see that Apache Doris has become more and more mature, and stepped forward to an integrated datalake. 
We truly believe that in the future, more data analysis will be explored and realized within Apache Doris.</p><h1>Contact Us</h1><p>Apache Doris Website:<a href="http://doris.apache.org" target="_blank" rel="noopener noreferrer">http://doris.apache.org</a></p><p>Github Homepage:<a href="https://github.com/apache/doris" target="_blank" rel="noopener noreferrer">https://github.com/apache/doris</a></p><p>Email to DEV:<a href="mailto:dev@doris.apache.org" target="_blank" rel="noopener noreferrer">dev@doris.apache.org</a></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/JD_OLAP">JD.com&#x27;s exploration and practice with Apache Doris in real time OLAP</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Li Zhe</span></span><time datetime="2022-12-02T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">December 2, 2022</time></div></header><div class="markdown" itemprop="articleBody"><p><img loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-e94fd46c1522a3383d161daec2249d18.png" width="900" height="383" class="img_ev3q"></p><blockquote><p>Guide:
This article discusses the exploration and practice of the search engine team in JD.com using Apache Flink and Apache Doris in real-time data analysis. The popularity of stream computing is increasing day by day: More papers are published on Google Dataflow; Apache Flink has become the one of the most popular engine in the world; There is wide application of real-time analytical databases more than ever before, such as Apache Doris; Stream computing engines are really flourishing. However, no engine is perfect enough to solve every problem. It is important to find a suitable OLAP engine for the business. We hope that JD.com&#x27;s practice in real-time OLAP and stream computing may give you some inspiration.</p></blockquote><blockquote><p>Author: Li Zhe, data engineer of JD.com, who focused on offline data, stream computing and application development.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="about-jdcom">About JD.com<a href="#about-jdcom" class="hash-link" aria-label="Direct link to About JD.com" title="Direct link to About JD.com"></a></h2><p>JD.com (NASDAQ: JD), a leading e-commerce company in China, had a net income of RMB 951.6 billion in 2021. JD Group owns JD Retail, JD Global, JD Technology, JD Logistics, JD Cloud, etc. Jingdong Group was officially listed on the NASDAQ Stock Exchange in May 2014.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="jd-search-boxs-requirement-real-time-data-analysis">JD Search Box&#x27;s Requirement: Real-time Data Analysis<a href="#jd-search-boxs-requirement-real-time-data-analysis" class="hash-link" aria-label="Direct link to JD Search Box&#x27;s Requirement: Real-time Data Analysis" title="Direct link to JD Search Box&#x27;s Requirement: Real-time Data Analysis"></a></h2><p>JD search box, as the entrance of the e-commerce platform, provides a link betwee merchants and users. Users can express their needs through the search box. 
In order to better understand user intentions and quickly improve the conversion rate, multiple A/B tests are running online at the same time, which apply to multiple products. The category, organization, and brand all need to be monitored online for better conversion. At present, the real-time data that JD search box demands mainly includes three parts:</p><ol><li>The overall data of JD search box.</li><li>Real-time monitoring of the A/B test.</li><li>Top list of hot search words to reflect changes in public opinion. Word trends can reflect what users care about.</li></ol><p>The analysis mentioned above needs to refine the data to the SKU-level. At the same time, we also undertake the task of building a real-time data platform to show our business analysts different real-time stream computing data.</p><p>Although different business analysts care about different data granularity, time frequency, and dimensions, we are hoping to establish a unified real-time OLAP data warehouse and provide a set of safe, reliable and flexible real-time data services.</p><p>At present, the newly generated exposure logs every day reach hundreds of millions. The logs will increase by 10 times if they are stored as SKU. And they would grow to billions of records if based on A/B test. Aggregation queries across multiple dimensions require second-level response time. </p><p>Such an amount of data also brings huge challenges to the team: 2 billion rows have been created daily; Up to 60 million rows need to be imported per minute; Data latency should be limited to 1 minute; MDX query needs to be executed within 3 seconds; QPS has reached above 20. 
Yet a new reliable OLAP database with high stability should be able to respond to priority 0 emergency.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="the-evolution-of-the-real-time-architecture">The Evolution of the Real-time Architecture<a href="#the-evolution-of-the-real-time-architecture" class="hash-link" aria-label="Direct link to The Evolution of the Real-time Architecture" title="Direct link to The Evolution of the Real-time Architecture"></a></h2><p>Our previous architecture is based on Apache Storm for a point-to-point data processing. This approach can quickly meet the needs of real-time reports during the stage of rapid business growth in the early days. However, with the continuous development of business, disadvantages gradually appear. For example, poor flexibility, poor data consistency, low development efficiency and increased resource costs.</p><p><img loading="lazy" alt="page_2" src="https://cdnd.selectdb.com/assets/images/page_2-bc63d65e9c203504cbc7900319d0211c.png" width="1684" height="801" class="img_ev3q"></p><p>In order to solve the problems of the previous architecture, we first upgraded the architecture and replaced Apache Storm with Apache Flink to achieve high throughput. At the same time, according to the characteristics of the search data, the real-time data is processed hierarchically, which means the PV data flow, the SKU data flow and the A/B test data flow are created. 
It is expected to build the upper real-time OLAP layer based on the real-time flow.</p><p>When selecting an OLAP database, the following points need to be considered:</p><ol><li>The data latency is at minute-level and the query response time is at second-level</li><li>Supports standard SQL, which reduces the cost of use</li><li>Supports JOIN to facilitate adding dimensions</li><li>Traffic data can be deduplicated approximately, but order data must be exactly deduplicated </li><li>High throughput with tens of millions of records per minute and tens of billions of new records every day</li><li>Query concurrency needs to be high because Front-end may need it</li></ol><p>By comparing the OLAP engines that support real-time import, we made an in-depth comparison among Apache Druid, Elasticsearch, Clickhouse and Apache Doris:</p><p><img loading="lazy" alt="page_3" src="https://cdnd.selectdb.com/assets/images/page_3-578754e222201a65b0601326dc8b298b.png" width="2667" height="778" class="img_ev3q"></p><p>We found out that Doris and Clickhouse can meet our needs. But the concurrency of Clickhouse is low for us, which is a potential risk. Moreover, the data import of Clickhouse has no TRANSACTION and cannot achieve Exactly-once semantics. Clickhouse is not fully supportive of SQL.</p><p>Finally, we chose Apache Doris as our real-time OLAP database. For user behavior log data, we use Aggregation Key data table; As for E-commerce orders data, we use Unique Key data table. Moreover, we split the previous tasks and reuse the logic we tried before. Therefore, when Flink is processing, there will be new topic flow and real-time flow of different granularities generated in DWD. The new architecture is as follows:</p><p><img loading="lazy" alt="page_4" src="https://cdnd.selectdb.com/assets/images/page_4-1f5e1ab38f22766b4ac14b73ee164d59.png" width="3004" height="1571" class="img_ev3q"></p><p>In the current technical architecture, the Flink task is very light. 
Based on the production data detail layer, we directly use Doris to act as the aggregation layer function. And we ask Doris to complete window calculation which previously belongs to Flink. We also take advantage of the routine load to consume real-time data. Although the data is fine-grained before importing, based on the Aggregation Key, asynchronous aggregation will be automatically performed. The degree of aggregation is completely determined by the number of dimensions. By creating Rollup on the base table, double-write or multi-write and pre-aggregate operations are performed during import, which is similar to the function of materialized view, which can highly aggregate data to improve query performance.</p><p>Another advantage of using Kafka to directly connect to Doris at the detail layer is that it naturally supports data backtracking. Data backtracking means that when real-time data is out of order, the &quot;late&quot; data can be recalculated and the previous results can be updated. This is because delayed data can be written to the table whenever it arrives. The final solution is as follows:</p><p><img loading="lazy" alt="page_5" src="https://cdnd.selectdb.com/assets/images/page_5-e8fecc91db2d8fcc3495fb45a0e8e8c2.png" width="1116" height="705" class="img_ev3q"></p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="optimization-during-the-promotion">Optimization during the Promotion<a href="#optimization-during-the-promotion" class="hash-link" aria-label="Direct link to Optimization during the Promotion" title="Direct link to Optimization during the Promotion"></a></h2><p>As mentioned above, we have established Aggregation Key of different granularities in Doris, including PV, SKU, and A/B test granularity. 
Here we take the exposure A/B test model with the largest amount of daily production data as an example to explain how to support the query of tens of billions of records per day during the big promotion period.</p><p>Strategy we used:</p><ul><li>Monitoring: 10, 30, 60 minutes A/B test with indicators, such as exposure PV, UV, exposure SKU pieces, click PV, click UV and CTR.</li><li>Data Modeling: Use exposed real-time data to establish Aggregation Key; And perform HyperLogLog approximate calculation with UV and PV</li></ul><p>Clusters we had:</p><ul><li>30+ virtual machines with storage of NVMe SSD</li><li>40+ partitions exposed by A/B test</li><li>Tens of billions of new data are created every day</li><li>2 Rollups</li></ul><p>Benefits overall:</p><ul><li>Bucket Field can quickly locate tablet partition when querying</li><li>Import 600 million records in 10 minutes</li><li>2 Rollups have relatively low IO, which meet the requirement of the query</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="look-ahead">Look Ahead<a href="#look-ahead" class="hash-link" aria-label="Direct link to Look Ahead" title="Direct link to Look Ahead"></a></h2><p>JD search box introduced Apache Doris in May 2020, with a scale of 30+ BEs, 10+ routine load tasks running online at the same time. Replacing Flink&#x27;s window computing with Doris can not only improve development efficiency, adapt to dimension changes, but also reduce computing resources. Apache Doris provides unified interface services ensuring data consistency and security.
We are also pushing the upgrade of JD search box&#x27;s OLAP platform to the latest version. After upgrading, we plan to use the bitmap function to support accurate deduplication operations of UV and other indicators. In addition, we also plan to use the appropriate Flink window to develop the real-time stream computing of the aggregation layer to increase the richness and completeness of the data.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Netease">Apache Doris helped Netease create a refined operation DMP system</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Xiaodong Liu</span></span><time datetime="2022-11-30T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">November 30, 2022</time></div></header><div class="markdown" itemprop="articleBody"><h1>Apache Doris Helped Netease Create a Refined Operation DMP System</h1><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/kv-a63c2e8908df91d10704f971aa636fa6.png" width="900" height="383" class="img_ev3q"></p><blockquote><p>Guide: Refined operation is a trend of the future Internet, which requires excellent data analysis. In this article, you will get knowledge of: the construction of Netease Lifease&#x27;s DMP system and the application of Apache Doris.</p></blockquote><blockquote><p>Author | Xiaodong Liu, Lead Developer, Netease</p></blockquote><p>Better data analysis enables users to get better experience. 
Currently, the normal analysis method is to build a user tags system to accurately generate user portraits and improve user experience. The topic we shared today is the practice of Netease DMP tags system.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="about-netease-and-lifease">About Netease and Lifease<a href="#about-netease-and-lifease" class="hash-link" aria-label="Direct link to About Netease and Lifease" title="Direct link to About Netease and Lifease"></a></h2><p>NetEase (NASDAQ: NTES) is a leading Internet technology company in China, providing users with free emails, gaming, search engine services, news and entertainment, sports, e-commerce and other services.</p><p>Lifease is Netease&#x27;s self-operated home furnishing e-commerce brand. Its products cover 8 categories in total: home life, apparel, food and beverages, personal care and cleaning, baby products, outdoor sport, digital home appliances, and Lifease&#x27;s Special. In Q1 of 2022, Lifease launches &quot;Pro &quot; membership and other multiple memberships for different users. The number of Pro members has increased by 65% ​​compared with the previous year.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="about-the-dmp-system">About the DMP System<a href="#about-the-dmp-system" class="hash-link" aria-label="Direct link to About the DMP System" title="Direct link to About the DMP System"></a></h2><p>DMP system plays an important role in Lifease&#x27;s data analysis.
The data sources of DMP mainly include:</p><ul><li>Business logs of APPs, H5s, PCs and other terminals</li><li>Basic data constructed within NetEase Group</li><li>Data from products sold by third-party such as JD.com, Alibaba, and Bytedance
Through data collection and data cleaning, the above data is ingested into data assets. Based on these data, DMP has created a system of functions, such as tag creation, grouping and portrait analysis, which supports the business including: intelligent product matching, user engagement, and user insight. In general, the DMP system concentrates on building a data-centric tagging system and portrait system to assist the business.</li></ul><p>You can get basic knowledge of the DMP system starting from the concepts below:</p><ul><li>Tagging: Tagging is one of the user monitoring abilities to uniquely identify individual users across different browsers, devices, and user sessions. This approach to user tagging works by capturing available data in your application&#x27;s page source: age, address, preference and other variables. </li><li>Targeting: Target audience may be dictated by age, gender, income, location, interests or a myriad of other factors.</li><li>User Portrait Analysis: User portrait analysis is to develop user profiles, actions and attributes after targeting audience. For instance, check the behavior paths and consumption models of users whose portraits are &quot;City: Hangzhou, Gender: Female&quot; on Lifease APP.</li></ul><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/1__core_capability-188f05fadbac0c4dfa3574a4e140cb8b.png" width="1153" height="642" class="img_ev3q"></p><p>Llifease&#x27;s tagging system mainly provides two core capabilities: </p><ol><li>Tag Query: the ability to query the specified tag of a specific entity, which is often used to display basic information. </li><li>Targeting Audience: for both real-time and offline targets. Result after targeting is mainly used for:</li></ol><ul><li>As Grouping Criteria: It can be used to tell if the user is in one or more specified groups. This occasionally occurs in scenarios such as advertising and contact marketing. 
</li><li>Resultset Pull: Extract specified data to business system for customized development.</li><li>Portrait Analysis: Analyze the behavioral and consumption models in specific groups of people for more refined operations.</li></ul><p>The overall business process is as follows:</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/2__business_process-ca10e9f507ff8157caa521d0c44d7fc4.png" width="1223" height="662" class="img_ev3q"></p><ul><li>First define the rules for tags and grouping;</li><li>After defining the DSL, the task can be submitted to Spark for processing;</li><li>After the processing is done, the results can be stored in Hive and Doris;</li><li>Data from Hive or Doris can be queried and used according to the actual business needs.</li></ul><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/3__dmp_architecture-82a3358b3eb8794fcff543415248505e.png" width="1197" height="706" class="img_ev3q"></p><p>The DMP platform is divided into four modules: Processing&amp;storage layer, scheduling layer, service layer, and metadata management.
All tag meta-information is stored in the source data table; The scheduling layer schedules tasks for the entire business process: Data processing and aggregation are converted into basic tags, and the data in the basic tags and source tables are converted into something that can be used for data query through SQL; The scheduling layer dispatches tasks to Spark to process, and then stores results in both Hive and Doris. The service layer consists of four parts: tag service, entity grouping service, basic tag data service, and portrait analysis service.</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/4__tag_lifecycle-ec086d95f04379a7f9a10993c0089e63.png" width="1124" height="648" class="img_ev3q"></p><p>The lifecycle of tag consists of 5 phases:</p><ul><li>Tag requirements: At this stage, the operation team demands and the product manager team evaluates the rationality and urgency of the requirements.</li><li>Scheduling production: Developers first sort out the data from ODS to DWD, which is the entire link of DM layer. 
Secondly, they build a model based on data, and at the same time, monitor the production process.</li><li>Targeting Audience: After the tag is produced, group the audience by those tags.</li><li>Precision marketing: Carry out precision marketing strategies for the people grouped by tags.</li><li>Effect evaluation: In the end, tag usage rate and use effect need to be evaluated for future optimization.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="production-of-tags">Production of Tags<a href="#production-of-tags" class="hash-link" aria-label="Direct link to Production of Tags" title="Direct link to Production of Tags"></a></h2><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/5__production_of_tags-a53f5f1d2e03dc74f8d0e69092e4bd02.png" width="1145" height="675" class="img_ev3q"></p><p>Tag data layering:</p><ul><li>The bottom layer is the ODS layer, including user login logs, event tracking records, transaction data, and Binlog data of various databases</li><li>The data processed by the ODS layer, such as user login table, user activity table and order information table reaches the DWD detail layer</li><li>The DWD layer data is aggregated to the DM layer and the tags are all implemented based on the DM layer data.
At present, we have fully automated the data output from the original database to the ODS layer. And we also realized partial automation from the ODS layer to the DWD layer. And there are a small number of automated operations from the DWD to the DM layer, which will be our focus in the future.</li></ul><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/6__type_of__tags-91b30c2315a91d57aa96017a4ec716eb.png" width="1154" height="677" class="img_ev3q"></p><p>Tags are devided based on timeliness: offline tags, quasi-real-time tags and real-time tags. According to the scale of data, it is divided into: aggregation tags and detail tags. In other cases, tags can also be divided into: account attribute tags, consumption behavior tags, active behavior tags, user preference tags, asset information tags, etc. </p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/7__tags_settings-8a8c1c99a4afbc7f78ceb4659da2c184.png" width="1163" height="672" class="img_ev3q"></p><p>It is inconvenient to use the data of the DM layer directly because the basic data is relatively primitive. The abstraction level is lacking and it is not easy to use. By combining basic data with AND, OR, and NOT, business tags are formed for further use, which can reduce the cost of understanding operations and make it easier to use.</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/8__target_audience-cfe11c32b47db0639303f640a3452d98.png" width="1161" height="696" class="img_ev3q"></p><p>After the tags are merged, it is necessary to apply the tags to specific business scenarios, such as grouping. The configuration is shown on the left side of the figure above, which supports offline crowd packages and real-time behaviors (need to be configured separately). 
After configuration, generate the DSL rules shown on the right side of the figure above, expressed in Json format, which is more friendly to FE, and can also be converted into query statements of the datebase engine.</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/9__target_audience-mapping-1b00b571d178577b4f0c4f2c8a5b1acf.png" width="1120" height="649" class="img_ev3q"></p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/10__automation-fe72dc6c87f37fdd94f217a9174706bd.png" width="1114" height="649" class="img_ev3q"></p><p>Tagging is partially automated. The degree of automation in grouping is relatively high. For example, group refresh can be done regularly every day; Advanced processing, such as intersection/merge/difference between groups; Data cleaning means timely cleaning up expired and invalid data.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="tags-storage">Tags Storage<a href="#tags-storage" class="hash-link" aria-label="Direct link to Tags Storage" title="Direct link to Tags Storage"></a></h2><p>Lifease&#x27;s DMP labeling system needs to carry relatively large customer end traffic, and has relatively high requirements for real-time performance. Our storage requirements include:</p><ul><li>Need support high-performance query to deal with large-scale customer end traffic</li><li>Need support SQL to facilitate data analysis scenarios</li><li>Need support data update mechanism</li><li>Can store large amount of data</li><li>Need support for extension functions to handle custom data structures</li><li>Closely integrated with big data ecology</li></ul><p>In the field of big data, multiple engines vary in different applicable scenarios. 
We used the popular engines in the chart below to optimize our database architecture for 2 times.</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/11__comparision-dd0d69a571e362dcca7711561a30db7c.png" width="1133" height="660" class="img_ev3q"></p><p>Our architecture V1.0 is shown below:</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/12__architecture_v1_0-59dffe2265ac0754860a4bc796c090fa.png" width="1175" height="695" class="img_ev3q"></p><p>Most of the offline data is stored in Hive while a small part is stored in Hbase (mainly used for querying basic tags). Part of the real-time data is stored in Hbase for basic tags query and the rest is double-written into KUDU and Elasticsearch for real-time grouping and data query. The data offline is processed by Impala and cached in Redis.
Disadvantages :</p><ul><li>Too many database engines.</li><li>Double writing has hidden problems with data quality. One side may succeed while the other side fails, resulting in data inconsistency.</li><li>The project is complex and maintainability is poor.
In order to reduce the usage of engine and storage, we improved and implemented version 2.0 :</li></ul><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/13__architecture_v2_0-1f1c2b508793cf146a606b3a453e01a5.png" width="1148" height="677" class="img_ev3q"></p><p>In storage architecture V2.0, Apache Doris is adopted. Offline data is mainly stored in Hive. At the same time, basic tags are imported into Doris, and real-time data as well. The query federation of Hive and Doris is performed based on Spark, and the results are stored in Redis. After this improvement, an storage engine which can manages offline and real-time data has been created. We are currently use Apache Doris 1.0, which enables : 1. The query performance can be controlled within 20ms at 99% 2. The query performance can be controlled within 50ms at 99.9%. Now the architecture is simplified, which greatly reduces operation and maintenance costs.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="advantages-of-apache-doris-in-practice">Advantages of Apache Doris in Practice<a href="#advantages-of-apache-doris-in-practice" class="hash-link" aria-label="Direct link to Advantages of Apache Doris in Practice" title="Direct link to Advantages of Apache Doris in Practice"></a></h2><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/14__advantages_in_practice-3fc1c9893383a6635c8c9612e3ef0a15.png" width="1128" height="658" class="img_ev3q"></p><p>Lifeuse has adopted Apache Doris to check, batch query, path analyse and grouping. 
The advantages are as follows:</p><ul><li>The query federation performance of key query and a small number of tables exceeds 10,000 QPS, with RT99&lt;50MS.</li><li>The horizontal expansion capability is relatively strong and maintenance cost is relatively low.</li><li>The offline and real-time data are unified to reduce the complexity of the tags model.</li></ul><p>The downside is that importing a large amount of small data takes up more resources. But this problem has been optimized in Doris 1.1. Apache Doris has greatly enhanced the data compaction capability in version 1.1, and can quickly complete aggregation of new data, avoiding the -235 error caused by too many versions of sharded data and the low query efficiency problems.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="future-plan">Future Plan<a href="#future-plan" class="hash-link" aria-label="Direct link to Future Plan" title="Direct link to Future Plan"></a></h2><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/15__future_plan-199b125bad243e0dcd93f00b9f4395fe.png" width="1117" height="652" class="img_ev3q"></p><p>Hive and Spark are gradually being replaced by Apache Doris.
Optimize the tagging system:</p><ul><li>Establish a rich and accurate tag evaluation system</li><li>Improve tag quality and output speed</li><li>Improve tag coverage
More precision operation:</li><li>Build a rich user analysis model</li><li>Improve the user insight model evaluation system based on the frequency of use and user value</li><li>Establish general image analysis capabilities to assist intelligent decision-making in operations</li></ul></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/NIO">The application of Apache Doris in NIO</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Huaidong Tang</span></span><time datetime="2022-11-28T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">November 28, 2022</time></div></header><div class="markdown" itemprop="articleBody"><h1>The Application of Apache Doris in NIO</h1><p><img loading="lazy" alt="NIO" src="https://cdnd.selectdb.com/assets/images/NIO_kv-7601d71a49c7ecd7fb42f03de600ae6c.png" width="900" height="383" class="img_ev3q"></p><blockquote><p>Guide: The topic of this sharing is the application of Apache Doris in NIO, which mainly includes the following topics:</p><ol><li>Introduction about NIO</li><li>The Development of OLAP in NIO</li><li>Apache Doris-the Unified OLAP Data warehouse</li><li>Best Practice of Apache Doris on CDP Architecture</li><li>Summery and Benefits</li></ol></blockquote><p>Author:Huaidong Tang, Data Team Leader, NIO INC</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="about-nio">About NIO<a href="#about-nio" class="hash-link" aria-label="Direct link to About NIO" title="Direct link to About 
NIO"></a></h2><p>NIO Inc. (NYSE: NIO)is a leading company in the premium smart electric vehicle market. Founded in November 2014, NIO designs, develops, jointly manufactures and sells premium smart electric vehicles, driving innovations in autonomous driving, digital technologies, electric powertrains and batteries.</p><p>Recently, NIO planned to enter the U.S. market alongside other western markets by the end of 2025. The company has already established a U.S. headquarters in San Jose, California, where they started hiring people.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="the-architecture-evolution-of-olap-in-nio">The Architecture Evolution of OLAP in NIO<a href="#the-architecture-evolution-of-olap-in-nio" class="hash-link" aria-label="Direct link to The Architecture Evolution of OLAP in NIO" title="Direct link to The Architecture Evolution of OLAP in NIO"></a></h2><p>The architectural evolution of OLAP in NIO took several steps for years.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-introduced-apache-druid">1. Introduced Apache Druid<a href="#1-introduced-apache-druid" class="hash-link" aria-label="Direct link to 1. Introduced Apache Druid" title="Direct link to 1. Introduced Apache Druid"></a></h3><p>At that time, there were not so many OLAP storage and query engines to choose from. The more common ones were Apache Druid and Apache Kylin. There are 2 reasons why we didn&#x27;t choose Kylin.</p><ul><li><p>The most suitable and optimal storage at the bottom of Kylin is HBase and adding it would increase the cost of operation and maintenance.</p></li><li><p>Kylin&#x27;s precalculation involves various dimensions and indicators. Too many dimensions and indicators would cause great pressure on storage.</p></li></ul><p>We prefer Druid because we used to be users and are familiar with it. Apache Druid has obvious advantages. It supports real-time and offline data import, columnar storage, high concurrency, and high query efficiency. 
But it has downsides as well:</p><ul><li><p>Standard protocols such as JDBC are not used</p></li><li><p>The capability of JOIN is weak</p></li><li><p>Significant performance degradation when performing deduplication</p></li><li><p>High in operation and maintenance costs, different components have separate installation methods and different dependencies; Data import needs extra integration with Hadoop and the dependencies of JAR packages</p></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-introduced-tidb">2. Introduced TiDB<a href="#2-introduced-tidb" class="hash-link" aria-label="Direct link to 2. Introduced TiDB" title="Direct link to 2. Introduced TiDB"></a></h3><p><strong>TiDB is a mature data warehouse focused on OLTP+OLAP, which also has distinctive advantages and disadvantages:</strong></p><p>Advantages:</p><ul><li><p>OLTP database, supports updates well</p></li><li><p>Supports detailed and aggregated query, which can handle dashboard statistical reports or query of detailed data at the same time</p></li><li><p>Supports standard SQL, which has low cost of use</p></li><li><p>Low operation and maintenance cost</p></li></ul><p>Disadvantages:</p><ul><li><p>It is not an independent OLAP. TiFlash relies on OLTP and will increase storage. Its OLAP ability is insufficient</p></li><li><p>The overall performance should be measured separately in each scenario</p></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3-introduced-apache-doris">3. Introduced Apache Doris<a href="#3-introduced-apache-doris" class="hash-link" aria-label="Direct link to 3. Introduced Apache Doris" title="Direct link to 3. Introduced Apache Doris"></a></h3><p>Since 2021, we have officially introduced Apache Doris. In the process of selection, we are most concerned about various factors such as product performance, SQL protocol, system compatibility, learning and operation and maintenance costs. 
After deep research and detailed comparison of the following systems, we came to the following conclusions:</p><p><strong>Apache Doris, whose advantages fully meet our demands:</strong></p><ul><li><p>Supports high concurrent query (what concerned us most)</p></li><li><p>Supports both real-time and offline data</p></li><li><p>Supports detailed and aggregated query</p></li><li><p>UNIQ model can be updated</p></li><li><p>The ability of Materialized View can greatly speed up query efficiency</p></li><li><p>Fully compatible with the MySQL protocol and the cost of development is relatively low</p></li><li><p>The performance fully meets our requirements</p></li><li><p>Lower operation and maintenance costs</p></li></ul><p><strong>Moreover, there is another competitor, Clickhouse. Its stand-alone performance is extremely strong, but its disadvantages are hard to accept:</strong></p><ul><li><p>In some cases, its multi-table JOIN is weak</p></li><li><p>Relatively low in concurrency</p></li><li><p>High operation and maintenance costs</p></li></ul><p>With multiple good performances, Apache Doris outperforms Druid and TiDB. Meanwhile Clickhouse did not fit well in our business, which led us to Apache Doris.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="apache-doris-the-unified-olap-datawarehouse">Apache Doris-the Unified OLAP Datawarehouse<a href="#apache-doris-the-unified-olap-datawarehouse" class="hash-link" aria-label="Direct link to Apache Doris-the Unified OLAP Datawarehouse" title="Direct link to Apache Doris-the Unified OLAP Datawarehouse"></a></h2><p><img loading="lazy" alt="NIO" src="https://cdnd.selectdb.com/assets/images/olap-96ad3bb86cebd92a200a0581f0418d3c.png" width="1018" height="669" class="img_ev3q"></p><p>This diagram basically describes our OLAP architecture, including data source, data import, data processing, data warehouse, data service and application.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-data-source">1. 
Data Source<a href="#1-data-source" class="hash-link" aria-label="Direct link to 1. Data Source" title="Direct link to 1. Data Source"></a></h3><p>In NIO, the data source not only refers to database, but also event tracking data, device data, vehicle data, etc. The data will be ingested into the big data platform. </p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-data-import">2. Data Import<a href="#2-data-import" class="hash-link" aria-label="Direct link to 2. Data Import" title="Direct link to 2. Data Import"></a></h3><p>For business data, you can trigger CDC and convert it into a data stream, store it in Kafka, and then perform stream processing. Some data that can only be passed in batches will directly enter our distributed storage.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3-data-processing">3. Data Processing<a href="#3-data-processing" class="hash-link" aria-label="Direct link to 3. Data Processing" title="Direct link to 3. Data Processing"></a></h3><p>We took the Lambda architecture rather than stream-batch integration.</p><p>Our own business determines that our Lambda architecture should be divided into two paths: offline and real-time:</p><ul><li><p>Some data is streamed.</p></li><li><p>Some data can be stored in the data stream, and some historical data will not be stored in Kafka.</p></li><li><p>Some data requires high precision in some circumstances. In order to ensure the accuracy of the data, an offline pipeline will recalculate and refresh the entire data.</p></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="4-data-warehouse">4. Data Warehouse<a href="#4-data-warehouse" class="hash-link" aria-label="Direct link to 4. Data Warehouse" title="Direct link to 4. Data Warehouse"></a></h3><p>From data processing to the data warehouse, we did not adopt Flink or Spark Doris Connector. We use Routine Load to connect Apache Doris and Flink, and Broker Load to connect Doris and Spark. 
The data generated in batches by Spark will be backed up to Hive for further use in other scenarios. In this way, each calculation is used for multiple scenarios at the same time, which greatly improves the efficiency. It also works for Flink.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="5-data-service">5. Data Service<a href="#5-data-service" class="hash-link" aria-label="Direct link to 5. Data Service" title="Direct link to 5. Data Service"></a></h3><p>What behind Doris is One Service. By registering the data source or flexible configuration, the API with flow and authority control is automatically generated, which greatly improves flexibility. And with the k8s serverless solution, the entire service is much more flexible.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="6-application">6. Application<a href="#6-application" class="hash-link" aria-label="Direct link to 6. Application" title="Direct link to 6. Application"></a></h3><p>In the application layer, we mainly deploy some reporting applications and other services.</p><p>We mainly have two types of scenarios:</p><ul><li><p><strong>User-oriented</strong> , which is similar to the Internet, contains a data dashboard and data indicators.</p></li><li><p><strong>Car-oriented</strong> , car data enters Doris in this way. After certain aggregation, the volume of Doris data is about billions. But the overall performance can still meet our requirements.</p></li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="best-practice-of-apache-doris-on-cdp-architecture">Best Practice of Apache Doris on CDP Architecture<a href="#best-practice-of-apache-doris-on-cdp-architecture" class="hash-link" aria-label="Direct link to Best Practice of Apache Doris on CDP Architecture" title="Direct link to Best Practice of Apache Doris on CDP Architecture"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1-cdp-architecture">1. 
CDP Architecture<a href="#1-cdp-architecture" class="hash-link" aria-label="Direct link to 1. CDP Architecture" title="Direct link to 1. CDP Architecture"></a></h3><p><img loading="lazy" alt="NIO" src="https://cdnd.selectdb.com/assets/images/cdp-3d65926e741a2837759b07514e914bbf.png" width="1471" height="422" class="img_ev3q"></p><p>Next, let me introduce Doris&#x27; practice on the operating platform. This is what happens in our real business. Nowadays, Internet companies will make their own CDP, which includes several modules:</p><ul><li><p><strong>Tags</strong> , which is the most basic part.</p></li><li><p><strong>Target</strong> , based on tags, select people according to some certain logic.</p></li><li><p><strong>Insight</strong> , aiming at a group of people, clarify the distribution and characteristics of the group.</p></li><li><p><strong>Touch</strong> , use methods such as text messages, phone calls, voices, APP notifications, IM, etc. to reach users, and cooperate with flow control.</p></li><li><p><strong>Effect analysis,</strong> to improve the integrity of the operation platform, with action, effect and feedback.</p></li></ul><p>Doris plays the most important role here, including: tags storage, groups storage, and effect analysis.</p><p>Tags are divided into basic tags and basic data of user behavior. We can flexibly customize other tags based on those facts. From the perspective of time effectiveness, tags are also divided into real-time tags and offline tags.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2-considerations-for-cdp-storage-selection">2. Considerations for CDP Storage Selection<a href="#2-considerations-for-cdp-storage-selection" class="hash-link" aria-label="Direct link to 2. Considerations for CDP Storage Selection" title="Direct link to 2. 
Considerations for CDP Storage Selection"></a></h3><p>We took five dimensions into account when we selected CDP storage.</p><p><strong>(1) Unification of Offline and Real-time</strong></p><p>As mentioned earlier, there are offline tags and real-time tags. Currently we are close to quasi-real-time. For some data, quasi-real-time is good enough to meet our needs. A large number of tags are still offline tags. The methods used are Doris&#x27;s Routine Load and Broker Load.</p><table><thead><tr><th><strong>Scenes</strong></th><th><strong>Requirements</strong></th><th><strong>Apache Doris&#x27;s Function</strong></th></tr></thead><tbody><tr><td>Real-time tags</td><td>Real-time data updates</td><td>Routine Load</td></tr><tr><td>Offline tags</td><td>Highly efficient batch import</td><td>Broker Load</td></tr><tr><td>Unification of offline and real-time</td><td>Unification of offline and real-time data storage</td><td>Routine Load and Broker Load update different columns of the same table</td></tr></tbody></table><p>In addition, on the same table, the update frequency of different columns is also different. For example, we need to update the user&#x27;s identity in real time because the user&#x27;s identity changes all the time. T+1&#x27;s update does not meet our needs. Some tags are offline, such as the user&#x27;s gender, age and other basic tags, T+1 update is sufficient to meet our standards. The maintenance cost caused by putting the tags of basic users on the same table is very low. When customizing tags later, the number of tables will be greatly reduced, which benefits the overall performance.</p><p><strong>(2) Efficient Targets</strong></p><p>When user tags are done, it is time to target the right group of people. The target is to filter out all the people who meet the conditions according to different combinations of tags. At this time, there will be queries with different combinations of tag conditions. 
There was an obvious improvement when Apache Doris upgraded to vectorization.</p><table><thead><tr><th><strong>Scenes</strong></th><th><strong>Requirements</strong></th><th><strong>Apache Doris&#x27;s Function</strong></th></tr></thead><tbody><tr><td>Complex Condition Targets</td><td>Highly efficient combination of tags</td><td>Optimization of SIMD</td></tr></tbody></table><p><strong>(3) Efficient Polymerization</strong></p><p>The user insights and effect analysis statistics mentioned above require statistical analysis of the data, which is not a simple thing of obtaining tags by user ID. The amount of data read and query efficiency have a great impact on the distribution of our tags, the distribution of groups, and the statistics of effect analysis. Apache Doris helps a lot:</p><ul><li><p>Data Partition. We shard the data by time order and the analysis and statistics will greatly reduce the amount of data, which can greatly speed up the efficiency of query and analysis.</p></li><li><p>Node aggregation. Then we collect them for unified aggregation.</p></li><li><p>Vectorization. The vectorization execution engine has significant performance improvement.</p></li></ul><table><thead><tr><th><strong>Scenes</strong></th><th><strong>Requirements</strong></th><th><strong>Apache Doris&#x27;s Function</strong></th></tr></thead><tbody><tr><td>Distribution of Tags Values</td><td>The distribution values ​​of all tags need to be updated every day. 
Fast and efficient statistics are required</td><td>Data partition lessens data transfer and calculation</td></tr><tr><td>Distribution of Groups</td><td>Same as Above</td><td>Unified storage and calculation, each node aggregates first</td></tr><tr><td>Statistics for Performance Analysis</td><td>Same as Above</td><td>Speed up SIMD</td></tr></tbody></table><p><strong>(4) Multi-table Association</strong></p><p>Our CDP might be different from common CDP scenarios in the industry, because common CDP tags in some scenarios are estimated in advance and no custom tags, which leaves the flexibility to users who use CDP to customize tags themselves. The underlying data is scattered in different database tables. If you want to create a custom tag, you must associate the tables.</p><p>A very important reason we chose Doris is the ability to associate multiple tables. Through performance tests, Apache Doris is able to meet our requirements. And Doris provides users with powerful capabilities because tags are dynamic.</p><table><thead><tr><th><strong>Scenes</strong></th><th><strong>Requirements</strong></th><th><strong>Apache Doris&#x27;s Function</strong></th></tr></thead><tbody><tr><td>Distributed Characteristics of the Population</td><td>The distribution of statistical groups under a certain characteristic</td><td>Table Association</td></tr><tr><td>Single Tag</td><td>Display tags</td><td></td></tr></tbody></table><p><strong>(5) Query Federation</strong></p><p>Whether the user is successfully reached or not will be recorded in TiDB. Notifications during operations may only affect user experience. If a transaction is involved, such as gift cards or coupons, the task execution must be done without repetition. TiDB is more suitable for this OLTP scenario.</p><p>But for effect analysis, it is necessary to understand the extent to which the operation plan is implemented, whether the goal is achieved and its distribution. 
It is necessary to combine task execution and group selection for analysis, which requires the query association between Doris and TiDB.</p><p>The size of the tag is probably small, so we would like to save it into Elasticsearch. However, it proved us wrong later.</p><table><thead><tr><th><strong>Scenes</strong></th><th><strong>Requirements</strong></th><th><strong>Apache Doris&#x27;s Function</strong></th></tr></thead><tbody><tr><td>Effect Analysis Associated with Execution Details</td><td>Doris query associated with TiDB</td><td>Query Association with other databases</td></tr><tr><td>Group Tags Associated with Behavior Aggregation</td><td>Doris query associated with Elasticsearch</td><td></td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_LWe7" id="summery-and-benefits">Summary and Benefits<a href="#summery-and-benefits" class="hash-link" aria-label="Direct link to Summary and Benefits" title="Direct link to Summary and Benefits"></a></h2><ol><li><p><strong>bitmap</strong>. Our volume is not big enough to test its full efficiency. If the volume reaches a certain level, using bitmap might have a good performance improvement. For example, when calculating UV, bitmap aggregation can be considered if the full set of IDs is greater than 50 million.</p></li><li><p><strong>The performance is good</strong> when Elasticsearch single-table query is associated with Doris.</p></li><li><p><strong>Better to update columns in batches</strong>. In order to reduce the number of tables and improve the performance of the JOIN table, the table designed should be as streamlined as possible and aggregated as much as possible. However, fields of the same type may have different update frequencies. Some fields need to be updated at daily level, while others may need to be updated at hourly level. Updating a column alone is an important requirement. The solution from Apache Doris is to use REPLACE<!-- -->_<!-- -->IF<!-- -->_<!-- -->NOT<!-- -->_<!-- -->NULL. 
Note: It is impossible to replace the original non-null value with null. You can replace all nulls with meaningful default values, such as unknown.</p></li><li><p><strong>Online Services</strong>. Apache Doris serves online and offline scenarios at the same time, which requires high resource isolation.</p></li></ol></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/Use-Apache-Doris-with-AI-chatbots">How does Apache Doris help AISPEECH build a data warehouse in AI chatbots scenario</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Zhao Wei</span></span><time datetime="2022-11-24T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">November 24, 2022</time></div></header><div class="markdown" itemprop="articleBody"><h1>How Does Apache Doris Help AISPEECH Build a Data warehouse in AI Chatbots Scenario</h1><p><img loading="lazy" alt="kv" src="https://cdnd.selectdb.com/assets/images/kv-7d5af44f82188444fd1c6ac613c1d7eb.png" width="900" height="383" class="img_ev3q"></p><blockquote><p>Guide: In 2019, AISPEACH built a real-time and offline datawarehouse based on Apache Doris. Reling on its flexible query model, extremely low maintenance costs, high development efficiency, and excellent query performance, Apache Doris has been used in many business scenarios such as real-time business operations, AI chatbots analysis. 
It meets various data analysis needs such as device portrait/user label, real-time operation, data dashboard, self-service BI and financial reconciliation. And now I will share our experience through this article.</p></blockquote><p>Author|Zhao Wei, Head Developer of AISPEECH&#x27;s Big Data Department</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="backgounds">Background<a href="#backgounds" class="hash-link" aria-label="Direct link to Background" title="Direct link to Background"></a></h2><p>AISPEECH is a professional conversational artificial intelligence company in China. It has full-link intelligent voice and language technology. It is committed to becoming a platform-based enterprise for full-link intelligent voice and language interaction. Recently it has developed a new generation of human-computer interaction platform DUI and artificial intelligence chip TH1520, providing natural language interaction solutions for partners in many industry scenarios such as Internet of Vehicles, IoT, government affairs and fintech.</p><p>AISPEECH introduced Apache Doris for the first time in 2019 and built a real-time and offline data warehouse based on Apache Doris. Compared with the previous architecture, Apache Doris has many advantages such as flexible query model, extremely low maintenance cost, high development efficiency and excellent query performance. Multiple business scenarios have been applied to meet various data analysis needs such as device portraits/user tags, real-time operation of business scenarios, data analysis dashboards, self-service BI, and financial reconciliation.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="architecture-evolution">Architecture Evolution<a href="#architecture-evolution" class="hash-link" aria-label="Direct link to Architecture Evolution" title="Direct link to Architecture Evolution"></a></h2><p>Offline data analysis in the early business was our main requirement. 
Recently, with the continuous development of business, the requirements for real-time data analysis in business scenarios have become higher and higher. The early datawarehouse architecture failed to meet our requirements. In order to meet the higher requirements of business scenarios for query performance, response time, and concurrency capabilities, Apache Doris was officially introduced in 2019 to build a real-time and offline integrated datawarehouse architecture.</p><p>In the following I will introduce the evolution of the AISPEACH Data Warehouse architecture, and share the reasons why we chose Apache Doris to build a new architecture.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="early-data-warehouse-architecture">Early Data Warehouse Architecture<a href="#early-data-warehouse-architecture" class="hash-link" aria-label="Direct link to Early Data Warehouse Architecture" title="Direct link to Early Data Warehouse Architecture"></a></h3><p>As shown in the architecture diagram below, the offline data warehouse is based on Hive + Kylin while the real-time data warehouse is based on Spark + MySQL.</p><p><img loading="lazy" alt="data_wharehouse_architecture_v1_0_git" src="https://cdnd.selectdb.com/assets/images/data_wharehouse_architecture_v1_0_git-006b22817872b04ad8f909e54e8c1411.png" width="1953" height="1106" class="img_ev3q"></p><p>There are three main types of data sources in our business, business databases such as MySQL, application systems such as K8s container service logs, and logs of automotive T-Box. Data sources are first written to Kafka through various methods such as MQTT/HTTP protocol, business database Binlog, and Filebeat log collection. In the early time, the data will be divided into real-time and offline links after passing through Kafka. Real-time part has a shorter link. The data buffered by Kafka is processed by Spark and put into MySQL for further analysis. MySQL can basically meet the early analysis requirements. 
After data cleaning and processing by Spark, an offline datawarehouse is built in Hive, and Apache Kylin is used to build Cube. Before building Cube, it is necessary to design the data model in advance, including association tables, dimension tables, index fields, and aggregation functions. After construction through the scheduling system, we can finally use HBase to store the Cube.</p><h4 class="anchor anchorWithStickyNavbar_LWe7" id="pain-points-of-early-architecture">Pain Points of Early Architecture:<a href="#pain-points-of-early-architecture" class="hash-link" aria-label="Direct link to Pain Points of Early Architecture:" title="Direct link to Pain Points of Early Architecture:"></a></h4><ol><li><p><strong>There are many dependent components.</strong> Kylin strongly relies on Hadoop and HBase in versions 2.x and 3.x. The large number of application components leads to low development efficiency, many hidden dangers of architecture stability, and high maintenance costs.</p></li><li><p><strong>The construction process of Kylin is complicated and the construction task always fail.</strong> When we do construction for Kylin, we always need to do the following: widen tables, de-duplicate columns, generate dictionaries, build cubes, etc. If there are 1000-2000 or more tasks per day, at least 10 or more tasks will fail to build, resulting in a lot of time to write automatic operation and maintenance scripts.</p></li><li><p><strong>Dimension/dictionary expansion is heavy.</strong> Dimension expansion refers to the need for multiple analysis conditions and fields in some business scenarios. If many fields are selected in the data analysis model without pruning, it will lead to severe cube dimension expansion and longer construction time. 
Dictionary inflation means that in some scenarios, it takes a long time to do global accurate deduplication, which will make the dictionary construction bigger and bigger, and the construction time will become longer and longer, resulting in a continuous decline in data analysis performance.</p></li><li><p><strong>The data analysis model is fixed and low in flexibility.</strong> In the actual application, if a calculation field or business scenario is changed, some or even all of the data needs to be backtracked.</p></li><li><p><strong>Data detail query is not supported.</strong> The early data warehouse architecture could not provide detailed data query. The official Kylin solution is to relate to Presto for detailed query, which introduces another architecture and increases development costs.</p></li></ol><h3 class="anchor anchorWithStickyNavbar_LWe7" id="architecture-selection">Architecture Selection<a href="#architecture-selection" class="hash-link" aria-label="Direct link to Architecture Selection" title="Direct link to Architecture Selection"></a></h3><p>In order to solve the problems above, we began to explore other datawarehouse architecture solutions. And we conducted a series of research on OLAP engines such as Apache Doris and Clickhouse, which are most widely used in the market.</p><p>As the original creator, SelectDB provides commercial support and services for Apache Doris. With the new Apache Doris, SelectDB is now providing global users with a fully-managed database option for deployment.</p><p>Comparing with ClickHouse&#x27;s heavy maintenance, various table types, and lack of support for associated queries, Apache Doris performed better. 
And combined with our OLAP analysis scenario, we finally decided to introduce Apache Doris.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="new-data-warehouse-architecture">New Data Warehouse Architecture<a href="#new-data-warehouse-architecture" class="hash-link" aria-label="Direct link to New Data Warehouse Architecture" title="Direct link to New Data Warehouse Architecture"></a></h3><p><img loading="lazy" alt="data_wharehouse_architecture_v2_0_git" src="https://cdnd.selectdb.com/assets/images/data_wharehouse_architecture_v2_0_git-825df043f0abf0fda4a92b8dc5d10956.png" width="1993" height="1144" class="img_ev3q"></p><p>As shown in the figure above, we built a new real-time + offline data warehouse architecture based on Apache Doris. Unlike the previous architecture, real-time and offline data are processed separately and written to Apache Doris for analysis.</p><p>Due to some historical reasons, data migration is difficult. The offline data is basically consistent with the previous datawarehouse architecture, and it is entirely possible to directly build an offline data warehouse on Apache Doris.</p><p>Comparing with the earlier architecture, the offline data is cleaned and processed by Spark, which is possible to build data warehouse in Hive. Then the data stored in Hive can be written to Apache Doris through Broker Load. What I want to explain here is that the data import speed of Broker Load is very fast and it only takes 10-20 minutes to import 100-200G data into Apache Doris on a daily basis.</p><p>When it comes to the real-time data flow, the new architecture uses Doris-Spark-Connector to consume data in Kafka and write it to Apache Doris after simple tasks. 
As shown in the architecture diagram, real-time and offline data are analyzed and processed in Apache Doris, which meets the business requirements of data applications for both real-time and offline.</p><h4 class="anchor anchorWithStickyNavbar_LWe7" id="benefits-of-the-new-architecture">Benefits of the New Architecture:<a href="#benefits-of-the-new-architecture" class="hash-link" aria-label="Direct link to Benefits of the New Architecture:" title="Direct link to Benefits of the New Architecture:"></a></h4><ol><li><p><strong>Simplified operation, low maintenance cost, and does not depend on Hadoop ecological components.</strong> The deployment of Apache Doris is simple. There are only two processes of FE and BE. Both FE and BE processes can be scaled out. A single cluster supports hundreds of machines and tens of PB storage capacity. These two types of processes pass the consistency agreement to ensure high availability of services and high reliability of data. This highly integrated architecture design greatly reduces the operation and maintenance cost of a distributed system. The operation and maintenance time spent in the three years of using Doris is very small. 
Comparing with the previous architecture based on Kylin, the new architecture spends little time on operation and maintenance.</p></li><li><p><strong>The difficulty of developing and troubleshooting problems is greatly reduced.</strong> The real-time and offline unified data warehouse based on Doris supports real-time data services, interactive data analysis, and offline data processing scenarios, which greatly reduces the difficulty of troubleshooting.</p></li><li><p><strong>Apache Doris supports JOIN query in Runtime format.</strong> Runtime is similar to MySQL&#x27;s table association, which is friendly to the scene where the data analysis model changes frequently, and solves the problem of low flexibility in the early structured data model.</p></li><li><p><strong>Apache Doris supports JOIN, aggregation, and detailed query at the same time.</strong> Meanwhile, it solves the problem that data details could not be queried in the previous architecture.</p></li><li><p><strong>Apache Doris supports multiple accelerated query methods.</strong> And it also supports rollup index, materialized view, and implements secondary index through rollup index to speed up query, which greatly improves query response time.</p></li><li><p><strong>Apache Doris supports multiple types of Query Federation.</strong> And it supports Federation Query analysis on data lakes such as Hive, Iceberg, and Hudi, and also databases such as MySQL and Elasticsearch.</p></li></ol><h2 class="anchor anchorWithStickyNavbar_LWe7" id="applications">Applications<a href="#applications" class="hash-link" aria-label="Direct link to Applications" title="Direct link to Applications"></a></h2><p>Apache Doris was first applied in real-time business and AI Chatbots analysis scenarios in AISPEACH. 
This chapter will introduce the requirements and applications of the two scenarios.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="real-time-business">Real-time Business<a href="#real-time-business" class="hash-link" aria-label="Direct link to Real-time Business" title="Direct link to Real-time Business"></a></h3><p><img loading="lazy" alt="real-time_operation_git" src="https://cdnd.selectdb.com/assets/images/real-time_operation_git-87d6e8ede096ba1551cb290941741126.png" width="1977" height="1226" class="img_ev3q"></p><p>As shown in the figure above, the technical architecture of the real-time operation business is basically the same as the new version of the data warehouse architecture mentioned above:</p><ul><li><p>Data Source: The data source is consistent in the new version with the architecture diagram in the new version, including business data in MySQL, event tracking data of the application system, device and terminal logs.</p></li><li><p>Data Import: Broker Load is used for offline data import, and Doris-Spark-Connector is used for real-time data import.</p></li><li><p>Data Storage and Development: Almost all real-time data warehouses are built on Apache Doris, and some offline data is placed on Airflow to perform DAG batch tasks.</p></li><li><p>Data Application: The top layer is the business analysis requirements, including large-screen display, real-time dashboard for data operation, user portrait, BI tools, etc.</p></li></ul><p><strong>In real-time operation business, there are two main requirements for data analysis:</strong></p><ul><li><p>Due to the large amount of real-time imported data, the query efficiency requirement is high.</p></li><li><p>In this scenario, a team of 20+ people is in charge. 
The data operation dashboard needs to be opened at the same time, so there will be relatively high requirements for real-time writing performance and query concurrency.</p></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="ai-chatbots-analysis">AI Chatbots Analysis<a href="#ai-chatbots-analysis" class="hash-link" aria-label="Direct link to AI Chatbots Analysis" title="Direct link to AI Chatbots Analysis"></a></h3><p>In addition, the second application of Apache Doris in AISPEECH is an AI Chatbots analysis.</p><p><img loading="lazy" alt="ai_chatbots_git" src="https://cdnd.selectdb.com/assets/images/ai_chatbots_git-f094d1221b56b522cb93ba3bc766e659.png" width="1953" height="1118" class="img_ev3q"></p><p>As shown in the figure above, different from normal BI cases, our users only need to describe the data analysis needs by typing. Based on our company&#x27;s NLP capabilities, AI Chatbots BI will convert natural language into SQL, which is similar to NL2SQL technology. It should be noted that the natural language analysis used here is customized. Comparing with open source NL2SQL, the hit rate is high and the analysis is more precise. After the natural language is converted into SQL, the SQL will be sent to Apache Doris to query and get the analysis result. As a result, users can view detailed data in any cases at any time by typing. <strong>Compared with pre-computed OLAP engines such as Apache Kylin and Apache Druid, Apache Doris performs better for the following reasons:</strong></p><ul><li><p>The query is flexible and the model is not fixed, which supports customization.</p></li><li><p>It needs to support table association, aggregation calculation, and detailed query.</p></li><li><p>Response time needs to be fast.</p></li></ul><p>Therefore, we have successfully implemented AI Chatbots analysis by using Apache Doris. 
At the same time, feedback on the application in our company is awesome.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="experience">Experience<a href="#experience" class="hash-link" aria-label="Direct link to Experience" title="Direct link to Experience"></a></h2><p>Based on the above two scenarios, we have accumulated some experience and insights and I will share them with you now.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="datawarehouse-table-design">Datawarehouse Table Design:<a href="#datawarehouse-table-design" class="hash-link" aria-label="Direct link to Datawarehouse Table Design:" title="Direct link to Datawarehouse Table Design:"></a></h3><ol><li><p>Tables which contain about tens of millions of rows of data (for reference, related to the size of the cluster) are better to use the Duplicate table type. The Duplicate table type supports aggregation and detailed query at the same time, without additional detailed tables required.</p></li><li><p>When the amount of data is relatively large, we suggest using the Aggregate aggregation table type, building a rollup index on the aggregation table type, using materialized views to optimize queries, and optimizing aggregation fields.</p></li><li><p>When the amount of data is large with many associated tables, ETL can be used to write wide tables and import them into Doris, combined with the Aggregate table type to optimize aggregation. Or we suggest you use the official Doris JOIN optimization; refer to: https://doris.apache.org/en-US/docs/dev/advanced/join-optimization/doris-join-optimization</p></li></ol><h3 class="anchor anchorWithStickyNavbar_LWe7" id="storage">Storage:<a href="#storage" class="hash-link" aria-label="Direct link to Storage:" title="Direct link to Storage:"></a></h3><p>We use SSD and HDD to separate hot and warm data storage. Data within the past year is stored in SSD, and data older than one year is stored in HDD. Apache Doris supports setting cooling time for partitions. 
The current solution is to set automatic synchronization to migrate historical data from SSD to HDD to ensure that the data within one year is placed in on the SSD.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="upgrade">Upgrade<a href="#upgrade" class="hash-link" aria-label="Direct link to Upgrade" title="Direct link to Upgrade"></a></h3><p>Make sure to back up the metadata before upgrading. You can also use the method of starting a new cluster to back up the data files to a remote storage system such as S3 or HDFS through Broker, and then import the previous cluster data into the new cluster through backup and recovery.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="performance-comparison">Performance Comparison<a href="#performance-comparison" class="hash-link" aria-label="Direct link to Performance Comparison" title="Direct link to Performance Comparison"></a></h3><p>Aspire started using Apache Doris from version 0.12. This year we completed the upgrade from version 0.15 to the latest version 1.1, and conducted performance tests based on real business data.</p><p><img loading="lazy" alt="doris_1_1_performance_test_git" src="https://cdnd.selectdb.com/assets/images/doris_1_1_performance_test_git-ad375d6872f12ab1e3cca76d30caa1f6.png" width="1961" height="1126" class="img_ev3q"></p><p>As can be seen from the test report, among the 13 SQLs test in total, the performance difference of the first 3 SQLs after the upgrade is not obvious, because these 3 scenarios are mainly simple aggregation functions, which do not require high performance of Apache Doris. Version 0.15 can meet demand. In the scenario after Q4, SQL is more complex while Group By needs multiple fields, aggregation functions and complex functions. Therefore, the performance improvement after upgrading is obvious to see: the average query performance is 2- 3 times. 
We highly recommend that you upgrade to the latest version of Apache Doris.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="summary-and-benefits">Summary and Benefits<a href="#summary-and-benefits" class="hash-link" aria-label="Direct link to Summary and Benefits" title="Direct link to Summary and Benefits"></a></h2><ol><li><p>Apache Doris supports the construction of offline plus real-time unified data warehouses. One ETL script can support both real-time and offline data warehouses, which greatly improves efficiency, reduces storage costs, and avoids problems such as inconsistencies between offline and real-time indicators.</p></li><li><p>Apache Doris 1.1.x version fully supports vectorization, which improves the query performance by 2-3 times compared with the previous version. After testing, the query performance of Apache Doris version 1.1.x in the wide table is equal to that of ClickHouse.</p></li><li><p>Apache Doris is powerful and does not depend on other components. Compared with Apache Kylin, Apache Druid, ClickHouse, Apache Doris does not need a second component to fill the technical gap. Apache Doris supports aggregation, detailed queries, and associated queries. Currently, more than 90% of AISPEACH&#x27;s analyses have migrated to Apache Doris. 
Thanks to this advantage, developers operate and maintain fewer components, which greatly reduces the cost of operation and maintenance.</p></li><li><p>It is extremely easy to use, supporting MySQL protocol and standard SQL, which greatly reduces user learning costs.</p></li></ol><p><em>Special thanks to SelectDB, the company building Apache Doris, for helping us work with the community and get sufficient technical support.</em></p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/jd">Best practice of Apache Doris in JD</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2022-07-20T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 20, 2022</time></div></header><div class="markdown" itemprop="articleBody"><h1><strong>Introduction:</strong></h1><p>Apache Doris is an open source MPP analytical database product that not only can get query results in sub-second response time, effectively supporting real-time data analysis, but also supports huge data sets of more than 10PB. Compared with other industry-hot OLAP database systems, the distributed architecture of Apache Doris is very simple. It supports elastic scaling and is easy to operate and maintain, saving a lot of labor and time costs. At present, the domestic community is very popular, and there are also many companies which have large scale uses, such as Meituan and Xiaomi, etc. 
</p><p>This paper mainly discusses how to use Doris for business exploration and practice in the multi-dimensional analysis of real-time and offline data in the large real-time screen of JD customer service in the scenarios of manual consultation, customer event list, after-sales service list, etc.</p><p>In recent years, with the explosive growth of data volume and the emergence of the demand for online analysis of massive data, traditional relational databases such as MySQL and Oracle have encountered bottlenecks under large data volume, while databases such as Hive and Kylin lack timeliness. So Apache Doris, Apache Druid, ClickHouse and other real-time analytic databases begun to appear, not only to cope with the second-level queries of massive data, but also to meet the real-time and quasi-real-time analysis needs. Offline and real-time computing engines are in full bloom. But for different scenarios and facing different problems, no single engine is a panacea. We hope that this article can give you some inspiration on the application and practice of offline and real-time analytics in JD&#x27;s customer service business, and we hope you will communicate more and give us valuable suggestions.</p><h1><strong>JD Customer Service Business Form</strong></h1><p>As the entrance to the group&#x27;s services, JD Customer Service provides efficient and reliable protection for users and merchants. JD customer service is responsible for solving users&#x27; problems in a timely manner and providing them with detailed and easy-to-understand instructions and explanations; in order to better understand users&#x27; feedback and the status of products, it is necessary to monitor a series of indicators such as the number of inquiries, pick-up rates, complaints, etc. 
in real time, and discover problems in a timely manner through ring comparison and year-on-year comparison, in order to better adapt to users&#x27; shopping styles, improve service quality and efficiency, and thus enhance the brand of JD influence.</p><h1><strong>Easy OLAP Design</strong></h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="01-easyolap-doris-data-import-links"><strong>01 EasyOLAP Doris Data Import Links</strong><a href="#01-easyolap-doris-data-import-links" class="hash-link" aria-label="Direct link to 01-easyolap-doris-data-import-links" title="Direct link to 01-easyolap-doris-data-import-links"></a></h3><p>EasyOLAP Doris data sources are mainly real-time Kafka and offline HDFS files. The import of real-time data relies on Routine Load; offline data is mainly imported using Broker Load and Stream Load.</p><p><img loading="lazy" alt="1280X1280" src="https://cdnd.selectdb.com/assets/images/jd03-00bd471f0fab2d98798f5e3148b35fce.png" width="1080" height="604" class="img_ev3q"></p><p>EasyOLAP Doris Data Import Links</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="02-easyolap-doris-full-link-monitor"><strong>02 EasyOLAP Doris Full Link Monitor</strong><a href="#02-easyolap-doris-full-link-monitor" class="hash-link" aria-label="Direct link to 02-easyolap-doris-full-link-monitor" title="Direct link to 02-easyolap-doris-full-link-monitor"></a></h3><p>The EasyOLAP Doris project currently uses the Prometheus + Grafana framework for monitoring. The node_exporter is responsible for collecting machine-level metrics, and Doris automatically spits out FE and BE service-level metrics in Prometheus format. 
In addition, OLAP Exporter service is deployed to collect Routine Load related metrics, aiming to discover real-time data stream import at the first time and ensure real-time data timeliness.</p><p><img loading="lazy" alt="EasyOLAP Doris monitoring link" src="https://cdnd.selectdb.com/assets/images/jd04-8770adfb04ffe977f931d9eaff4cb534.png" width="1080" height="594" class="img_ev3q"></p><p>EasyOLAP Doris monitoring link</p><p><img loading="lazy" alt="640" src="https://cdnd.selectdb.com/assets/images/jd01-47257e8bb0b14785f854db959cdfd931.png" width="871" height="600" class="img_ev3q"></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="03-easyolap-doris-primary-secondary-dual-stream-design"><strong>03 EasyOLAP Doris Primary-Secondary Dual Stream Design</strong><a href="#03-easyolap-doris-primary-secondary-dual-stream-design" class="hash-link" aria-label="Direct link to 03-easyolap-doris-primary-secondary-dual-stream-design" title="Direct link to 03-easyolap-doris-primary-secondary-dual-stream-design"></a></h3><p>EasyOLAP Doris adopts a dual-write approach for the primary and secondary clusters in order to guarantee the service stability of Level 0 services during the promotion time.</p><p><img loading="lazy" alt="03 EasyOLAP Doris Primary-Secondary Dual Stream Design" src="https://cdnd.selectdb.com/assets/images/jd02-a6a4279c0c33a25862e89b56e7c986a7.png" width="1080" height="669" class="img_ev3q"></p><p>EasyOLAP Doris Primary-Secondary Dual Stream Design</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="04-easyolap-doris-dynamic-partition-management"><strong>04 EasyOLAP Doris Dynamic Partition Management</strong><a href="#04-easyolap-doris-dynamic-partition-management" class="hash-link" aria-label="Direct link to 04-easyolap-doris-dynamic-partition-management" title="Direct link to 04-easyolap-doris-dynamic-partition-management"></a></h3><p>After analyzing the requirements, the JD OLAP team did some customization of Doris, which involved dynamic partition 
management. Although the community version already had the function of dynamic partitioning, the function could not retain partitions of a specified time. For the characteristics of JD Group, we have retained historical data of specified time, such as data during 618 and 11.11, which will not be deleted due to dynamic partitioning. The dynamic partition management feature can control the amount of data stored in the cluster, and it is easy to use by the business side without the need to manage partition information manually or with additional code.</p><h1><strong>Doris Caching Mechanism</strong></h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="01-demand-scenarios"><strong>01 Demand Scenarios</strong><a href="#01-demand-scenarios" class="hash-link" aria-label="Direct link to 01-demand-scenarios" title="Direct link to 01-demand-scenarios"></a></h3><p>Committed to continuously improving user experience, JD Customer Service&#x27;s data analysis pursues the ultimate timeliness. Offline data analysis scenario is write less read more, data is written once and read frequently many times; real-time data analysis scenario, part of the data is not updated historical partition, part of the data is in the updated partition. In most analysis applications, there are the following scenarios:</p><ul><li><p>High concurrency scenario: Doris better support high concurrency, but too high QPS will cause cluster jitter, and a single node can not carry too high QPS;.</p></li><li><p>Complex queries: JD customer service real-time operation platform monitoring needs to display multi-dimensional complex indicators according to business scenarios, rich indicators display corresponding to a variety of different queries, and data sources from multiple tables . 
Although the response time of individual queries at milliseconds level , the overall response time may be at the second level.</p></li><li><p>Repeated queries: if there is no anti-refresh mechanism, repeatedly refreshing the page will lead to the submission of a large number of repeated queries due to delays or hand errors.</p></li></ul><p>For the above scenario, there are solutions at the application layer —— the query results are put into Redis and the cache is refreshed periodically or manually by the user, but there are some problems:</p><ul><li><p>Data inconsistency: can not respond immediately to data updates, and the user may receive results with old data.</p></li><li><p>Low hit rate: if the data is highly real-time and the cache is frequently invalidated, the hit rate of the cache is low and the load on the system cannot be relieved.</p></li></ul><p>Additional cost: introduction of external components increases system complexity and adds additional cost.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="02-introduction-to-caching-mechanism"><strong>02 Introduction to Caching Mechanism</strong><a href="#02-introduction-to-caching-mechanism" class="hash-link" aria-label="Direct link to 02-introduction-to-caching-mechanism" title="Direct link to 02-introduction-to-caching-mechanism"></a></h3><p>There are three different types of Cache in EasyOLAP Doris, respectively Result Cache, SQL Cache and Partition Cache, depending on the applicable scenario. All three types of caches can be switched on and off by MySQL client commands.</p><p>These three caching mechanisms can coexist: which can be turned on at the same time. When querying, the query parser first determines whether the Result Cache is enabled or not, and if the Result Cache is enabled, it first finds out whether the cache exists for the query from the Result Cache, and if the cache fails or does not exist, it directly takes the cached value and returns it to the client. 
The cache is placed in the memory of each FE node for fast reading.</p><p>SQL Cache stores and gets the cache according to the signature of SQL, the ID of the partition of the queried table, and the latest version number of the partition. These three together serve as cache conditions. If one of these three conditions is changed, such as SQL statement change or partition version number change after data update, the cache will not be hit. In the case of multiple table joins, the partition update of one of the tables will also result in failure to hit the cache. SQL Cache is more suitable for T+1 update scenarios.</p><p>Partition Cache is a more fine-grained caching mechanism. Partition cache mainly splits a query into read-only partition and updatable partition in parallel based on partition, read-only partition is cached, updatable partition is not cached, and the corresponding result set is generated n, and then the results of each split subquery are merged. Therefore, if the query N days of data, data update the most recent D days, each day is only a different date range but similar queries, you can use Partition Cache, only need to query D partitions can be, the other parts are from the cache, can effectively reduce the cluster load, shorten the query response time.</p><p>When a query enters Doris, the system will first process the query statement and take it as the key, before executing the query statement, the query analyzer can automatically select the most suitable caching mechanism to ensure that the caching mechanism is used to shorten the query response time in the best case. Then, it checks whether the query result exists in the Cache, and if it does, it gets the data in the cache and returns it to the client; if it does not, it queries normally and stores the query result as Value and the query statement Key in the cache. 
SQL Cache is more suitable for T+1 scenarios and works well when partition updates are infrequent and SQL statements are repetitive Partition Cache is the least granular cache. When a query statement queries data for a time period, the query statement is split into multiple subqueries. It can shorten the query time and save cluster resources when the data is written to only one partition or partial partition.</p><p>To better observe the effectiveness of caching, metrics have been added to Doris&#x27; service metrics, which are monitored visually through Prometheus and Grafana monitoring systems. The metrics include the number of hits for different types of Cache, the hit rate for different types of Cache, the memory size of the Cache, and other metrics.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="03-caching-mechanism-effect"><strong>03 Caching Mechanism Effect</strong><a href="#03-caching-mechanism-effect" class="hash-link" aria-label="Direct link to 03-caching-mechanism-effect" title="Direct link to 03-caching-mechanism-effect"></a></h3><p>For the JD Customer Service Doris main cluster, some services reached 100% CPU usage during 11.11 period without caching on; with Result Cache on, CPU usage was between 30% and 40%. The caching mechanism ensures that the business can get the query results quickly and protects the cluster resources well under high concurrency scenarios.</p><h1><strong>Doris&#x27; optimization during the 11.11 sale, 2020</strong></h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="01-import-task-optimization"><strong>01 Import Task Optimization</strong><a href="#01-import-task-optimization" class="hash-link" aria-label="Direct link to 01-import-task-optimization" title="Direct link to 01-import-task-optimization"></a></h3><p>The import of real-time data has always been a challenge. Among them, ensuring real-time data and importing stability is the most important. 
In order to observe the real-time data import situation more intuitively, JD OLAP team developed OLAP Exporter independently to collect real-time data import-related metrics, such as import speed, import backlog and suspended tasks. The import speed and import backlog can be used to determine the status of a real-time import task, and if find a trend of backlog, the sampling tool developed independently can be used to sample and analyze the real-time task. Real-time tasks have three main thresholds to control the submission of tasks, which are the maximum processing interval per batch, the maximum number of processing entries per batch and the maximum amount of data processed per batch, and a task will be submitted as soon as one of these thresholds is reached. By increasing the logs, we found that the task queue in FE was relatively busy, so the parameters were mainly adjusted to make the maximum number of processing entries per batch and the maximum amount of data processed per batch larger, and then the maximum processing interval per batch was adjusted to ensure that the data latency was within twice the maximum processing interval per batch according to the business requirements. Through the sampling tool, the analysis task ensures not only the real-time data, but also the stability of the import. In addition, we also set up alarms to detect abnormalities such as backlog of real-time import tasks and suspension of import tasks in a timely manner.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="02-monitoring-metrics-optimization"><strong>02 Monitoring Metrics Optimization</strong><a href="#02-monitoring-metrics-optimization" class="hash-link" aria-label="Direct link to 02-monitoring-metrics-optimization" title="Direct link to 02-monitoring-metrics-optimization"></a></h3><p>The monitoring metrics are divided into two main sections, a machine level metrics section and a business level metrics section. 
In the whole monitoring panel, detailed metrics bring comprehensive data and at the same time make it more difficult to get important metrics. So, to get a better view of important metrics for all clusters, a separate panel is created - 11.11 Important Metrics Summary Panel. The board contains metrics such as BE CPU usage, real-time task consumption backlog rows, TP99, QPS, and so on. The number of metrics is small, but the situation of all clusters can be observed, which can eliminate the trouble of frequent switching in monitoring.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="03-peripheral-tools-support"><strong>03 Peripheral Tools Support</strong><a href="#03-peripheral-tools-support" class="hash-link" aria-label="Direct link to 03-peripheral-tools-support" title="Direct link to 03-peripheral-tools-support"></a></h3><p>In addition to the sampling tools and OLAP Exporter mentioned above, the JD OLAP team has also developed a series maintenance tools for Doris.</p><ol><li>Import sampling tool: The import sampling tool not only collects the data imported in real time, but also supports adjusting the parameters of the real time import task, or generating creation statements (including the latest loci and other information) for task migration and other operations when the real time import task is paused.</li></ol><ol start="2"><li>Big query tool: Big queries not only cause jitter in cluster BE CPU usage, but also lead to longer response time for other queries. Before the Big Query tool, if you found jitter in cluster CPU, you needed to check the audit logs on all FEs and then do the statistics, which is not only time-consuming but also not intuitive. The Big Query tool is designed to solve the above problem. 
When the monitoring side finds that the cluster has jitter, you can use the Big Query tool and enter the cluster name and time point to get the total number of queries for different services at that time point, the number of queries with more than 5 seconds, 10 seconds, 20 seconds, the number of queries with huge scanning volume, etc. It is convenient for us to analyze the big queries from different dimensions. The details of the big queries will also be saved in the intermediate file, which can directly get the big queries of different businesses. The whole process only takes a few tens of seconds to a minute to locate the big query that is happening and get the corresponding query statements, which greatly saves time and operation and maintenance costs.</li></ol><ol start="3"><li>Downgrade and recovery tools: In order to ensure the stability of the Level 0 business during the 11.11 promotion, when the cluster pressure exceeds the safety level, it is necessary to downgrade other non-Level 0 businesses, and then restore them to the pre-downgrade settings with one click after the peak period. The degradation mainly involves reducing the maximum number of connections to the service, suspending non-level 0 real-time import tasks, and so on. This greatly increases the ease of operation and improves efficiency.</li></ol><ol start="4"><li>Cluster inspection tool: During 11.11 period, the health inspection of clusters is extremely important. Routine inspections include primary and secondary cluster consistency checks for dual-stream services. In order to ensure that the business can quickly switch to the other cluster when one cluster has problems, it is necessary to ensure that the library tables on both clusters are consistent and the data volume is not too different; check whether the number of copies of the library tables is 3 and whether there are unhealthy Tablet in the cluster; check the machine disk utilization, memory and other machine-level indicators, etc. 
Check the machine disk utilization, memory and other machine-level metrics, etc.</li></ol><h1><strong>Summary &amp; Outlook</strong></h1><p> JD Customer Service was introduced to Doris in early 2020, and currently has one standalone cluster and one shared cluster, and is an experienced user of JD OLAP.</p><p> In the business use, we also encountered problems such as task scheduling-related, import task configuration-related and query-related problems, which are driving the JD OLAP team to understand Doris more deeply. We plan to promote the use of materialized views to further improve the efficiency of queries; use Bitmap to support accurate de-duplication of UV and other metrics; use audit logs to make it easier to count large and slow queries; and solve the scheduling problem of real-time import tasks to make them more efficient and stable. In addition, we also plan to optimize table building, create high-quality Rollup or materialized views to improve the smoothness of the application, and accelerate more businesses to the OLAP platform to improve the impact of the application.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/meituan">Best practice of Apache Doris in Meituan</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span class="s-author text-black">Apache Doris</span></span><time datetime="2022-07-20T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 20, 2022</time></div></header><div class="markdown" 
itemprop="articleBody"><h1>Best Practice of Apache Doris in Meituan</h1><p>Introduction: This paper mainly introduces a general method and practice of real-time data warehouse construction. The real-time data warehouse aims at end-to-end low latency, SQL standardization, rapid response to changes, and data unification. In practice, the best practice we summarize is: a common real-time production platform + a common interactive real-time analysis engine cooperate with each other to meet real-time and quasi-real-time business scenarios. The two have a reasonable division of labor and complement each other to form an easy-to-develop, easy-to-maintain, and most efficient assembly line, taking into account development efficiency and production costs, and satisfying diverse business needs with a better input-output ratio.</p><h1>real-time scene</h1><p>There are many scenarios in which real-time data is delivered in Meituan, mainly including these following points:</p><ul><li>Operational level: Such as real-time business changes, real-time marketing effects, daily business status and daily real-time business trend analysis, etc.</li><li>Production level: such as whether the real-time system is reliable, whether the system is stable, real-time monitoring of the health of the system, etc.</li><li>C-end users: For example, search recommendation sorting requires real-time understanding of users&#x27; thoughts, behaviors and characteristics, and recommendation of more concerned content to users.</li><li>Risk control: Food delivery and financial technology are used a lot. 
Real-time risk identification, anti-fraud, abnormal transactions, etc., are all scenarios where a large number of real-time data are applied</li></ul><h1>Real-time technology and architecture</h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1real-time-computing-technology-selection">1.Real-time computing technology selection<a href="#1real-time-computing-technology-selection" class="hash-link" aria-label="Direct link to 1.Real-time computing technology selection" title="Direct link to 1.Real-time computing technology selection"></a></h3><p>At present, there are many open source real-time technologies, among which Storm, Spark Streaming and Flink are common. The specific selection depends on the business situation of different companies.</p><p>Meituan Takeaway relies on the overall construction of meituan&#x27;s basic data system. In terms of technology maturity, It used Storm a few years ago, which was irreplaceable in terms of performance stability, reliability and scalability. As Flink becomes more and more mature, it has surpassed Storm in terms of technical performance and framework design advantages. In terms of trends, just like Spark replacing MR, Storm will be gradually replaced by Flink. Of course, there will be a process of migrating from Storm to Flink. 
We currently have some old tasks still on Storm, and we are constantly promoting task migration.</p><p>The comparison between Storm and Flink can refer to the form above.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2real-time-architecture">2.Real-time Architecture<a href="#2real-time-architecture" class="hash-link" aria-label="Direct link to 2.Real-time Architecture" title="Direct link to 2.Real-time Architecture"></a></h3><h4 class="anchor anchorWithStickyNavbar_LWe7" id="-lambda-architecture">① Lambda Architecture<a href="#-lambda-architecture" class="hash-link" aria-label="Direct link to ① Lambda Architecture" title="Direct link to ① Lambda Architecture"></a></h4><p>The Lambda architecture is a relatively classic architecture. In the past, there were not many real-time scenarios, mainly offline. When a real-time scene is attached, the technical ecology is different due to the different timeliness of offline and real- time. The Lambda architecture is equivalent to attaching a real-time production link, which is integrated at the application level, and two-way production is independent of each other.This is also a logical approach to adopt in business applications.</p><p>There will be some problems in dual-channel production, such as double processing logic, double development and operation and maintenance, and resources will also become two resource links. Because of these problems, Kappa architecture has been evolved.</p><h4 class="anchor anchorWithStickyNavbar_LWe7" id="-kappa-architecture">② Kappa Architecture<a href="#-kappa-architecture" class="hash-link" aria-label="Direct link to ② Kappa Architecture" title="Direct link to ② Kappa Architecture"></a></h4><p>The Kappa architecture is relatively simple in terms of architecture design, unified in production, and a set of logic produces both offline and real time. However, there are relatively large limitations in practical application scenarios. 
There are few cases in the industry that directly use the Kappa architecture for production and implementation, and the scene is relatively simple. These problems will also be encountered on our side, and we will also have some thoughts of our own, which will be discussed later.</p><h1>Business Pain Points</h1><p>In the take-away business, we also encountered some problems.</p><p>In the early stage of the business, in order to meet the business needs, the requirements are generally completed case by case after the requirements are obtained. The business has high real-time requirements. From the perspective of timeliness, there is no opportunity for middle-level precipitation. In the scenario, the business logic is generally directly embedded. This is a simple and effective method that can be imagined. This development mode is relatively common in the early stage of business development.</p><p>As shown in the figure above, after getting the data source, it will go through data cleaning, dimension expansion, business logic processing through Storm or Flink, and finally direct business output. Taking this link apart, the data source will repeatedly refer to the same data source, and the operations such as cleaning, filtering, and dimension expansion must be repeated. The only difference is that the code logic of the business is different. If there are only a few businesses, this model is acceptable, but when the subsequent business volume increases, there will be a situation where whoever develops will be responsible for operation and maintenance, the maintenance workload will increase, and the operations cannot be managed in a unified manner. Moreover, everyone is applying for resources, resulting in a rapid expansion of resource costs, and resources cannot be used intensively and effectively.
Therefore, it is necessary to think about how to construct real-time data from the whole data source.</p><h1>Data features and Application Scenario</h1><p>So how to build a real-time data warehouse?</p><p>First of all, we need to disassemble this task into what data, what scenarios, and what features these scenarios have in common. For takeaway business scenarios, there are two categories, log class and business category.</p><ul><li><p>Log class: It is characterized by a large amount of data, semi-structured, and deeply nested. Log data has a great feature that once the log stream is formed, it will not change. It will collect all the logs of the platform by means of buried points, and then collect and distribute them uniformly. Just like a tree with really large roots. The whole process of pushing to the front-end application is just like the process of a tree branching from the root to a branch (the decomposition process from 1 to n). If all businesses search for data from the root, although the path seems to be the shortest, because of the heavy burden, the data retrieval efficiency is low. Log data is generally used for production monitoring and user behavior analysis. The timeliness requirements are relatively high. Generally, the time window will be 5 minutes or 10 minutes, or up to the current state. The main application is the real-time large screen and real-time features, such as user behaviour that needs to be perceived immediately every time the user clicks.</p></li><li><p>Business category: The business class is mainly about business transaction data. Business systems are usually self-contained and distribute data down in the form of Binlog logs. All business systems are transactional, mainly using paradigm modeling methods, which have a structured characteristic and the main part can be seen clearly. However, due to the large number of data tables, multi-table associations are required to express the complete business.
So it&#x27;s an integration process from n to 1.</p></li></ul><p>Several difficulties faced by business real-time processing:</p><ul><li><p>Diversity of business: Business processes are constantly changing from the beginning to the end, such as from ordering -&gt; payment -&gt; delivery. The business database is changed on the original basis, and Binlog will generate a lot of changed logs. Business analysis is more focused on the end state, which leads to the problem of data retraction calculation, such as placing an order at 10 o&#x27;clock and canceling it at 13 o&#x27;clock, but hoping to subtract the canceled order at 10 o&#x27;clock.</p></li><li><p>Business integration: Business analysis data usually cannot be expressed by a single subject, and often many tables are associated to obtain the desired information. When confluent alignment of data is performed in real-time streaming, it often requires large cache processing and is complicated.</p></li><li><p>The analysis is batch, and the processing process is streaming: for a single data, no analysis can be formed, so the analysis object must be batch, and the data processing is one by one.</p></li></ul><p>The scenarios of log classes and business classes generally exist at the same time and are intertwined.
Whether it is Lambda architecture or Kappa architecture, a single application will have some problems, so it is more meaningful to choose the architecture and practice according to the scenario.</p><h1>Architecture Design of Real-time Data Warehouse</h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1real-time-architecture-exploration-of-stream-batch-combination">1.Real-time Architecture: Exploration of Stream-Batch Combination<a href="#1real-time-architecture-exploration-of-stream-batch-combination" class="hash-link" aria-label="Direct link to 1.Real-time Architecture: Exploration of Stream-Batch Combination" title="Direct link to 1.Real-time Architecture: Exploration of Stream-Batch Combination"></a></h3><p>Based on the above problems, we have our own thinking and ideas,it is to deal with different business scenarios through the combination of flow and batch.</p><p>As shown in the figure above, the data is collected from the log to the message queue, and then to the ETL process of the data stream. The construction of the basic data stream is unified. Afterwards, for log real-time features, real-time large-screen applications use real-time stream computing. Real-time OLAP batch processing is used for Binlog business analysis.</p><p>What are the Pain Points of Stream Processing Analysis Business? For the paradigm business, both Storm and Flink require a large amount of external memory to achieve business alignment between data streams, which requires a lot of computing resources. Due to the limitation of external memory, the window limitation strategy must be carried out, and may eventually discard some data as a result. After calculation, it is generally stored in Redis as query support, and KV storage has many limitations in dealing with analytical query scenarios.</p><p>How to achieve real-time OLAP? 
Is there a real-time computing engine with its own storage, when the real-time data is entered,it can flexibly and freely calculate within a certain range, and has a certain data carrying capacity, and supports analysis of query responses at the same time? With the development of technology, the current MPP engine is developing very rapidly, and its performance is also improving rapidly, so there is a new possibility in this scenario, just like the Doris engine we use here.</p><p>This idea has been practiced in the industry and has become an important exploration direction. For example, Alibaba&#x27;s real-time OLAP solution based on ADB, etc.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2architecture-design-of-real-time-data-warehouse">2.Architecture Design of Real-time Data Warehouse<a href="#2architecture-design-of-real-time-data-warehouse" class="hash-link" aria-label="Direct link to 2.Architecture Design of Real-time Data Warehouse" title="Direct link to 2.Architecture Design of Real-time Data Warehouse"></a></h3><p>From the perspective of the entire real-time data warehouse architecture, the first thing to consider is how to manage all real-time data, how to effectively integrate resources, and how to construct data.</p><p>In terms of methodology, the real-time and offline are very similar to each other. In the early stage of offline data warehouse, it is also case by case. Consider how to govern it when the scale of data increases to a certain amount. We all know that layering is a very effective way of data governing. So, on the issue of how to manage the real-time data warehouse, the first consideration is also the hierarchical processing logic, as follows:</p><ul><li><p>Data source: At the data source level, offline and real-time data sources are consistent. They are mainly divided into log classes and business classes. 
Log classes include user logs, DB logs, and server logs.</p></li><li><p>Real-time detail layer: At the detail level, in order to solve the problem of repeated construction, a unified construction should be carried out. Using the offline data warehouse model to build a unified basic detailed data layer, managed according to the theme, the purpose of the detail layer is to provide directly available data downstream, so the basic layer should be processed uniformly, such as cleaning, filtering, and dimension expansion.</p></li><li><p>Aggregation layer: The summary layer can directly calculate the result through the concise operator of Flink or Storm. And form a summary of indicators, all indicators are processed at the summary layer, and everyone manages and constructs according to unified specifications, forming a reusable summary result.</p></li></ul><p>In conclusion, from the perspective of the construction of the entire real-time data warehouse, first of all, the data construction needs to be layered, build the framework first, and set the specifications, including to what extent each layer is processed and how each layer is used. The definition of specifications facilitates standardized processing in production. Due to the need to ensure timeliness, don&#x27;t design too many layers when designing. For scenarios with high real-time requirements, you can basically refer to the left side of the figure above.
For batch processing requirements, you can import from the real-time detail layer to the real-time OLAP engine, and perform fast retraction calculations based on the OLAP engine&#x27;s own calculation and query capabilities, as shown in the data flow on the right side of the figure above.</p><h1>Real-time platform construction</h1><p>After the architecture is determined, the next consideration is how to build a platform.The construction of the real-time platform is completely attached to the real-time data warehouse management.</p><p>First, abstract the functions and abstract them into components, so that standardized production can be achieved, and systematic guarantees can be further constructed. For the basic processing layer cleaning, filtering, confluence, dimension expansion, conversion, encryption, screening and other functions can be abstracted, and the base layer builds a directly usable data result stream in this componentized way. How to meet diverse needs and how to be compatible with users are the problems that we need to figure out. In this case it may occur problems with redundant processing. In terms of storage, real-time data does not have a history and will not consume too much storage. This redundancy is acceptable.The production efficiency can be improved by means of redundancy, which is an ideological application of changing space for time.</p><p>Through the processing of the base layer, all data is deposited in the IDL layer, and written to the base layer of the OLAP engine at the same time, and then the real-time summary layer is calculated. 
Based on Storm, Flink or Doris, multi-dimensional summary indicators are produced to form a unified summary layer for unified storage distribution.</p><p>When these functions are available, system capabilities such as metadata management, indicator management, data security, SLA, and data quality will be gradually built.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="1real-time-base-layer-functions">1.Real-time base layer functions<a href="#1real-time-base-layer-functions" class="hash-link" aria-label="Direct link to 1.Real-time base layer functions" title="Direct link to 1.Real-time base layer functions"></a></h3><p>The construction of the real-time base layer needs to solve some problems.</p><p>The first is the problem of repeated reading of a stream. When a Binlog is called, it exists in the form of a DB package. Users may only use one of the tables. If everyone wants to use it, there may be a problem that everyone needs to access this stream. The solution can be deconstructed according to different businesses, restored to the basic data flow layer, made into a paradigm structure according to the needs of the business, and integrated with the theme construction according to the modeling method of the data warehouse.</p><p>Secondly, we need to encapsulate components, such as basic layer cleaning, filtering, and dimension expansion . Users can write logic by a very simple expression. Trans part is more flexible. 
For example, converting from one value to another value, for this custom logic expression, we also open custom components, which can develop custom scripts through Java or Python for data processing.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="2real-time-feature-production-capabilities">2.Real-time feature production capabilities<a href="#2real-time-feature-production-capabilities" class="hash-link" aria-label="Direct link to 2.Real-time feature production capabilities" title="Direct link to 2.Real-time feature production capabilities"></a></h3><p>Feature production can be expressed logically through SQL syntax, and the underlying logic is adapted, and transparently transmitted to the computing engine, shielding the user&#x27;s dependence on the computing engine.Just like for offline scenarios, currently large companies rarely develop through code, unless there are some special cases, so they can basically be expressed in SQL.</p><p>At the functional level, the idea of indicator management is integrated. 
Atomic indicators, derived indicators, standard calculation apertures, dimension selection, window settings and other operations can be configured in a configurable way.In this way, the production logic can be uniformly parsed and packaged uniformly.</p><p>Another question,with the same source code a lot of SQL is written, and each submission will have a data stream which is a waste of resources.Our solution is to produce dynamic metrics through the same data stream, so that metrics can be added dynamically without stopping the service.</p><p>So, during the construction of the real-time platform, engineers should consider more about how to use resources more effectively and which links can use resources more economically.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="3sla-construction">3.SLA construction<a href="#3sla-construction" class="hash-link" aria-label="Direct link to 3.SLA construction" title="Direct link to 3.SLA construction"></a></h3><p>SLA mainly solves two problems, one is about the end-to-end SLA, the other is about the SLA of job productivity. 
We adopt the method of burying points + reporting.Because the real-time stream is relatively large, the burying point should be as simple as possible, do not bury too many things,can express the business information is enough.The output of each job is reported to the SLA monitoring platform in a unified manner, and the required information is reported at each job point through a unified interface, and finally the end-to-end SLA can be counted.</p><p>In real-time production, because the process is very long, it is impossible to control all links, but it can control the efficiency of its own operations, so job SLA is also essential.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="4real-time-olap-solution">4.Real-time OLAP solution<a href="#4real-time-olap-solution" class="hash-link" aria-label="Direct link to 4.Real-time OLAP solution" title="Direct link to 4.Real-time OLAP solution"></a></h3><p>Problems:</p><ul><li><p>Binlog business restoration is complex:There are many changes in the business, and changes at a certain point in time are required. 
Therefore, sorting and data storage are required, which consumes a lot of memory and CPU resources.</p></li><li><p>Binlog business association is complex:In stream computing, the relationship between streams and streams is very difficult to express business logic.</p></li></ul><p>solutions:</p><p>To solve the problem through the OLAP engine with computing power, there is no need to logically map a data stream, and only the problem of real-time and stable data storage needs to be solved.</p><p>We use Doris as a high-performance OLAP engine here.Due to the need for derivative calculations between the results generated by the business data and the results, Doris can quickly restore the business by using the unique model or the aggregation model, and can also perform aggregation at the summary layer while restoring the business,and is also designed for reuse.The application layer can be physical or logical view.</p><p>This mode focuses on solving the business rollback calculation. For example, when the business state changes, the value needs to be changed at a certain point in history. The cost of using flow calculation in this scenario is very high. The OLAP mode can solve this problem very well.</p><h1>Real-time use cases</h1><p>In the end, we use a case to illustrate.For example, merchants want to offer discounts to users based on the number of historical orders placed by users. Merchants need to see how many orders they have placed in history. They must have historical T+1 data and real-time data today.This scenario is a typical Lambda architecture,You can design a partition table in Doris, one is the historical partition, and the other is the today partition. The historical partition can be produced offline. Today&#x27;s indicators can be calculated in real time and written to today&#x27;s partition. 
When querying, a simple summary.</p><p>This scenario seems relatively simple, but the difficulty lies in the fact that many simple problems will become complicated after the number of merchants increases.Therefore, in the future, we will use more business input to precipitate more business scenarios, abstract them to form a unified production plan and function, and support diversified business needs with minimized real-time computing resources, which is also what needs to be achieved in the future. </p><p>That&#x27;s all for today, thank you.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="about-the-author">about the author:<a href="#about-the-author" class="hash-link" aria-label="Direct link to about the author:" title="Direct link to about the author:"></a></h3><p>Zhu Liang, more than 5 years experience in data warehouse construction in traditional industries, 6 years experience in Internet data warehouse, technical direction involves offline, real-time data warehouse management, systematic capacity building, OLAP system and engine, big data related technologies, focusing on OLAP,and real-time technology frontier development trends.The business direction involves ad hoc query, operation analysis, strategy report product, user portrait, crowd recommendation, experimental evaluation, etc.</p></div></article><article class="margin-bottom--xl" itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><div class="text-center mb-4"><a class="text-[#8592A6] cursor-pointer hover:no-underline" href="/blog">Blog</a><span class="px-2 text-[#8592A6]">/</span><span><span class="s-tags"><span class="s-tag">Best Practice</span></span></span></div><h2 class="blog-post-title text-[2rem] leading-normal lg:text-[2.5rem] text-center" itemprop="headline"><a itemprop="url" href="/blog/xiaomi">Best practice of Apache Doris in Xiaomi Group</a></h2><div class="blog-info text-center flex justify-center text-sm text-black"><span class="authors"><span 
class="s-author text-black">Apache Doris</span></span><time datetime="2022-07-20T00:00:00.000Z" itemprop="datePublished" class="text-black ml-4">July 20, 2022</time></div></header><div class="markdown" itemprop="articleBody"><h1>Background</h1><p>In order to improve the query performance of the Xiaomi growth analysis platform and reduce the operation and maintenance costs, Xiaomi Group introduced Apache Doris in September 2019. In the past two and a half years, <strong>Apache Doris has been widely used in Xiaomi Group,</strong> <strong>such as business growth analytic platform, realtime dashboards for all business groups, finance analysis, user profile analysis, advertising reports, A/B testing platform and so on.</strong> This article will share the best practice of Apache Doris in Xiaomi Group. </p><h1>Business Practice</h1><p>The typical business practices of Apache Doris in Xiaomi are as follows:</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="01-user-access">01 User Access<a href="#01-user-access" class="hash-link" aria-label="Direct link to 01 User Access" title="Direct link to 01 User Access"></a></h2><p>Data Factory is a one-stop data development platform developed by Xiaomi for data developers and data analysts. This platform supports data sources such as Doris, Hive, Kudu, Iceberg, ES, Talso, TiDB, MySQL, etc. It also supports computing engines such as Flink, Spark, Presto,etc.</p><p>Inside Xiaomi, users need to access the Doris service through the data factory. Users need to register in the data factory and complete the approval for building the database. The Doris operation and maintenance classmates will connect according to the descriptions of the business scenarios and data usage expectations submitted by users in the data factory. 
After completing the access approval, users can use the Doris service to perform operations such as visual table creation and data import in the data factory.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="02-data-import">02 Data import<a href="#02-data-import" class="hash-link" aria-label="Direct link to 02 Data import" title="Direct link to 02 Data import"></a></h2><p>In Xiaomi&#x27;s business, the two most common ways to import data into Doris are Stream Load and Broker Load. User data will be divided into real-time data and offline data, and users&#x27; real-time and offline data will generally be written to Talos first (Talos is a distributed, high-throughput message queue developed by Xiaomi). The offline data from Talos will be sunk to HDFS, and then imported to Doris through the data factory. Users can directly submit Broker Load tasks in the data factory to import large batches of data on HDFS into Doris. In addition, you can run the SparkSQL command in the data factory to query data from Hive, and import the data found in SparkSQL into Doris through Spark-Doris-Connector, which encapsulates Stream Load at the bottom layer. Real-time data from Talos is generally imported into Doris in two ways. One is to first perform ETL on the data through Flink, and then import small batches of data to Doris through Flink-Doris-Connector, which encapsulates Stream Load at the bottom layer. Another way is to import small batches of data into Doris through Stream Load encapsulated by Spark Streaming at regular intervals.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="03-data-query">03 Data Query<a href="#03-data-query" class="hash-link" aria-label="Direct link to 03 Data Query" title="Direct link to 03 Data Query"></a></h2><p>Doris users of Xiaomi generally analyze and query Doris and display the results through the ShuJing platform. ShuJing is a general-purpose BI analysis tool developed by Xiaomi.
Users can query and visualize Doris through ShuJing platform, and realize user behavior analysis (in order to meet the needs of business event analysis, retention analysis, funnel analysis, path analysis and other behavior analysis needs, We added corresponding UDF and UDAF ) and user profile analysis for Doris.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="04-compaction-tuning">04 Compaction Tuning<a href="#04-compaction-tuning" class="hash-link" aria-label="Direct link to 04 Compaction Tuning" title="Direct link to 04 Compaction Tuning"></a></h2><p>For Doris, each data import will generate a data version under the relevant data shard (Tablet) of the storage layer, and the Compaction mechanism will asynchronously merge the smaller data versions generated by the import (the detailed principle of the Compaction mechanism can be Refer to the previous article &quot;Doris Compaction Mechanism Analysis&quot;).</p><p>Xiaomi has many high-frequency, high-concurrency, near-real-time import business scenarios, and a large number of small versions will be generated in a short period of time. If Compaction does not merge data versions in time, it will cause version accumulation.On the one hand, too many minor versions will increase the pressure on metadata, and on the other hand, too many versions will affect query performance.In Xiaomi&#x27;s usage scenarios, many tables use the Unique and Aggregate data models, and the query performance is heavily dependent on whether Compaction can merge data versions in time.In our business scenario, the query performance was reduced by tens of times due to delayed version merging, thus affecting online services.When a Compaction happens, it consumes CPU, memory, and disk I/O resources. 
Too much compaction will take up too many machine resources, affect query performance, and may cause OOM.</p><p><strong>In response to this problem of Compaction, we first start from the business side and guide users through the following aspects:</strong></p><ul><li><p>Set reasonable partitions and buckets for tables to avoid generating too many data fragments.</p></li><li><p>Standardize the user&#x27;s data import operation, reduce the frequency of data import, increase the amount of data imported in a single time, and reduce the pressure of Compaction.</p></li><li><p>Avoid using delete operations too much.The delete operation will generate a delete version under the relevant data shard in the storage layer.The Cumulative Compaction task will be truncated when the delete version is encountered. This task can only merge the data version after the Cumulative Point and before the delete version, move the Cumulative Point to the delete version, and hand over the delete version to the subsequent Base Compaction task. to process. If you use the delete operation too much, too many delete versions will be generated under the Tablet, which will cause the Cumulative Compaction task to slow down the progress of version merging. Using the delete operation does not actually delete the data from the disk, but records the deletion conditions in the delete version. When the data is queried, the deleted data will be filtered out by Merge-On-Read. Only the delete version is merged by the Base Compaction task. After that, the data to be deleted by the delete operation can be cleared from the disk as expired data with the Stale Rowset. 
If you need to delete the data of an entire partition, you can use the truncated partition operation instead of the delete operation.</p></li></ul><p><strong>Second, we tuned Compaction from the operation and maintenance side:</strong></p><ul><li><p>According to different business scenarios, different Compaction parameters (Compaction strategy, number of threads, etc.) are configured for different clusters.</p></li><li><p>Appropriately lowers the priority of the Base Compaction task and increases the priority of the Cumulative Compaction task, because the Base Compaction task takes a long time to execute and has serious write amplification problems, while the Cumulative Compaction task executes faster and can quickly merge a large number of small versions.</p></li><li><p>Version backlog alarm, dynamic adjustment of Compaction parameters.When the Compaction Producer produces Compaction tasks, it will update the corresponding metric.It records the value of the largest Compaction Score on the BE node. You can check the trend of this indicator through Grafana to determine whether there is a version backlog. 
In addition, we have added a Version backlog alert.In order to facilitate the adjustment of Compaction parameters, we have optimized the code level to support dynamic adjustment of the Compaction strategy and the number of Compaction threads at runtime, avoiding the need to restart the process when adjusting the Compaction parameters.</p></li><li><p>Supports manual triggering of the Compaction task of the specified Table and data shards under the specified Partition, and improves the Compaction priority of the specified Table and data shards under the specified Partition.</p></li></ul><h1>Monitoring and Alarm Management</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="01-monitoring-system">01 Monitoring System<a href="#01-monitoring-system" class="hash-link" aria-label="Direct link to 01 Monitoring System" title="Direct link to 01 Monitoring System"></a></h2><p>Prometheus will regularly pull Metrics metrics from Doris&#x27;s FE and BE and display them in the Grafana monitoring panel.The service metadata based on QingZhou Warehouse will be automatically registered in Zookeeper, and Prometheus will regularly pull the latest cluster metadata information from Zookeeper and display it dynamically in the Grafana monitoring panel.(Qingzhou Data Warehouse is a data warehouse constructed by the Qingzhou platform based on the operation data of Xiaomi&#x27;s full-scale big data service. It consists of 2 base tables and 30+ dimension tables.Covers the whole process data such as resources, server cmdb, cost, process status and so on when big data components are running)We have also added statistics and display boards for common troubleshooting data such as Doris large query list, real-time write data volume, data import transaction numbers, etc. 
in Grafana.In Grafana, we also added statistics and display boards for common troubleshooting data such as the Doris big query list, the amount of real-time data written, and the number of data import transactions, so that alarms can be linked. When the cluster is abnormal, Doris&#x27; operation and maintenance students can locate the cause of the cluster failure in the shortest time.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="02--falcon">02 Falcon<a href="#02--falcon" class="hash-link" aria-label="Direct link to 02 Falcon" title="Direct link to 02 Falcon"></a></h2><p>Falcon is a monitoring and alarm system widely used inside Xiaomi.Because Doris provides a relatively complete metrics interface, which can easily provide monitoring functions based on Prometheus and Grafana, we only use Falcon&#x27;s alarm function in the Doris service.For different levels of faults in Doris, we define alarms as three levels of P0, P1 and P2:</p><ul><li><p>P2 alarm (alarm level is low): single node failure alarm.When a single node indicator or process status is abnormal, an alarm is generally issued as a P2 level.The alarm information is sent to the members of the alarm group in the form of Xiaomi Office messages.(Xiaomi Office is a privatized deployment product of ByteDance Feishu in Xiaomi, and its functions are similar to Feishu.)</p></li><li><p>P1 alarm (alarm level is higher):In a short period of time (within 3 minutes), the cluster will issue a P1 level alarm if there are short-term exceptions such as increased query delay and abnormal writing,etc.The alarm information is sent to the members of the alarm group in the form of Xiaomi Office messages.P1 level alarms require Oncall engineers to respond and provide feedback.</p></li><li><p>P0 alarm (alarm level is high):In a long period of time (more than 3 minutes), the cluster will issue a P0 level alarm if there are exceptions such as increased query delay and abnormal writing,etc.Alarm information is sent in the form 
of Xiaomi Office messages and phone alerts. A P0 level alarm requires on-call engineers to respond within 1 minute and coordinate resources for failure recovery and review preparation.</p></li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="03--cloud-doris">03 Cloud-Doris<a href="#03--cloud-doris" class="hash-link" aria-label="Direct link to 03 Cloud-Doris" title="Direct link to 03 Cloud-Doris"></a></h2><p>Cloud-Doris is a data collection component developed by Xiaomi for the internal Doris service. Its main capability is to detect the availability of the Doris service and collect the cluster indicator data of internal concern. For example, Cloud-Doris can periodically simulate users reading and writing to the Doris system to detect the availability of services. If the cluster has abnormal availability, an alert will be issued through Falcon. It also collects users&#x27; read and write data to generate user bills, and collects information such as table-level data volume, unhealthy replicas, and oversized tablets, sending alarms for abnormal information through Falcon.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="04-qingzhou-inspection">04 QingZhou inspection<a href="#04-qingzhou-inspection" class="hash-link" aria-label="Direct link to 04 QingZhou inspection" title="Direct link to 04 QingZhou inspection"></a></h2><p>For chronic hidden dangers such as capacity, user growth, and resource allocation, we use the unified QingZhou big data service inspection platform for inspection and reporting. The inspection generally consists of two parts: service-specific inspections and basic indicator inspections. Among them, the service-specific inspection refers to the indicators that are unique to each big data service and cannot be used universally. For Doris, it mainly includes: quota, number of shard replicas, number of columns in a single table, number of table partitions, etc. By adding these inspections, the chronic hidden dangers that are difficult to alarm on in advance can be
well avoided, which provides support for failure-free major holidays.</p><h1>Failure Recovery</h1><p>When an online cluster fails, the first principle should be to quickly restore services. If the cause of the failure is clear, handle it according to the specific cause and restore the service. If the cause of the failure is not clear, you should try restarting the process to restore the service as soon as snapshots have been preserved.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="01-access-failures-handling">01 Access Failures Handling<a href="#01-access-failures-handling" class="hash-link" aria-label="Direct link to 01 Access Failures Handling" title="Direct link to 01 Access Failures Handling"></a></h2><p>Doris uses Xiaomi LVS as the access layer, which is similar to the LB service of open source or public cloud, and provides layer 4 or layer 7 traffic load scheduling capability. After Doris binds a reasonable port, generally speaking, if an abnormality occurs in a single FE node, it will be automatically kicked out, the service can be restored without the user&#x27;s perception, and an alarm will be issued for the abnormal node. Of course, for FE faults that cannot be processed in a short time, we will first adjust the weight of the faulty node to 0 or delete the abnormal node from LVS to prevent unpredictable problems caused by process detection exceptions.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="02-node-failure-handling">02 Node Failure Handling<a href="#02-node-failure-handling" class="hash-link" aria-label="Direct link to 02 Node Failure Handling" title="Direct link to 02 Node Failure Handling"></a></h2><p>For FE node failures, if the cause of the failure cannot be quickly located, it is generally necessary to keep thread snapshots and memory snapshots and restart the process. Save a thread snapshot of FE with the command:</p><div class="language-undefined codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div
class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-undefined codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">jstack 进程ID &gt;&gt; 快照文件名.jstack</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>Save a memory snapshot of FE with the command:</p><div class="language-undefined codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-undefined codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">jmap -dump:live,format=b,file=快照文件名.heap 进程ID</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><p>In the case of version upgrade or some 
unexpected scenarios, the image of the FE node may have abnormal metadata, and the abnormal metadata may be synchronized to other FEs, resulting in all FEs not working. Once a failed image is discovered, the fastest recovery option is to use Recovery mode to stop FE elections and replace the failed image with the backup image. Of course, it is not easy to back up images all the time. Since this failure is common in cluster upgrades, we recommend adding simple local image backup logic to the cluster upgrade procedure, ensuring that a copy of the current and latest image data is retained before each upgrade starts the FE process. For BE node failure, if the process crashes, a core file will be generated and minos will automatically restart the process; if a task is stuck, you need to restart the process after retaining a thread snapshot with the following command:</p><div class="language-undefined codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#F8F8F2;--prism-background-color:#282A36"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-undefined codeBlock_bY9V thin-scrollbar"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#F8F8F2"><span class="token plain">pstack 进程ID &gt;&gt; 快照文件名.pstack</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div><h1>Concluding Remarks</h1><p>Apache Doris has been widely used by Xiaomi since the first use of open source
software Apache Doris by Xiaomi Group in September 2019. At present, it has served dozens of businesses within Xiaomi, with dozens of clusters and hundreds of nodes, and a data ecosystem with Apache Doris at its core has been formed within Xiaomi. In order to improve the efficiency of operation and maintenance, Xiaomi has also developed a complete set of automated management and operation and maintenance systems around Doris. With the increasing number of services, Doris also exposed some problems. For example, there was no good resource isolation mechanism in past versions, and services would affect each other. In addition, system monitoring needs to be further improved. With the rapid development of the community, more and more contributors have participated in community construction, the vectorized engine transformation has been completed, the transformation of the query optimizer is in full swing, and Apache Doris is gradually maturing.</p></div></article><nav class="pagination-nav" aria-label="Blog list page navigation"></nav></main></div></div></div></div><div class="footer pt-16 pb-10"><div class="container"><div class="footer-box"><div class="left"><img src="/images/asf_logo_apache.svg" alt="" class="themedImage_ToTc themedImage--light_HNdA footer__logo"><img src="/images/asf_logo_apache.svg" alt="" class="themedImage_ToTc themedImage--dark_i4oU footer__logo"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">ASF</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://www.apache.org/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Foundation<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/licenses/"
target="_blank" rel="noopener noreferrer" class="footer__link-item">License<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="footer__link-item">Events<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Privacy<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/security/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Security<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 
4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Resources</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/download">Download</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/get-starting/quick-start">Docs</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li><li class="footer__item"><a class="footer__link-item" href="/ecosystem/cluster-management">Ecosystem</a></li><li class="footer__item"><a class="footer__link-item" href="/users">Users</a></li><li class="footer__item"><a href="https://github.com/apache/doris/discussions" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discussions<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/community/how-to-contribute/">How to contribute</a></li><li class="footer__item"><a href="https://github.com/apache/doris/" target="_blank" rel="noopener noreferrer" class="footer__link-item">Source code<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 
13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://cwiki.apache.org/confluence/display/DORIS/Doris+Improvement+Proposals" target="_blank" rel="noopener noreferrer" class="footer__link-item">Improvement proposal<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/community/team">Doris team</a></li><li class="footer__item"><a href="https://github.com/apache/doris/issues/30669" target="_blank" rel="noopener noreferrer" class="footer__link-item">Roadmap<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div></div><div class="right"><div class="footer__title">Join the community</div><div class="social-list"><div class="social"><a href="mailto:dev@doris.apache.org" target="_blank" title="mail" class="item"><svg xmlns="http://www.w3.org/2000/svg" width="2em" height="2em" viewBox="0 0 32 32" fill="none"><path d="M5.6003 6H26.3997C27.8186 6 28.982 7.10964 29 8.46946L16.0045 15.454L3.01202 8.47829C3.02405 7.11258 4.1784 6 5.6003 6ZM3.01202 11.1508L3 23.5011C3 24.8756 4.16938 26 5.6003 26H26.3997C27.8306 26 29 24.8756 29 23.5011V11.145L16.3111 17.8028C16.1157 17.9058 15.8813 17.9058 15.6889 17.8028L3.01202 11.1508Z" fill="currentColor"></path></svg></a><a href="https://github.com/apache/doris" target="_blank" title="github" class="item"><svg width="2em" height="2em" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M16.0001 2.66675C8.63342 
2.66675 2.66675 8.63341 2.66675 16.0001C2.66524 18.7991 3.54517 21.5276 5.1817 23.7983C6.81824 26.0691 9.12828 27.7668 11.7841 28.6508C12.4508 28.7668 12.7001 28.3668 12.7001 28.0161C12.7001 27.7001 12.6828 26.6508 12.6828 25.5334C9.33342 26.1508 8.46675 24.7174 8.20008 23.9668C8.04942 23.5828 7.40008 22.4001 6.83342 22.0828C6.36675 21.8334 5.70008 21.2161 6.81608 21.2001C7.86675 21.1828 8.61608 22.1668 8.86675 22.5668C10.0668 24.5828 11.9841 24.0161 12.7494 23.6668C12.8668 22.8001 13.2161 22.2174 13.6001 21.8841C10.6334 21.5508 7.53342 20.4001 7.53342 15.3001C7.53342 13.8494 8.04942 12.6507 8.90008 11.7161C8.76675 11.3827 8.30008 10.0161 9.03342 8.18275C9.03342 8.18275 10.1494 7.83342 12.7001 9.55075C13.7855 9.2495 14.907 9.09787 16.0334 9.10008C17.1668 9.10008 18.3001 9.24942 19.3668 9.54942C21.9161 7.81608 23.0334 8.18408 23.0334 8.18408C23.7668 10.0174 23.3001 11.3841 23.1668 11.7174C24.0161 12.6507 24.5334 13.8334 24.5334 15.3001C24.5334 20.4174 21.4174 21.5508 18.4508 21.8841C18.9334 22.3001 19.3508 23.1001 19.3508 24.3508C19.3508 26.1334 19.3334 27.5668 19.3334 28.0174C19.3334 28.3668 19.5841 28.7828 20.2508 28.6494C22.8975 27.7558 25.1973 26.0547 26.8266 23.7856C28.4559 21.5165 29.3327 18.7936 29.3334 16.0001C29.3334 8.63341 23.3668 2.66675 16.0001 2.66675V2.66675Z" fill="currentColor"></path></svg></a><a href="https://twitter.com/doris_apache" target="_blank" title="twitter" class="item"><svg xmlns="http://www.w3.org/2000/svg" width="2em" height="2em" viewBox="0 0 32 32" fill="none"><path d="M4.625 4.625H11.2809L27.375 27.375H20.7191L4.625 4.625ZM7.52549 6.10639L21.5236 25.8936H24.4746L10.4764 6.10639H7.52549Z" fill="currentColor"></path><path d="M14.4268 18.4803L6.53447 27.375H4.625L13.5581 17.2525L14.4268 18.4803ZM18.1299 14.3066L26.7203 4.625H24.7017L17.2525 13.0662L18.1299 14.3066Z" fill="currentColor"></path></svg></a><a href="https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2kl08hzc0-SPJe4VWmL_qzrFd2u2XYQA" title="slack" target="_blank" 
class="item"><svg width="2em" height="2em" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"><g clip-path="url(#clip0_125_278)"><path d="M12.5875 16.6906C11.0844 16.6906 9.86562 17.9094 9.86562 19.4125V26.2375C9.86562 26.9594 10.1524 27.6517 10.6628 28.1622C11.1733 28.6726 11.8656 28.9594 12.5875 28.9594C13.3094 28.9594 14.0017 28.6726 14.5122 28.1622C15.0226 27.6517 15.3094 26.9594 15.3094 26.2375V19.4531C15.3094 17.9094 14.0906 16.6906 12.5875 16.6906ZM3 19.4531C3 20.175 3.28677 20.8673 3.79722 21.3778C4.30767 21.8882 4.99999 22.175 5.72187 22.175C6.44376 22.175 7.13608 21.8882 7.64653 21.3778C8.15698 20.8673 8.44375 20.175 8.44375 19.4531V16.7312H5.7625C4.25938 16.6906 3 17.9094 3 19.4531ZM12.5875 3C11.8656 3 11.1733 3.28677 10.6628 3.79722C10.1524 4.30767 9.86562 4.99999 9.86562 5.72187C9.86562 6.44376 10.1524 7.13608 10.6628 7.64653C11.1733 8.15698 11.8656 8.44375 12.5875 8.44375H15.3094V5.72187C15.3094 4.21875 14.0906 3 12.5875 3ZM5.72187 15.3094H12.5469C13.2688 15.3094 13.9611 15.0226 14.4715 14.5122C14.982 14.0017 15.2688 13.3094 15.2688 12.5875C15.2688 11.8656 14.982 11.1733 14.4715 10.6628C13.9611 10.1524 13.2688 9.86562 12.5469 9.86562H5.72187C4.99999 9.86562 4.30767 10.1524 3.79722 10.6628C3.28677 11.1733 3 11.8656 3 12.5875C3 13.3094 3.28677 14.0017 3.79722 14.5122C4.30767 15.0226 4.99999 15.3094 5.72187 15.3094ZM26.2375 9.86562C24.7344 9.86562 23.5156 11.0844 23.5156 12.5875V15.3094H26.2375C26.9594 15.3094 27.6517 15.0226 28.1622 14.5122C28.6726 14.0017 28.9594 13.3094 28.9594 12.5875C28.9594 11.8656 28.6726 11.1733 28.1622 10.6628C27.6517 10.1524 26.9594 9.86562 26.2375 9.86562ZM16.6906 5.72187V12.5875C16.6906 13.3094 16.9774 14.0017 17.4878 14.5122C17.9983 15.0226 18.6906 15.3094 19.4125 15.3094C20.1344 15.3094 20.8267 15.0226 21.3372 14.5122C21.8476 14.0017 22.1344 13.3094 22.1344 12.5875V5.72187C22.1344 4.99999 21.8476 4.30767 21.3372 3.79722C20.8267 3.28677 20.1344 3 19.4125 3C18.6906 3 17.9983 3.28677 17.4878 3.79722C16.9774 
4.30767 16.6906 4.99999 16.6906 5.72187ZM22.1344 26.2781C22.1344 24.775 20.9156 23.5562 19.4125 23.5562H16.6906V26.2781C16.6906 27 16.9774 27.6923 17.4878 28.2028C17.9983 28.7132 18.6906 29 19.4125 29C20.1344 29 20.8267 28.7132 21.3372 28.2028C21.8476 27.6923 22.1344 27 22.1344 26.2781ZM26.2781 16.6906H19.4125C18.6906 16.6906 17.9983 16.9774 17.4878 17.4878C16.9774 17.9983 16.6906 18.6906 16.6906 19.4125C16.6906 20.1344 16.9774 20.8267 17.4878 21.3372C17.9983 21.8476 18.6906 22.1344 19.4125 22.1344H26.2375C27.7406 22.1344 28.9594 20.9156 28.9594 19.4125C29 17.9094 27.7812 16.6906 26.2781 16.6906Z" fill="currentColor"></path></g><defs><clipPath id="clip0_125_278"><rect width="26" height="26" fill="currentColor" transform="translate(3 3)"></rect></clipPath></defs></svg></a></div><div class="social"><a href="https://www.youtube.com/@apachedoris/channels" title="youtube" target="_blank" class="item"><svg xmlns="http://www.w3.org/2000/svg" width="2em" height="2em" viewBox="0 0 32 32" fill="none"><path d="M28.5167 7.83429C28.9436 8.25423 29.2532 8.77539 29.4154 9.34742C29.8205 11.5462 30.0159 13.7775 29.999 16.0121C30.0144 18.2382 29.819 20.4609 29.4154 22.6515C29.2532 23.2235 28.9436 23.7446 28.5167 24.1645C28.0898 24.5845 27.5601 24.889 26.9785 25.0486C24.7728 25.625 16.0124 25.625 16.0124 25.625C16.0124 25.625 7.22652 25.625 5.04638 25.0486C4.46489 24.889 3.9351 24.5845 3.5082 24.1645C3.08132 23.7446 2.77176 23.2235 2.60948 22.6515C2.19736 20.4617 1.9934 18.239 2.00025 16.0121C1.9918 13.7767 2.19577 11.5455 2.60948 9.34742C2.77176 8.77539 3.08132 8.25423 3.5082 7.83429C3.9351 7.41436 4.46489 7.10985 5.04638 6.95021C7.25103 6.36354 16.0124 6.37502 16.0124 6.37502C16.0124 6.37502 24.796 6.37502 26.9785 6.95021C27.5601 7.10985 28.0898 7.41436 28.5167 7.83429ZM12.5 21.25L21.25 16.008L12.5 10.75V21.25Z" fill="currentColor"></path></svg></a><a href="https://www.linkedin.com/company/doris-apache/" title="linkedin" target="_blank" class="item"><svg width="2rem" height="2rem" 
viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4.29925 26.9996H9.66738V11.6781H4.29925V26.9996ZM22.1628 11.1949C19.9409 11.1949 18.7157 11.9388 17.3054 13.7407V11.6777H11.9459V26.9996H17.305V18.6738C17.305 16.9168 18.145 15.1982 20.1535 15.1982C22.162 15.1982 22.6559 16.9164 22.6559 18.632V27H28V18.2902C28 12.2386 24.3854 11.1949 22.1628 11.1949ZM6.99325 4C5.3395 4 4 5.21047 4 6.7046C4 8.19759 5.3395 9.40617 6.99325 9.40617C8.6455 9.40617 9.985 8.19722 9.985 6.7046C9.985 5.21047 8.6455 4 6.99325 4Z" fill="white"></path></svg></a><a href="https://medium.com/@ApacheDoris" title="medium" target="_blank" class="item"><svg width="2em" height="2em" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"><g id="Frame"><path id="Vector" d="M17.7967 16.5385C17.8029 18.53 16.9746 20.4425 15.4937 21.8559C14.0128 23.2693 12.0004 24.0681 9.89836 24.0769C7.79633 24.0681 5.78391 23.2693 4.30302 21.8559C2.82212 20.4425 1.99383 18.53 2.00003 16.5385C1.99383 14.5469 2.82212 12.6344 4.30302 11.221C5.78391 9.80759 7.79633 9.00878 9.89836 9C12.0004 9.00878 14.0128 9.80759 15.4937 11.221C16.9746 12.6344 17.8029 14.5469 17.7967 16.5385ZM26.4533 16.5385C26.4533 20.4514 24.6917 23.6348 22.51 23.6348C20.3283 23.6348 18.555 20.4514 18.555 16.5385C18.555 12.6255 20.3283 9.44214 22.51 9.44214C24.6917 9.44214 26.4533 12.6255 26.4533 16.5385ZM30 16.5385C30 20.0424 29.3817 22.8942 28.6117 22.8942C27.8417 22.8942 27.2233 20.0424 27.2233 16.5385C27.2233 13.0345 27.8417 10.1827 28.6117 10.1827C29.3817 10.1827 30 13.0345 30 16.5385Z" fill="currentColor"></path></g></svg></a><a class="item wechat"><svg width="2em" height="2em" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M20.7578 11.5169C21.0708 11.5169 21.3795 11.5398 21.6851 11.573C20.8524 7.73517 16.7052 4.88306 11.9718 4.88306C6.67951 4.88306 2.34412 8.45283 2.34412 12.9854C2.34412 15.6013 3.78679 17.7498 6.19667 19.4161L5.2339 22.2827L8.59917 20.6122C9.80411 20.8478 10.7698 
21.0906 11.9718 21.0906C12.2738 21.0906 12.5728 21.0759 12.8703 21.0523C12.682 20.4159 12.5728 19.7485 12.5728 19.0566C12.5728 14.8947 16.1847 11.5169 20.7578 11.5169ZM15.5822 8.9335C16.3072 8.9335 16.7871 9.40601 16.7871 10.1229C16.7871 10.8369 16.3072 11.3153 15.5822 11.3153C14.8601 11.3153 14.1365 10.8369 14.1365 10.1229C14.1365 9.40601 14.8601 8.9335 15.5822 8.9335ZM8.84429 11.3153C8.12218 11.3153 7.3942 10.8368 7.3942 10.1229C7.3942 9.40597 8.12218 8.93346 8.84429 8.93346C9.56559 8.93346 10.0463 9.40597 10.0463 10.1229C10.0463 10.8369 9.56559 11.3153 8.84429 11.3153ZM29.5453 18.9422C29.5453 15.1332 25.6935 12.0285 21.3677 12.0285C16.7871 12.0285 13.1797 15.1332 13.1797 18.9422C13.1797 22.7567 16.7871 25.8547 21.3677 25.8547C22.326 25.8547 23.2932 25.6169 24.2559 25.3777L26.897 26.8086L26.1726 24.4282C28.1056 22.993 29.5453 21.0906 29.5453 18.9422ZM18.7126 17.7498C18.2335 17.7498 17.7499 17.278 17.7499 16.7966C17.7499 16.3219 18.2335 15.8442 18.7126 15.8442C19.4406 15.8442 19.9176 16.3219 19.9176 16.7966C19.9176 17.278 19.4406 17.7498 18.7126 17.7498ZM24.0079 17.7498C23.5324 17.7498 23.0518 17.278 23.0518 16.7966C23.0518 16.3219 23.5324 15.8442 24.0079 15.8442C24.73 15.8442 25.2128 16.3219 25.2128 16.7966C25.2128 17.278 24.73 17.7498 24.0079 17.7498Z" fill="currentColor"></path></svg><div class="wechat-dropdown"><p class="text-[#4c576c] text-xs">Connect on WeChat</p><img src="https://cdnd.selectdb.com/assets/images/doris-wechat-b949e908a3bc2776d824f79a9100bd4b.png" alt=""></div></a></div></div></div></div><div class="footer__copyright">Copyright © 2024 The Apache Software Foundation,Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0" target="_blank">Apache License, Version 2.0</a>. Apache, Doris, Apache Doris, the Apache feather logo and the Apache Doris logo are trademarks of The Apache Software Foundation.</div></div></div></div>
<script src="https://cdnd.selectdb.com/assets/js/runtime~main.eb208fba.js"></script>
<script src="https://cdnd.selectdb.com/assets/js/main.fa7fcb85.js"></script>
</body>
</html>