published_versions/assets/js/232c70bd.98f32a08.js - druid-website-src - Git at Google

 "use strict";(self.webpackChunk=self.webpackChunk||[]).push([[1904],{28453:(e,t,i)=>{i.d(t,{R:()=>r,x:()=>o});var a=i(96540);const s={},n=a.createContext(s);function r(e){const t=a.useContext(n);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function o(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:r(e.components),a.createElement(n.Provider,{value:t},e.children)}},90398:(e,t,i)=>{i.r(t),i.d(t,{assets:()=>d,contentTitle:()=>o,default:()=>u,frontMatter:()=>r,metadata:()=>a,toc:()=>l});const a=JSON.parse('{"id":"comparisons/druid-vs-redshift","title":"Apache Druid vs Redshift","description":"\x3c!--","source":"@site/docs/33.0.0/comparisons/druid-vs-redshift.md","sourceDirName":"comparisons","slug":"/comparisons/druid-vs-redshift","permalink":"/docs/33.0.0/comparisons/druid-vs-redshift","draft":false,"unlisted":false,"tags":[],"version":"current","frontMatter":{"id":"druid-vs-redshift","title":"Apache Druid vs Redshift"}}');var s=i(74848),n=i(28453);const r={id:"druid-vs-redshift",title:"Apache Druid vs Redshift"},o=void 0,d={},l=[{value:"How does Druid compare to Redshift?",id:"how-does-druid-compare-to-redshift",level:3},{value:"Real-time data ingestion",id:"real-time-data-ingestion",level:3},{value:"Druid is a read oriented analytical data store",id:"druid-is-a-read-oriented-analytical-data-store",level:3},{value:"Data distribution model",id:"data-distribution-model",level:3},{value:"Replication strategy",id:"replication-strategy",level:3},{value:"Indexing strategy",id:"indexing-strategy",level:3}];function c(e){const t={h3:"h3",li:"li",p:"p",ul:"ul",...(0,n.R)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(t.h3,{id:"how-does-druid-compare-to-redshift",children:"How does Druid compare to Redshift?"}),"\n",(0,s.jsx)(t.p,{children:"In terms of drawing a differentiation, Redshift started out as ParAccel (Actian), which Amazon is licensing and has since heavily modified."}),"\n",(0,s.jsx)(t.p,{children:"Aside from potential performance differences, there are some functional differences:"}),"\n",(0,s.jsx)(t.h3,{id:"real-time-data-ingestion",children:"Real-time data ingestion"}),"\n",(0,s.jsx)(t.p,{children:"Because Druid is optimized to provide insight against massive quantities of streaming data; it is able to load and aggregate data in real-time."}),"\n",(0,s.jsx)(t.p,{children:"Generally traditional data warehouses including column stores work only with batch ingestion and are not optimal for streaming data in regularly."}),"\n",(0,s.jsx)(t.h3,{id:"druid-is-a-read-oriented-analytical-data-store",children:"Druid is a read oriented analytical data store"}),"\n",(0,s.jsx)(t.p,{children:"Druid\u2019s write semantics are not as fluid and does not support full joins (we support large table to small table joins). Redshift provides full SQL support including joins and insert/update statements."}),"\n",(0,s.jsx)(t.h3,{id:"data-distribution-model",children:"Data distribution model"}),"\n",(0,s.jsx)(t.p,{children:'Druid\u2019s data distribution is segment-based and leverages a highly available "deep" storage such as S3 or HDFS. Scaling up (or down) does not require massive copy actions or downtime; in fact, losing any number of Historical processes does not result in data loss because new Historical processes can always be brought up by reading data from "deep" storage.'}),"\n",(0,s.jsx)(t.p,{children:"To contrast, ParAccel\u2019s data distribution model is hash-based. Expanding the cluster requires re-hashing the data across the nodes, making it difficult to perform without taking downtime. Amazon\u2019s Redshift works around this issue with a multi-step process:"}),"\n",(0,s.jsxs)(t.ul,{children:["\n",(0,s.jsx)(t.li,{children:"set cluster into read-only mode"}),"\n",(0,s.jsx)(t.li,{children:"copy data from cluster to new cluster that exists in parallel"}),"\n",(0,s.jsx)(t.li,{children:"redirect traffic to new cluster"}),"\n"]}),"\n",(0,s.jsx)(t.h3,{id:"replication-strategy",children:"Replication strategy"}),"\n",(0,s.jsx)(t.p,{children:"Druid employs segment-level data distribution meaning that more processes can be added and rebalanced without having to perform a staged swap. The replication strategy also makes all replicas available for querying. Replication is done automatically and without any impact to performance."}),"\n",(0,s.jsx)(t.p,{children:"ParAccel\u2019s hash-based distribution generally means that replication is conducted via hot spares. This puts a numerical limit on the number of nodes you can lose without losing data, and this replication strategy often does not allow the hot spare to help share query load."}),"\n",(0,s.jsx)(t.h3,{id:"indexing-strategy",children:"Indexing strategy"}),"\n",(0,s.jsx)(t.p,{children:"Along with column oriented structures, Druid uses indexing structures to speed up query execution when a filter is provided. Indexing structures do increase storage overhead (and make it more difficult to allow for mutation), but they also significantly speed up queries."}),"\n",(0,s.jsx)(t.p,{children:"ParAccel does not appear to employ indexing strategies."})]})}function u(e={}){const{wrapper:t}={...(0,n.R)(),...e.components};return t?(0,s.jsx)(t,{...e,children:(0,s.jsx)(c,{...e})}):c(e)}}}]);