// assets/js/1fd605f7.e0891fef.js - druid-website - Git at Google

 "use strict";

/**
 * Webpack chunk 5803.
 *
 * Pushes one `[chunkIds, modules]` entry onto the shared `self.webpackChunk`
 * queue. The entry carries two modules:
 *
 *   - 3905: an MDX element factory (`kt`) plus a components provider (`Zo`).
 *   - 7033: the compiled "Frequently Asked Questions" page
 *           (source: "@site/src/pages/faq.md").
 *
 * Every module function receives `(module, exports, webpackRequire)`.
 * Externally required module ids (7294, 7462, 3366) are not defined in this
 * chunk; what they are is inferred from how they are used here and is marked
 * as an assumption wherever it is mentioned.
 */
(self.webpackChunk = self.webpackChunk || []).push([
  [5803],
  {
    /**
     * Module 3905 - MDX runtime helpers.
     *
     * Exports:
     *   Zo - provider component that publishes a components map via context.
     *   kt - createElement wrapper that routes MDX elements through the
     *        "MDXCreateElement" forward-ref component.
     *
     * NOTE(review): this looks like the @mdx-js/react v1 runtime - confirm.
     */
    3905: (module, exports, webpackRequire) => {
      webpackRequire.d(exports, {
        Zo: () => MDXProvider,
        kt: () => createElement,
      });

      // Presumably React: only createContext, useContext, createElement,
      // forwardRef and Fragment are used from it.
      const React = webpackRequire(7294);

      /** Name of the prop that carries the MDX element type. */
      const TYPE_PROP_NAME = "mdxType";

      /**
       * Sets `key` on `target`. When the key is already visible on the object
       * (own or inherited) it is redefined as a plain data property so that
       * inherited setters are bypassed; otherwise a normal assignment is used.
       *
       * @param {object} target - object to modify
       * @param {string|symbol} key - property key
       * @param {*} value - value to store
       * @returns {object} `target`
       */
      function defineProperty(target, key, value) {
        if (key in target) {
          Object.defineProperty(target, key, {
            value: value,
            enumerable: true,
            configurable: true,
            writable: true,
          });
        } else {
          target[key] = value;
        }
        return target;
      }

      /**
       * Lists the own string keys of `object` followed by its own symbol keys.
       *
       * @param {object} object - object to inspect
       * @param {boolean} [enumerableOnly] - when truthy, non-enumerable symbols are dropped
       * @returns {Array<string|symbol>} collected keys
       */
      function ownKeys(object, enumerableOnly) {
        const keys = Object.keys(object);
        if (Object.getOwnPropertySymbols) {
          let symbols = Object.getOwnPropertySymbols(object);
          if (enumerableOnly) {
            symbols = symbols.filter(
              (symbol) => Object.getOwnPropertyDescriptor(object, symbol).enumerable
            );
          }
          keys.push.apply(keys, symbols);
        }
        return keys;
      }

      /**
       * Copies the properties of each source onto `target`, left to right.
       * Sources alternate between two copy strategies: the 1st, 3rd, ...
       * source is copied value-by-value through `defineProperty`; the 2nd,
       * 4th, ... source is copied descriptor-by-descriptor. Null/undefined
       * sources are treated as empty objects.
       *
       * @param {object} target - object that receives the properties
       * @param {...(object|null|undefined)} sources - objects to copy from
       * @returns {object} `target`
       */
      function objectSpread(target, ...sources) {
        sources.forEach((rawSource, index) => {
          const source = rawSource != null ? rawSource : {};
          if (index % 2 === 0) {
            ownKeys(Object(source), true).forEach((key) => {
              defineProperty(target, key, source[key]);
            });
          } else if (Object.getOwnPropertyDescriptors) {
            Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
          } else {
            ownKeys(Object(source)).forEach((key) => {
              Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
            });
          }
        });
        return target;
      }

      /**
       * Returns a shallow copy of `source` without the keys in `excludedKeys`.
       * Enumerable own string keys and enumerable own symbol keys are copied.
       *
       * @param {object|null|undefined} source - object to copy
       * @param {Array<string|symbol>} excludedKeys - keys to leave out
       * @returns {object} the filtered copy (`{}` when `source` is null/undefined)
       */
      function objectWithoutProperties(source, excludedKeys) {
        if (source == null) {
          return {};
        }
        const target = {};
        Object.keys(source).forEach((key) => {
          if (excludedKeys.indexOf(key) < 0) {
            target[key] = source[key];
          }
        });
        if (Object.getOwnPropertySymbols) {
          Object.getOwnPropertySymbols(source).forEach((symbol) => {
            const isExcluded = excludedKeys.indexOf(symbol) >= 0;
            if (!isExcluded && Object.prototype.propertyIsEnumerable.call(source, symbol)) {
              target[symbol] = source[symbol];
            }
          });
        }
        return target;
      }

      /** Context holding the components map; defaults to an empty map. */
      const MDXContext = React.createContext({});

      /**
       * Resolves the effective components map.
       *
       * @param {object|Function} [components] - overrides merged on top of the
       *   context map, or a function that receives the context map and returns
       *   the map to use
       * @returns {object} the context map when `components` is falsy, else the merged/computed map
       */
      function useMDXComponents(components) {
        const contextComponents = React.useContext(MDXContext);
        if (!components) {
          return contextComponents;
        }
        return typeof components === "function"
          ? components(contextComponents)
          : objectSpread(objectSpread({}, contextComponents), components);
      }

      /**
       * Provider component (exported as `Zo`): publishes the resolved
       * components map to its children through `MDXContext`.
       *
       * @param {{components?: object|Function, children?: *}} props
       * @returns {*} the `MDXContext.Provider` element
       */
      function MDXProvider(props) {
        const resolved = useMDXComponents(props.components);
        return React.createElement(MDXContext.Provider, { value: resolved }, props.children);
      }

      /** Fallback renderers used when no component is registered for a type. */
      const DEFAULT_COMPONENTS = {
        inlineCode: "code",
        wrapper: function (props) {
          const children = props.children;
          return React.createElement(React.Fragment, {}, children);
        },
      };

      /**
       * Forward-ref component that picks the component to render for an MDX
       * element. Lookup order: "<parentName>.<mdxType>" in the resolved map,
       * then "<mdxType>" in the resolved map, then the built-in defaults, then
       * the original element type.
       */
      const MDXCreateElement = React.forwardRef(function (props, ref) {
        const { components, mdxType, originalType, parentName } = props;
        const passthroughProps = objectWithoutProperties(props, [
          "components",
          "mdxType",
          "originalType",
          "parentName",
        ]);
        const available = useMDXComponents(components);
        const Component =
          available["".concat(parentName, ".").concat(mdxType)] ||
          available[mdxType] ||
          DEFAULT_COMPONENTS[mdxType] ||
          originalType;

        if (components) {
          return React.createElement(
            Component,
            objectSpread(objectSpread({ ref: ref }, passthroughProps), {}, { components: components })
          );
        }
        return React.createElement(Component, objectSpread({ ref: ref }, passthroughProps));
      });

      /**
       * createElement wrapper (exported as `kt`).
       *
       * When `type` is a string or `props.mdxType` is set, the element is
       * created as `MDXCreateElement` with a copy of `props` extended by
       * `originalType` and `mdxType`. Any other call is forwarded to
       * `React.createElement` unchanged.
       *
       * @param {string|Function|object} type - element type
       * @param {object|null} [props] - element props
       * @returns {*} whatever `React.createElement` returns
       */
      function createElement(type, props) {
        // `arguments` (not rest parameters) is used on purpose: the call is
        // forwarded with exactly the number of arguments the caller supplied.
        const args = arguments;
        const mdxType = props && props.mdxType;

        if (typeof type !== "string" && !mdxType) {
          return React.createElement.apply(null, args);
        }

        const argCount = args.length;
        const forwarded = new Array(argCount);
        forwarded[0] = MDXCreateElement;

        const wrappedProps = {};
        for (const key in props) {
          // Explicit prototype lookup instead of relying on a global
          // `hasOwnProperty` identifier being reachable.
          if (Object.prototype.hasOwnProperty.call(props, key)) {
            wrappedProps[key] = props[key];
          }
        }
        wrappedProps.originalType = type;
        wrappedProps[TYPE_PROP_NAME] = typeof type === "string" ? type : mdxType;
        forwarded[1] = wrappedProps;

        for (let index = 2; index < argCount; index++) {
          forwarded[index] = args[index];
        }
        return React.createElement.apply(null, forwarded);
      }

      MDXCreateElement.displayName = "MDXCreateElement";
    },

    /**
     * Module 7033 - compiled FAQ page.
     *
     * Exports `contentTitle`, `default` (the page component), `frontMatter`,
     * `metadata` and `toc`.
     */
    7033: (module, exports, webpackRequire) => {
      webpackRequire.r(exports);
      webpackRequire.d(exports, {
        contentTitle: () => contentTitle,
        default: () => MDXContent,
        frontMatter: () => frontMatter,
        metadata: () => metadata,
        toc: () => toc,
      });

      // Presumably Babel's `extends` helper: `.Z` is called as ({}, a, b, c) - confirm.
      const extendsHelper = webpackRequire(7462);
      // Presumably Babel's `objectWithoutPropertiesLoose`: `.Z` is called as
      // (props, excludedKeys) - confirm.
      const withoutPropertiesHelper = webpackRequire(3366);
      // Required for its side effects only; the returned value is not used here.
      webpackRequire(7294);
      const mdx = webpackRequire(3905);

      /** Props removed from the incoming props before they reach the layout. */
      const EXCLUDED_PROPS = ["components"];

      const frontMatter = { title: "Frequently Asked Questions" };
      const contentTitle = void 0;
      const metadata = {
        type: "mdx",
        permalink: "/faq",
        source: "@site/src/pages/faq.md",
        title: "Frequently Asked Questions",
        description: "Don't see your question here? Ask us",
        frontMatter: { title: "Frequently Asked Questions" },
      };

      /** Table of contents: one entry per level-3 heading, in page order. */
      const toc = [
        {
          value: "Is Druid a data warehouse? When should I use Druid over Redshift/BigQuery/Snowflake?",
          id: "is-druid-a-data-warehouse-when-should-i-use-druid-over-redshiftbigquerysnowflake",
          level: 3,
        },
        {
          value: "Is Druid a log aggregation/log search system? When should I use Druid over Elastic/Splunk?",
          id: "is-druid-a-log-aggregationlog-search-system-when-should-i-use-druid-over-elasticsplunk",
          level: 3,
        },
        {
          value: "Is Druid a timeseries database? When should I use Druid over InfluxDB/OpenTSDB/Prometheus?",
          id: "is-druid-a-timeseries-database-when-should-i-use-druid-over-influxdbopentsdbprometheus",
          level: 3,
        },
        {
          value: "Does Druid separate storage and compute?",
          id: "does-druid-separate-storage-and-compute",
          level: 3,
        },
        { value: "How is Druid deployed?", id: "how-is-druid-deployed", level: 3 },
        {
          value: "Where does Druid fit in my big data stack?",
          id: "where-does-druid-fit-in-my-big-data-stack",
          level: 3,
        },
        { value: "Is Druid in-memory?", id: "is-druid-in-memory", level: 3 },
      ];

      const layoutProps = { toc: toc };
      const LAYOUT_TYPE = "wrapper";

      /**
       * Page component for the FAQ.
       *
       * @param {object} props - `components` is handed to the layout as-is;
       *   all other props are merged into the layout props
       * @returns {*} the "wrapper" layout element containing the page body
       */
      function MDXContent(props) {
        const components = props.components;
        const rest = (0, withoutPropertiesHelper.Z)(props, EXCLUDED_PROPS);
        const el = mdx.kt;

        const heading = (id, text) => el("h3", { id: id }, text);
        const paragraph = (...children) => el("p", null, ...children);
        const bulletList = (items) =>
          el("ul", null, ...items.map((text) => el("li", { parentName: "ul" }, text)));
        const link = (href, text) => el("a", { parentName: "p", href: href }, text);

        return el(
          LAYOUT_TYPE,
          (0, extendsHelper.Z)({}, layoutProps, rest, { components: components, mdxType: "MDXLayout" }),

          el(
            "admonition",
            { type: "tip" },
            el(
              "p",
              { parentName: "admonition" },
              "Don't see your question here? ",
              el("a", { href: "/community/" }, "Ask us")
            )
          ),

          heading(
            "is-druid-a-data-warehouse-when-should-i-use-druid-over-redshiftbigquerysnowflake",
            "Is Druid a data warehouse? When should I use Druid over Redshift/BigQuery/Snowflake?"
          ),
          paragraph(
            "Apache Druid is a new type of database to power real-time analytic workloads for\n" +
              "event-driven data, and isn\u2019t a traditional data warehouse.  Although Druid\n" +
              "incorporates architecture ideas from data warehouses such as column-oriented\n" +
              "storage, Druid also incorporates designs from search systems and timeseries\n" +
              "databases. Druid's architecture is designed to handle many use cases that\n" +
              "traditional data warehouses cannot."
          ),
          paragraph("Druid offers the following advantages over traditional data warehouses:"),
          bulletList([
            "Much lower latency for OLAP-style queries",
            "Much lower latency for data ingest (both streaming and batch)",
            "Out-of-the-box integration with Apache Kafka, AWS Kinesis, HDFS, AWS S3, and more",
            "Time-based partitioning, which enables performant time-based queries",
            "Fast search and filter, for fast slice and dice",
            "Minimal schema design and native support for semi-structured and nested data",
          ]),
          paragraph("Consider using Druid to augment your data warehouse if your use case requires:"),
          bulletList([
            "Powering an user-facing application",
            "Low-latency query response with high concurrency",
            "Instant data visibility",
            "Fast ad-hoc slice and dice",
            "Streaming data",
          ]),
          paragraph(
            "To summarize, Druid shines when the use cases involves real-time analytics and\n" +
              "where the end-user (technical or not) wants to apply numerous queries in rapid\n" +
              "succession to explore or better understand data trends. "
          ),

          heading(
            "is-druid-a-log-aggregationlog-search-system-when-should-i-use-druid-over-elasticsplunk",
            "Is Druid a log aggregation/log search system? When should I use Druid over Elastic/Splunk?"
          ),
          paragraph(
            "Druid uses inverted indexes (in particular, compressed bitmaps) for fast\n" +
              "searching and filtering, but it is not generally considered a search system.\n" +
              "While Druid contains many features commonly found in search systems, such as\n" +
              "the ability to stream in structured and semi-structured data and the ability to\n" +
              "search and filter the data, Druid isn\u2019t commonly used to ingest text logs and\n" +
              "run full text search queries over the text logs.  However, Druid is often used\n" +
              "to ingest and analyze semi-structured data such as JSON."
          ),
          paragraph(
            "Druid at its core is an analytics engine and as such, it can support numerical\n" +
              "aggregations, groupBys (including multi-dimensional groupBys), and other\n" +
              "analytic workloads faster and more efficiently than search systems."
          ),

          heading(
            "is-druid-a-timeseries-database-when-should-i-use-druid-over-influxdbopentsdbprometheus",
            "Is Druid a timeseries database? When should I use Druid over InfluxDB/OpenTSDB/Prometheus?"
          ),
          paragraph(
            "Druid does share some characteristics with timeseries databases, but also\n" +
              "combines ideas from analytic databases and search systems.  Like in timeseries\n" +
              "databases, Druid is optimized for data where a timestamp is present.  Druid\n" +
              "partitions data by time, and queries that include a time filter will be\n" +
              "significantly faster than those that do not.  Aggregating metrics and filtering\n" +
              "on dimensions (which are roughly equivalent to TSDBs' tags) are also very fast when a\n" +
              "time filter is present.  However, because Druid incorporates many architectural designs\n" +
              "from analytics databases and search systems, it can significantly\n" +
              "outperformance TSDBs when grouping, searching, and filtering on tags that are\n" +
              "not time, or when computing complex metrics such as histograms and quantiles."
          ),

          heading("does-druid-separate-storage-and-compute", "Does Druid separate storage and compute?"),
          paragraph(
            "Druid creates an indexed copy of raw data that is highly optimized for\n" +
              "analytic queries. Druid runs queries over this indexed data, called a ",
            link("/docs/latest/design/segments", "'segment'"),
            "\nin Druid, and does not pull raw data from an external storage system as needed\n" +
              "by queries. "
          ),

          heading("how-is-druid-deployed", "How is Druid deployed?"),
          paragraph(
            "Druid can be deployed on commodity hardware in any *NIX based environment.\n" +
              "A Druid cluster consists of several different services, each designed to do a small set of things very well (ingestion, querying, coordination, etc).\n" +
              "Many of these services can be co-located and deployed together on the same hardware as described ",
            link("/docs/latest/tutorials/", "here"),
            "."
          ),
          paragraph(
            "Druid was designed for the cloud, and runs well in AWS, GCP, Azure, and other cloud environments."
          ),

          heading("where-does-druid-fit-in-my-big-data-stack", "Where does Druid fit in my big data stack?"),
          paragraph(
            "Druid typically connects to a source of raw data such as a message bus such as Apache Kafka, or a filesystem such as HDFS.\n" +
              "Druid ingests an optimized, column-oriented, indexed copy of your data and serves analytics workloads on top of it."
          ),
          paragraph(
            "A common streaming data oriented setup involving Druid looks like this:\n" +
              "Raw data \u2192 Kafka \u2192 Stream processor (optional, typically for ETL) \u2192 Kafka (optional) \u2192 Druid \u2192 Application/user"
          ),
          paragraph(
            "A common batch/static file oriented setup involving Druid looks like this:\n" +
              "Raw data \u2192 Kafka (optional) \u2192 HDFS \u2192 ETL process (optional) \u2192 Druid \u2192 Application/user"
          ),
          paragraph("The same Druid cluster can serve both the streaming and batch path."),

          heading("is-druid-in-memory", "Is Druid in-memory?"),
          paragraph(
            "The earliest iterations of Druid didn\u2019t allow for data to be paged in from\n" +
              "and out to disk, so it was often called an \u201cin-memory\u201d database. As Druid\n" +
              "evolved, this limitation was removed. To provide a balance between hardware\n" +
              "cost and query performance, Druid leverages memory-mapping to page data between\n" +
              "disk and memory and extend the amount of data a single node can load up to the\n" +
              "size of its disks."
          ),
          paragraph(
            "Individual Historicals can be configured with the maximum amount of data\n" +
              "they should be given. Coupled with the Coordinator\u2019s ability to assign data to\n" +
              "different \u201ctiers\u201d based on different query requirements, Druid is essentially a\n" +
              "system that can be configured across a wide spectrum of performance\n" +
              "requirements. All data can be in memory and processed, or data can be heavily\n" +
              "over-committed compared to the amount of memory available. Druid can also\n" +
              "support complex configurations, such as configuring the most recent month of\n" +
              "data in memory, while everything else is over-committed."
          )
        );
      }

      MDXContent.isMDXComponent = true;
    },
  },
]);