"use strict";(self.webpackChunk=self.webpackChunk||[]).push([[5814],{14137:(e,t,n)=>{n.d(t,{Zo:()=>p,kt:()=>h});var a=n(67294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?o(Object(n),!0).forEach((function(t){r(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):o(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function l(e,t){if(null==e)return{};var n,a,r=function(e,t){if(null==e)return{};var n,a,r={},o=Object.keys(e);for(a=0;a<o.length;a++)n=o[a],t.indexOf(n)>=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a<o.length;a++)n=o[a],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var u=a.createContext({}),s=function(e){var t=a.useContext(u),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=s(e.components);return a.createElement(u.Provider,{value:t},e.children)},d="mdxType",c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},m=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,u=e.parentName,p=l(e,["components","mdxType","originalType","parentName"]),d=s(n),m=r,h=d["".concat(u,".").concat(m)]||d[m]||c[m]||o;return n?a.createElement(h,i(i({ref:t},p),{},{components:n})):a.createElement(h,i({ref:t},p))}));function h(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,i=new Array(o);i[0]=m;var l={};for(var u in t)hasOwnProperty.call(t,u)&&(l[u]=t[u]);l.originalType=e,l[d]="string"==typeof e?e:r,i[1]=l;for(var s=2;s<o;s++)i[s]=n[s];return a.createElement.apply(null,i)}return a.createElement.apply(null,n)}m.displayName="MDXCreateElement"},85162:(e,t,n)=>{n.d(t,{Z:()=>i});var a=n(67294),r=n(86010);const o={tabItem:"tabItem_Ymn6"};function i(e){var t=e.children,n=e.hidden,i=e.className;return a.createElement("div",{role:"tabpanel",className:(0,r.Z)(o.tabItem,i),hidden:n},t)}},74866:(e,t,n)=>{n.d(t,{Z:()=>v});var a=n(87462),r=n(67294),o=n(86010),i=n(12466),l=n(16550),u=n(91980),s=n(67392),p=n(50012);function d(e){return function(e){var t,n;return null!=(t=null==(n=r.Children.map(e,(function(e){if(!e||(0,r.isValidElement)(e)&&(t=e.props)&&"object"==typeof t&&"value"in t)return e;var t;throw new Error("Docusaurus error: Bad <Tabs> child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the <Tabs> component should be <TabItem>, and every <TabItem> should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function c(e){var t=e.values,n=e.children;return(0,r.useMemo)((function(){var e=null!=t?t:d(n);return function(e){var t=(0,s.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in <Tabs>. Every value needs to be unique.')}(e),e}),[t,n])}function m(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function h(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId,o=(0,l.k6)(),i=function(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!a)throw new Error('Docusaurus error: The <Tabs> component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:n,groupId:a});return[(0,u._X)(i),(0,r.useCallback)((function(e){if(i){var t=new URLSearchParams(o.location.search);t.set(i,e),o.replace(Object.assign({},o.location,{search:t.toString()}))}}),[i,o])]}function k(e){var t,n,a,o,i=e.defaultValue,l=e.queryString,u=void 0!==l&&l,s=e.groupId,d=c(e),k=(0,r.useState)((function(){return function(e){var t,n=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the <Tabs> component requires at least one <TabItem> children component");if(n){if(!m({value:n,tabValues:a}))throw new Error('Docusaurus error: The <Tabs> has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return n}var r=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:i,tabValues:d})})),y=k[0],f=k[1],N=h({queryString:u,groupId:s}),b=N[0],g=N[1],v=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:s}.groupId),n=(0,p.Nk)(t),a=n[0],o=n[1],[a,(0,r.useCallback)((function(e){t&&o.set(e)}),[t,o])]),w=v[0],T=v[1],q=function(){var e=null!=b?b:w;return m({value:e,tabValues:d})?e:null}();return(0,r.useLayoutEffect)((function(){q&&f(q)}),[q]),{selectedValue:y,selectValue:(0,r.useCallback)((function(e){if(!m({value:e,tabValues:d}))throw new Error("Can't select invalid tab value="+e);f(e),g(e),T(e)}),[g,T,d]),tabValues:d}}var y=n(72389);const f={tabList:"tabList__CuJ",tabItem:"tabItem_LNqP"};function N(e){var t=e.className,n=e.block,l=e.selectedValue,u=e.selectValue,s=e.tabValues,p=[],d=(0,i.o5)().blockElementScrollPositionUntilNextRender,c=function(e){var t=e.currentTarget,n=p.indexOf(t),a=s[n].value;a!==l&&(d(t),u(a))},m=function(e){var t,n=null;switch(e.key){case"Enter":c(e);break;case"ArrowRight":var a,r=p.indexOf(e.currentTarget)+1;n=null!=(a=p[r])?a:p[0];break;case"ArrowLeft":var o,i=p.indexOf(e.currentTarget)-1;n=null!=(o=p[i])?o:p[p.length-1]}null==(t=n)||t.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":n},t)},s.map((function(e){var t=e.value,n=e.label,i=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:l===t?0:-1,"aria-selected":l===t,key:t,ref:function(e){return p.push(e)},onKeyDown:m,onClick:c},i,{className:(0,o.Z)("tabs__item",f.tabItem,null==i?void 0:i.className,{"tabs__item--active":l===t})}),null!=n?n:t)})))}function b(e){var t=e.lazy,n=e.children,a=e.selectedValue,o=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var i=o.find((function(e){return e.props.value===a}));return i?(0,r.cloneElement)(i,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},o.map((function(e,t){return(0,r.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function g(e){var t=k(e);return r.createElement("div",{className:(0,o.Z)("tabs-container",f.tabList)},r.createElement(N,(0,a.Z)({},e,t)),r.createElement(b,(0,a.Z)({},e,t)))}function v(e){var t=(0,y.Z)();return r.createElement(g,(0,a.Z)({key:String(t)},e))}},96478:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>c,contentTitle:()=>p,default:()=>y,frontMatter:()=>s,metadata:()=>d,toc:()=>m});var a=n(87462),r=n(63366),o=(n(67294),n(14137)),i=n(74866),l=n(85162),u=["components"],s={id:"datasource",title:"Datasources"},p=void 0,d={unversionedId:"querying/datasource",id:"querying/datasource",title:"Datasources",description:"\x3c!--",source:"@site/docs/31.0.0/querying/datasource.md",sourceDirName:"querying",slug:"/querying/datasource",permalink:"/docs/31.0.0/querying/datasource",draft:!1,tags:[],version:"current",frontMatter:{id:"datasource",title:"Datasources"},sidebar:"docs",previous:{title:"Troubleshooting",permalink:"/docs/31.0.0/querying/troubleshooting"},next:{title:"Joins",permalink:"/docs/31.0.0/querying/joins"}},c={},m=[{value:"Datasource type",id:"datasource-type",level:2},{value:"<code>table</code>",id:"table",level:3},{value:"<code>lookup</code>",id:"lookup",level:3},{value:"<code>union</code>",id:"union",level:3},{value:"Dynamic table append",id:"dynamic-table-append",level:4},{value:"<code>inline</code>",id:"inline",level:3},{value:"<code>query</code>",id:"query",level:3},{value:"<code>join</code>",id:"join",level:3},{value:"Joins in SQL",id:"joins-in-sql",level:4},{value:"Joins in native queries",id:"joins-in-native-queries",level:4},{value:"Join performance",id:"join-performance",level:4},{value:"Future work for joins",id:"future-work-for-joins",level:4},{value:"<code>unnest</code>",id:"unnest",level:3}],h={toc:m},k="wrapper";function y(e){var t=e.components,n=(0,r.Z)(e,u);return(0,o.kt)(k,(0,a.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("p",null,'Datasources in Apache Druid are things that you can query. The most common kind of datasource is a table datasource,\nand in many contexts the word "datasource" implicitly refers to table datasources. This is especially true\n',(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/ingestion/"},"during data ingestion"),", where ingestion is always creating or writing into a table\ndatasource. But at query time, there are many other types of datasources available."),(0,o.kt)("p",null,'The word "datasource" is generally spelled ',(0,o.kt)("inlineCode",{parentName:"p"},"dataSource")," (with a capital S) when it appears in API requests and\nresponses."),(0,o.kt)("h2",{id:"datasource-type"},"Datasource type"),(0,o.kt)("h3",{id:"table"},(0,o.kt)("inlineCode",{parentName:"h3"},"table")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"1",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},'SELECT column1, column2 FROM "druid"."dataSourceName"\n'))),(0,o.kt)(l.Z,{value:"2",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "scan",\n "dataSource": "dataSourceName",\n "columns": ["column1", "column2"],\n "intervals": ["0000/3000"]\n}\n')))),(0,o.kt)("p",null,"The table datasource is the most common type. This is the kind of datasource you get when you perform\n",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/ingestion/"},"data ingestion"),". They are split up into segments, distributed around the cluster,\nand queried in parallel."),(0,o.kt)("p",null,"In ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql#from"},"Druid SQL"),", table datasources reside in the ",(0,o.kt)("inlineCode",{parentName:"p"},"druid")," schema. This is the default schema, so table\ndatasources can be referenced as either ",(0,o.kt)("inlineCode",{parentName:"p"},"druid.dataSourceName")," or simply ",(0,o.kt)("inlineCode",{parentName:"p"},"dataSourceName"),"."),(0,o.kt)("p",null,"In native queries, table datasources can be referenced using their names as strings (as in the example above), or by\nusing JSON objects of the form:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'"dataSource": {\n "type": "table",\n "name": "dataSourceName"\n}\n')),(0,o.kt)("p",null,"To see a list of all table datasources, use the SQL query\n",(0,o.kt)("inlineCode",{parentName:"p"},"SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'druid'"),"."),(0,o.kt)("h3",{id:"lookup"},(0,o.kt)("inlineCode",{parentName:"h3"},"lookup")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"3",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT k, v FROM lookup.countries\n"))),(0,o.kt)(l.Z,{value:"4",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "scan",\n "dataSource": {\n "type": "lookup",\n "lookup": "countries"\n },\n "columns": ["k", "v"],\n "intervals": ["0000/3000"]\n}\n')))),(0,o.kt)("p",null,"Lookup datasources correspond to Druid's key-value ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/lookups"},"lookup")," objects. In ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql#from"},"Druid SQL"),",\nthey reside in the ",(0,o.kt)("inlineCode",{parentName:"p"},"lookup")," schema. They are preloaded in memory on all servers, so they can be accessed rapidly.\nThey can be joined onto regular tables using the ",(0,o.kt)("a",{parentName:"p",href:"#join"},"join operator"),"."),(0,o.kt)("p",null,"Lookup datasources are key-value oriented and always have exactly two columns: ",(0,o.kt)("inlineCode",{parentName:"p"},"k")," (the key) and ",(0,o.kt)("inlineCode",{parentName:"p"},"v")," (the value), and\nboth are always strings."),(0,o.kt)("p",null,"To see a list of all lookup datasources, use the SQL query\n",(0,o.kt)("inlineCode",{parentName:"p"},"SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = 'lookup'"),"."),(0,o.kt)("admonition",{type:"info"},(0,o.kt)("p",{parentName:"admonition"}," Performance tip: Lookups can be joined with a base table either using an explicit ",(0,o.kt)("a",{parentName:"p",href:"#join"},"join"),", or by using the\nSQL ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql-scalar#string-functions"},(0,o.kt)("inlineCode",{parentName:"a"},"LOOKUP")," function"),".\nHowever, the join operator must evaluate the condition on each row, whereas the\n",(0,o.kt)("inlineCode",{parentName:"p"},"LOOKUP")," function can defer evaluation until after an aggregation phase. This means that the ",(0,o.kt)("inlineCode",{parentName:"p"},"LOOKUP")," function is\nusually faster than joining to a lookup datasource.")),(0,o.kt)("p",null,"Refer to the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/query-execution#table"},"Query execution")," page for more details on how queries are executed when you\nuse table datasources."),(0,o.kt)("h3",{id:"union"},(0,o.kt)("inlineCode",{parentName:"h3"},"union")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"5",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT column1, column2\nFROM (\n SELECT column1, column2 FROM table1\n UNION ALL\n SELECT column1, column2 FROM table2\n UNION ALL\n SELECT column1, column2 FROM table3\n)\n"))),(0,o.kt)(l.Z,{value:"6",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "scan",\n "dataSource": {\n "type": "union",\n "dataSources": ["table1", "table2", "table3"]\n },\n "columns": ["column1", "column2"],\n "intervals": ["0000/3000"]\n}\n')))),(0,o.kt)("p",null,"Unions allow you to treat two or more tables as a single datasource. In SQL, this is done with the UNION ALL operator\napplied directly to tables, called a ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql#table-level"},'"table-level union"'),'. In native queries, this is done with a\n"union" datasource.'),(0,o.kt)("p",null,"With SQL ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql#table-level"},"table-level unions")," the same columns must be selected from each table in the same order,\nand those columns must either have the same types, or types that can be implicitly cast to each other (such as different\nnumeric types). For this reason, it is more robust to write your queries to select specific columns."),(0,o.kt)("p",null,"With the native union datasource, the tables being unioned do not need to have identical schemas. If they do not fully\nmatch up, then columns that exist in one table but not another will be treated as if they contained all null values in\nthe tables where they do not exist."),(0,o.kt)("p",null,"In either case, features like expressions, column aliasing, JOIN, GROUP BY, ORDER BY, and so on cannot be used with\ntable unions."),(0,o.kt)("p",null,"Refer to the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/query-execution#union"},"Query execution")," page for more details on how queries are executed when you\nuse union datasources."),(0,o.kt)("h4",{id:"dynamic-table-append"},"Dynamic table append"),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"sql",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT column1, column2, column3\nFROM TABLE(APPEND('table1','table2','table3'))\n")))),(0,o.kt)("p",null,"Perform dynamic table appends in SQL using ",(0,o.kt)("inlineCode",{parentName:"p"},"TABLE(APPEND(...))"),". This simplifies SQL syntax to match columns by name from multiple tables. The native query syntax remains the same as for native union datasources.\nSuppose you have three tables:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"table1")," has ",(0,o.kt)("inlineCode",{parentName:"li"},"column1")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"table2")," has ",(0,o.kt)("inlineCode",{parentName:"li"},"column2")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"table3")," has ",(0,o.kt)("inlineCode",{parentName:"li"},"column1"),", ",(0,o.kt)("inlineCode",{parentName:"li"},"column2"),", ",(0,o.kt)("inlineCode",{parentName:"li"},"column3"))),(0,o.kt)("p",null,"You can create a union view of all the tables by using ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql#table-level"},"table-level union"),":"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT * from (\n SELECT column1,NULL AS column2,NULL AS column3 FROM table1\n UNION ALL\n SELECT NULL AS column1,column2,NULL AS column3 FROM table2\n UNION ALL\n SELECT column1,column2,column3 FROM table3\n) t\n")),(0,o.kt)("p",null,"However depending on the size of the table's schema it might be quite complicated to do that; ",(0,o.kt)("inlineCode",{parentName:"p"},"TABLE(APPEND('table1','table2','table3'))")," represents the same in a more compact form."),(0,o.kt)("admonition",{type:"info"},(0,o.kt)("p",{parentName:"admonition"},"Only tables defined in the catalog are supported in ",(0,o.kt)("inlineCode",{parentName:"p"},"TABLE(APPEND())")," - due to that; common table expressions result in ",(0,o.kt)("inlineCode",{parentName:"p"},"table not found")," errors for queries like:"),(0,o.kt)("pre",{parentName:"admonition"},(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"WITH cte_table AS (SELECT * from TABLE1) SELECT * FROM TABLE(APPEND('cte_table'))\n"))),(0,o.kt)("h3",{id:"inline"},(0,o.kt)("inlineCode",{parentName:"h3"},"inline")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"sql",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT * from (VALUES ('United States', 'San Francisco'),\n ('Canada', 'Calgary')\n ) t (country, city)\n"))),(0,o.kt)(l.Z,{value:"native",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "scan",\n "dataSource": {\n "type": "inline",\n "columnNames": ["country", "city"],\n "rows": [\n ["United States", "San Francisco"],\n ["Canada", "Calgary"]\n ]\n },\n "columns": ["country", "city"],\n "intervals": ["0000/3000"]\n}\n')))),(0,o.kt)("p",null,"Inline datasources allow you to query a small amount of data that is embedded in the query itself. They are useful when\nyou want to write a query on a small amount of data without loading it first. They are also useful as inputs into a\n",(0,o.kt)("a",{parentName:"p",href:"#join"},"join"),". Druid also uses them internally to handle subqueries that need to be inlined on the Broker. See the\n",(0,o.kt)("a",{parentName:"p",href:"#query"},(0,o.kt)("inlineCode",{parentName:"a"},"query")," datasource")," documentation for more details."),(0,o.kt)("p",null,"There are two fields in an inline datasource: an array of ",(0,o.kt)("inlineCode",{parentName:"p"},"columnNames")," and an array of ",(0,o.kt)("inlineCode",{parentName:"p"},"rows"),". Each row is an array\nthat must be exactly as long as the list of ",(0,o.kt)("inlineCode",{parentName:"p"},"columnNames"),". The first element in each row corresponds to the first\ncolumn in ",(0,o.kt)("inlineCode",{parentName:"p"},"columnNames"),", and so on."),(0,o.kt)("p",null,"Inline datasources are not available in Druid SQL."),(0,o.kt)("p",null,"Refer to the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/query-execution#inline"},"Query execution")," page for more details on how queries are executed when you\nuse inline datasources."),(0,o.kt)("h3",{id:"query"},(0,o.kt)("inlineCode",{parentName:"h3"},"query")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"8",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},"-- Uses a subquery to count hits per page, then takes the average.\nSELECT\n AVG(cnt) AS average_hits_per_page\nFROM\n (SELECT page, COUNT(*) AS hits FROM site_traffic GROUP BY page)\n"))),(0,o.kt)(l.Z,{value:"9",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "timeseries",\n "dataSource": {\n "type": "query",\n "query": {\n "queryType": "groupBy",\n "dataSource": "site_traffic",\n "intervals": ["0000/3000"],\n "granularity": "all",\n "dimensions": ["page"],\n "aggregations": [\n { "type": "count", "name": "hits" }\n ]\n }\n },\n "intervals": ["0000/3000"],\n "granularity": "all",\n "aggregations": [\n { "type": "longSum", "name": "hits", "fieldName": "hits" },\n { "type": "count", "name": "pages" }\n ],\n "postAggregations": [\n { "type": "expression", "name": "average_hits_per_page", "expression": "hits / pages" }\n ]\n}\n')))),(0,o.kt)("p",null,"Query datasources allow you to issue subqueries. In native queries, they can appear anywhere that accepts a\n",(0,o.kt)("inlineCode",{parentName:"p"},"dataSource")," (except underneath a ",(0,o.kt)("inlineCode",{parentName:"p"},"union"),"). In SQL, they can appear in the following places, always surrounded by parentheses:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"The FROM clause: ",(0,o.kt)("inlineCode",{parentName:"li"},"FROM (<subquery>)"),"."),(0,o.kt)("li",{parentName:"ul"},"As inputs to a JOIN: ",(0,o.kt)("inlineCode",{parentName:"li"},"<table-or-subquery-1> t1 INNER JOIN <table-or-subquery-2> t2 ON t1.<col1> = t2.<col2>"),"."),(0,o.kt)("li",{parentName:"ul"},"In the WHERE clause: ",(0,o.kt)("inlineCode",{parentName:"li"},"WHERE <column> { IN | NOT IN } (<subquery>)"),". These are translated to joins by the SQL planner.")),(0,o.kt)("admonition",{type:"info"},(0,o.kt)("p",{parentName:"admonition"}," Performance tip: In most cases, subquery results are fully buffered in memory on the Broker and then further\nprocessing occurs on the Broker itself. This means that subqueries with large result sets can cause performance\nbottlenecks or run into memory usage limits on the Broker. See the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/query-execution#query"},"Query execution"),"\npage for more details on how subqueries are executed and what limits will apply.")),(0,o.kt)("h3",{id:"join"},(0,o.kt)("inlineCode",{parentName:"h3"},"join")),(0,o.kt)(i.Z,{mdxType:"Tabs"},(0,o.kt)(l.Z,{value:"10",label:"SQL",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-sql"},'-- Joins "sales" with "countries" (using "store" as the join key) to get sales by country.\nSELECT\n store_to_country.v AS country,\n SUM(sales.revenue) AS country_revenue\nFROM\n sales\n INNER JOIN lookup.store_to_country ON sales.store = store_to_country.k\nGROUP BY\n countries.v\n'))),(0,o.kt)(l.Z,{value:"11",label:"Native",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "groupBy",\n "dataSource": {\n "type": "join",\n "left": "sales",\n "right": {\n "type": "lookup",\n "lookup": "store_to_country"\n },\n "rightPrefix": "r.",\n "condition": "store == \\"r.k\\"",\n "joinType": "INNER"\n },\n "intervals": ["0000/3000"],\n "granularity": "all",\n "dimensions": [\n { "type": "default", "outputName": "country", "dimension": "r.v" }\n ],\n "aggregations": [\n { "type": "longSum", "name": "country_revenue", "fieldName": "revenue" }\n ]\n}\n')))),(0,o.kt)("p",null,"Join datasources allow you to do a SQL-style join of two datasources. Stacking joins on top of each other allows\nyou to join arbitrarily many datasources."),(0,o.kt)("p",null,'Joins in native queries are implemented with a broadcast hash-join algorithm. This means\nthat all datasources other than the leftmost "base" datasource must fit in memory. In native queries, the join condition\nmust be an equality. In SQL, any join condition is accepted, but only equalities of a certain form\n(see ',(0,o.kt)("a",{parentName:"p",href:"#joins-in-sql"},"Joins in SQL"),") execute efficiently as part of a native join. For other kinds of conditions, planner will try\nto re-arrange condition such that some of the sub-conditions are evaluated as a filter on top of join and other\nsub-conditions are left out in the join condition. In worst case scenario, SQL will execute the join condition as a\ncross join (cartesian product) plus a filter."),(0,o.kt)("p",null,"This feature is intended mainly to allow joining regular Druid tables with ",(0,o.kt)("a",{parentName:"p",href:"#lookup"},"lookup"),", ",(0,o.kt)("a",{parentName:"p",href:"#inline"},"inline"),", and\n",(0,o.kt)("a",{parentName:"p",href:"#query"},"query")," datasources. Refer to the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/query-execution#join"},"Query execution")," page for more details on how\nqueries are executed when you use join datasources."),(0,o.kt)("h4",{id:"joins-in-sql"},"Joins in SQL"),(0,o.kt)("p",null,"SQL joins take the form:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"<o1> [ INNER | LEFT [OUTER] ] JOIN <o2> ON <condition>\n")),(0,o.kt)("p",null,"Any condition is accepted, but only certain kinds of conditions execute efficiently\nas a native join. The condition must be a single clause like the following, or an ",(0,o.kt)("inlineCode",{parentName:"p"},"AND")," of clauses involving at\nleast one of the following:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Equality between fields of the same type on each side, like ",(0,o.kt)("inlineCode",{parentName:"li"},"t1 JOIN t2 ON t1.x = t2.x"),"."),(0,o.kt)("li",{parentName:"ul"},"Equality between a function call on one side, and a field on the other side, like ",(0,o.kt)("inlineCode",{parentName:"li"},"t1 JOIN t2 ON LOWER(t1.x) = t2.x"),"."),(0,o.kt)("li",{parentName:"ul"},"The equality operator may be ",(0,o.kt)("inlineCode",{parentName:"li"},"=")," (which does not match nulls) or ",(0,o.kt)("inlineCode",{parentName:"li"},"IS NOT DISTINCT FROM")," (which does match nulls).")),(0,o.kt)("p",null,"In other cases, Druid will either insert a subquery below the join, or will use a cross join (cartesian product)\nfollowed by a filter. Joins executed in these ways may run into resource or performance constraints. To determine\nif your query is using one of these execution paths, run ",(0,o.kt)("inlineCode",{parentName:"p"},"EXPLAIN PLAN FOR <query>")," and look for the following:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"query")," type datasources under the ",(0,o.kt)("inlineCode",{parentName:"li"},"left")," or ",(0,o.kt)("inlineCode",{parentName:"li"},"right")," key of your ",(0,o.kt)("inlineCode",{parentName:"li"},"join")," datasource."),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"join")," type datasource with ",(0,o.kt)("inlineCode",{parentName:"li"},"condition")," set to ",(0,o.kt)("inlineCode",{parentName:"li"},'"1"')," (cartesian product) followed by a ",(0,o.kt)("inlineCode",{parentName:"li"},"filter")," that encodes the\ncondition you provided.")),(0,o.kt)("p",null,"In these cases, you may be able to improve the performance of your query by rewriting it."),(0,o.kt)("p",null,"For more information about how Druid translates SQL to native queries, refer to the\n",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/querying/sql-translation"},"Druid SQL")," documentation."),(0,o.kt)("h4",{id:"joins-in-native-queries"},"Joins in native queries"),(0,o.kt)("p",null,"Native join datasources have the following properties. All are required."),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Field"),(0,o.kt)("th",{parentName:"tr",align:null},"Description"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"left")),(0,o.kt)("td",{parentName:"tr",align:null},"Left-hand datasource. Must be of type ",(0,o.kt)("inlineCode",{parentName:"td"},"table"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"join"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"lookup"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"query"),", or ",(0,o.kt)("inlineCode",{parentName:"td"},"inline"),". Placing another join as the left datasource allows you to join arbitrarily many datasources.")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"right")),(0,o.kt)("td",{parentName:"tr",align:null},"Right-hand datasource. Must be of type ",(0,o.kt)("inlineCode",{parentName:"td"},"lookup"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"query"),", or ",(0,o.kt)("inlineCode",{parentName:"td"},"inline"),". Note that this is more rigid than what Druid SQL requires.")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"rightPrefix")),(0,o.kt)("td",{parentName:"tr",align:null},"String prefix that will be applied to all columns from the right-hand datasource, to prevent them from colliding with columns from the left-hand datasource. Can be any string, so long as it is nonempty and is not be a prefix of the string ",(0,o.kt)("inlineCode",{parentName:"td"},"__time"),". Any columns from the left-hand side that start with your ",(0,o.kt)("inlineCode",{parentName:"td"},"rightPrefix")," will be shadowed. It is up to you to provide a prefix that will not shadow any important columns from the left side.")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"condition")),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("a",{parentName:"td",href:"/docs/31.0.0/querying/math-expr"},"Expression")," that must be an equality where one side is an expression of the left-hand side, and the other side is a simple column reference to the right-hand side. Note that this is more rigid than what Druid SQL requires: here, the right-hand reference must be a simple column reference; in SQL it can be an expression.")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"joinType")),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},"INNER")," or ",(0,o.kt)("inlineCode",{parentName:"td"},"LEFT"),".")))),(0,o.kt)("h4",{id:"join-performance"},"Join performance"),(0,o.kt)("p",null,"Joins are a feature that can significantly affect performance of your queries. Some performance tips and notes:"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},"Joins are especially useful with ",(0,o.kt)("a",{parentName:"li",href:"#lookup"},"lookup datasources"),", but in most cases, the\n",(0,o.kt)("a",{parentName:"li",href:"/docs/31.0.0/querying/sql-scalar#string-functions"},(0,o.kt)("inlineCode",{parentName:"a"},"LOOKUP")," function")," performs better than a join. Consider using the ",(0,o.kt)("inlineCode",{parentName:"li"},"LOOKUP")," function if\nit is appropriate for your use case."),(0,o.kt)("li",{parentName:"ol"},"When using joins in Druid SQL, keep in mind that it can generate subqueries that you did not explicitly include in\nyour queries. Refer to the ",(0,o.kt)("a",{parentName:"li",href:"/docs/31.0.0/querying/sql-translation"},"Druid SQL")," documentation for more details about when this happens\nand how to detect it."),(0,o.kt)("li",{parentName:"ol"},"One common reason for implicit subquery generation is if the types of the two halves of an equality do not match.\nFor example, since lookup keys are always strings, the condition ",(0,o.kt)("inlineCode",{parentName:"li"},"druid.d JOIN lookup.l ON d.field = l.field")," will\nperform best if ",(0,o.kt)("inlineCode",{parentName:"li"},"d.field")," is a string."),(0,o.kt)("li",{parentName:"ol"},"The join operator must evaluate the condition for each row. In the future, we expect\nto implement both early and deferred condition evaluation, which we expect to improve performance considerably for\ncommon use cases."),(0,o.kt)("li",{parentName:"ol"},"Currently, Druid does not support pushing down predicates (condition and filter) past a Join (i.e. into\nJoin's children). Druid only supports pushing predicates into the join if they originated from\nabove the join. Hence, the location of predicates and filters in your Druid SQL is very important.\nAlso, as a result of this, comma joins should be avoided.")),(0,o.kt)("h4",{id:"future-work-for-joins"},"Future work for joins"),(0,o.kt)("p",null,"Joins are an area of active development in Druid. The following features are missing today but may appear in\nfuture versions:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Reordering of join operations to get the most performant plan."),(0,o.kt)("li",{parentName:"ul"},"Preloaded dimension tables that are wider than lookups (i.e. supporting more than a single key and single value)."),(0,o.kt)("li",{parentName:"ul"},"RIGHT OUTER and FULL OUTER joins in the native query engine. Currently, they are partially implemented. Queries run\nbut results are not always correct."),(0,o.kt)("li",{parentName:"ul"},"Performance-related optimizations as mentioned in the ",(0,o.kt)("a",{parentName:"li",href:"#join-performance"},"previous section"),"."),(0,o.kt)("li",{parentName:"ul"},"Join conditions on a column containing a multi-value dimension.")),(0,o.kt)("h3",{id:"unnest"},(0,o.kt)("inlineCode",{parentName:"h3"},"unnest")),(0,o.kt)("p",null,"Use the ",(0,o.kt)("inlineCode",{parentName:"p"},"unnest")," datasource to unnest a column with multiple values in an array.\nFor example, you have a source column that looks like this:"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Nested"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"[a, b]")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"[c, d]")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"[e, ","[f,g]","]")))),(0,o.kt)("p",null,"When you use the ",(0,o.kt)("inlineCode",{parentName:"p"},"unnest")," datasource, the unnested column looks like this:"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Unnested"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"a")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"b")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"c")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"d")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"e")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"[f, g]")))),(0,o.kt)("p",null,"When unnesting data, keep the following in mind:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"The total number of rows will grow to accommodate the new rows that the unnested data occupy."),(0,o.kt)("li",{parentName:"ul"},"You can unnest the values in more than one column in a single ",(0,o.kt)("inlineCode",{parentName:"li"},"unnest")," datasource, but this can lead to a very large number of new rows depending on your dataset.")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"unnest")," datasource uses the following syntax:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},' "dataSource": {\n "type": "unnest",\n "base": {\n "type": "table",\n "name": "nested_data"\n },\n "virtualColumn": {\n "type": "expression",\n "name": "output_column",\n "expression": "\\"column_reference\\""\n },\n "unnestFilter": "optional_filter"\n }\n')),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"dataSource.type"),": Set this to ",(0,o.kt)("inlineCode",{parentName:"li"},"unnest"),"."),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"dataSource.base"),": Defines the datasource you want to unnest.",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"dataSource.base.type"),": The type of datasource you want to unnest, such as a table."))),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"dataSource.virtualColumn"),": ",(0,o.kt)("a",{parentName:"li",href:"/docs/31.0.0/querying/virtual-columns"},"Virtual column")," that references the nested values. The output name of this column is reused as the name of the column that contains unnested values. You can replace the source column with the unnested column by specifying the source column's name or a new column by specifying a different name. Outputting it to a new column can help you verify that you get the results that you expect but isn't required."),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"unnestFilter"),": A filter only on the output column. You can omit this or set it to null if there are no filters.")),(0,o.kt)("p",null,"To learn more about how to use the ",(0,o.kt)("inlineCode",{parentName:"p"},"unnest")," datasource, see the ",(0,o.kt)("a",{parentName:"p",href:"/docs/31.0.0/tutorials/tutorial-unnest-arrays"},"unnest tutorial"),"."))}y.isMDXComponent=!0}}]); |