| "use strict";(self.webpackChunk=self.webpackChunk||[]).push([[3856],{3905:(e,t,n)=>{n.d(t,{Zo:()=>p,kt:()=>f});var r=n(67294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function l(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?i(Object(n),!0).forEach((function(t){a(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):i(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function o(e,t){if(null==e)return{};var n,r,a=function(e,t){if(null==e)return{};var n,r,a={},i=Object.keys(e);for(r=0;r<i.length;r++)n=i[r],t.indexOf(n)>=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r<i.length;r++)n=i[r],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),m=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},p=function(e){var t=m(e.components);return r.createElement(s.Provider,{value:t},e.children)},u="mdxType",d={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},c=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,s=e.parentName,p=o(e,["components","mdxType","originalType","parentName"]),u=m(n),c=a,f=u["".concat(s,".").concat(c)]||u[c]||d[c]||i;return n?r.createElement(f,l(l({ref:t},p),{},{components:n})):r.createElement(f,l({ref:t},p))}));function f(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,l=new Array(i);l[0]=c;var o={};for(var s in t)hasOwnProperty.call(t,s)&&(o[s]=t[s]);o.originalType=e,o[u]="string"==typeof e?e:a,l[1]=o;for(var m=2;m<i;m++)l[m]=n[m];return r.createElement.apply(null,l)}return r.createElement.apply(null,n)}c.displayName="MDXCreateElement"},90286:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>p,contentTitle:()=>s,default:()=>f,frontMatter:()=>o,metadata:()=>m,toc:()=>u});var r=n(87462),a=n(63366),i=(n(67294),n(3905)),l=["components"],o={id:"bloom-filter",title:"Bloom Filter"},s=void 0,m={unversionedId:"development/extensions-core/bloom-filter",id:"development/extensions-core/bloom-filter",title:"Bloom Filter",description:"\x3c!--",source:"@site/docs/32.0.1/development/extensions-core/bloom-filter.md",sourceDirName:"development/extensions-core",slug:"/development/extensions-core/bloom-filter",permalink:"/docs/32.0.1/development/extensions-core/bloom-filter",draft:!1,tags:[],version:"current",frontMatter:{id:"bloom-filter",title:"Bloom Filter"}},p={},u=[{value:"Filter queries with a Bloom filter",id:"filter-queries-with-a-bloom-filter",level:2},{value:"JSON specification",id:"json-specification",level:3},{value:"Serialized format for BloomKFilter",id:"serialized-format-for-bloomkfilter",level:3},{value:"Filter SQL queries",id:"filter-sql-queries",level:3},{value:"Expression and virtual column support",id:"expression-and-virtual-column-support",level:3},{value:"Bloom filter query aggregator",id:"bloom-filter-query-aggregator",level:2},{value:"JSON specification",id:"json-specification-1",level:3},{value:"Example",id:"example",level:3},{value:"SQL Bloom filter aggregator",id:"sql-bloom-filter-aggregator",level:3}],d={toc:u},c="wrapper";function f(e){var t=e.components,n=(0,a.Z)(e,l);return(0,i.kt)(c,(0,r.Z)({},d,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)("p",null,"To use the Apache Druid","\xae"," Bloom filter extension, include ",(0,i.kt)("inlineCode",{parentName:"p"},"druid-bloom-filter")," in the extensions load list. See ",(0,i.kt)("a",{parentName:"p",href:"/docs/32.0.1/configuration/extensions#loading-extensions"},"Loading extensions")," for more information."),(0,i.kt)("p",null,"This extension adds the abilities to construct Bloom filters from query results and to filter query results by testing\nagainst a Bloom filter. A Bloom filter is a probabilistic data structure to check for set membership. A Bloom\nfilter is a good candidate to use when an explicit filter is impossible, such as filtering a query\nagainst a set of millions of values."),(0,i.kt)("p",null,"Following are some characteristics of Bloom filters:"),(0,i.kt)("ul",null,(0,i.kt)("li",{parentName:"ul"},"Bloom filters are significantly more space efficient than HashSets."),(0,i.kt)("li",{parentName:"ul"},"Because they are probabilistic, false positive results are possible with Bloom filters. For example, the ",(0,i.kt)("inlineCode",{parentName:"li"},"test()")," function might return ",(0,i.kt)("inlineCode",{parentName:"li"},"true")," for an element that is not within the filter."),(0,i.kt)("li",{parentName:"ul"},"False negatives are not possible. If an element is present, ",(0,i.kt)("inlineCode",{parentName:"li"},"test()")," always returns ",(0,i.kt)("inlineCode",{parentName:"li"},"true"),"."),(0,i.kt)("li",{parentName:"ul"},"The false positive probability of this implementation is fixed at 5%. Increasing the number of entries that the filter can hold can decrease this false positive rate in exchange for overall size."),(0,i.kt)("li",{parentName:"ul"},"Bloom filters are sensitive to the number of inserted elements. You must specify the expected number of entries at creation time. If the number of insertions exceeds the specified number of entries, the false positive probability increases accordingly.")),(0,i.kt)("p",null,"This extension is based on ",(0,i.kt)("inlineCode",{parentName:"p"},"org.apache.hive.common.util.BloomKFilter")," from ",(0,i.kt)("inlineCode",{parentName:"p"},"hive-storage-api"),". Internally,\nthis implementation uses Murmur3 as the hash algorithm."),(0,i.kt)("p",null,"The following Java example shows how to construct a BloomKFilter externally:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-java"},'BloomKFilter bloomFilter = new BloomKFilter(1500);\nbloomFilter.addString("value 1");\nbloomFilter.addString("value 2");\nbloomFilter.addString("value 3");\nByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();\nBloomKFilter.serialize(byteArrayOutputStream, bloomFilter);\nString base64Serialized = Base64.encodeBase64String(byteArrayOutputStream.toByteArray());\n')),(0,i.kt)("p",null,"You can then use the Base64 encoded string in JSON-based or SQL-based queries in Druid."),(0,i.kt)("h2",{id:"filter-queries-with-a-bloom-filter"},"Filter queries with a Bloom filter"),(0,i.kt)("h3",{id:"json-specification"},"JSON specification"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-json"},'{\n "type" : "bloom",\n "dimension" : <dimension_name>,\n "bloomKFilter" : <serialized_bytes_for_BloomKFilter>,\n "extractionFn" : <extraction_fn>\n}\n')),(0,i.kt)("table",null,(0,i.kt)("thead",{parentName:"table"},(0,i.kt)("tr",{parentName:"thead"},(0,i.kt)("th",{parentName:"tr",align:null},"Property"),(0,i.kt)("th",{parentName:"tr",align:null},"Description"),(0,i.kt)("th",{parentName:"tr",align:null},"Required"))),(0,i.kt)("tbody",{parentName:"table"},(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"type")),(0,i.kt)("td",{parentName:"tr",align:null},"Filter type. Set to ",(0,i.kt)("inlineCode",{parentName:"td"},"bloom"),"."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"dimension")),(0,i.kt)("td",{parentName:"tr",align:null},"Dimension to filter over."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"bloomKFilter")),(0,i.kt)("td",{parentName:"tr",align:null},"Base64 encoded binary representation of ",(0,i.kt)("inlineCode",{parentName:"td"},"org.apache.hive.common.util.BloomKFilter"),"."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"extractionFn")),(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("a",{parentName:"td",href:"/docs/32.0.1/querying/dimensionspecs#extraction-functions"},"Extraction function")," to apply to the dimension values."),(0,i.kt)("td",{parentName:"tr",align:null},"No")))),(0,i.kt)("h3",{id:"serialized-format-for-bloomkfilter"},"Serialized format for BloomKFilter"),(0,i.kt)("p",null,"Serialized BloomKFilter format:"),(0,i.kt)("ul",null,(0,i.kt)("li",{parentName:"ul"},"1 byte for the number of hash functions."),(0,i.kt)("li",{parentName:"ul"},"1 big-endian integer for the number of longs in the bitset."),(0,i.kt)("li",{parentName:"ul"},"Big-endian longs in the BloomKFilter bitset.")),(0,i.kt)("p",null,(0,i.kt)("inlineCode",{parentName:"p"},"org.apache.hive.common.util.BloomKFilter")," provides a method to serialize Bloom filters to ",(0,i.kt)("inlineCode",{parentName:"p"},"outputStream"),"."),(0,i.kt)("h3",{id:"filter-sql-queries"},"Filter SQL queries"),(0,i.kt)("p",null,"You can use Bloom filters in SQL ",(0,i.kt)("inlineCode",{parentName:"p"},"WHERE")," clauses with the ",(0,i.kt)("inlineCode",{parentName:"p"},"bloom_filter_test")," operator:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT COUNT(*) FROM druid.foo WHERE bloom_filter_test(<expr>, '<serialized_bytes_for_BloomKFilter>')\n")),(0,i.kt)("h3",{id:"expression-and-virtual-column-support"},"Expression and virtual column support"),(0,i.kt)("p",null,"The Bloom filter extension also adds a Bloom filter ",(0,i.kt)("a",{parentName:"p",href:"/docs/32.0.1/querying/math-expr"},"Druid expression")," which shares syntax\nwith the SQL operator."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-sql"},"bloom_filter_test(<expr>, '<serialized_bytes_for_BloomKFilter>')\n")),(0,i.kt)("h2",{id:"bloom-filter-query-aggregator"},"Bloom filter query aggregator"),(0,i.kt)("p",null,"You can create an input for a ",(0,i.kt)("inlineCode",{parentName:"p"},"BloomKFilter")," from a Druid query with the ",(0,i.kt)("inlineCode",{parentName:"p"},"bloom")," aggregator. Make sure to set a reasonable value for the ",(0,i.kt)("inlineCode",{parentName:"p"},"maxNumEntries")," parameter to specify the maximum number of distinct entries that the Bloom filter can represent without increasing the false positive rate. Try performing a query using\none of the unique count sketches to calculate the value for this parameter to build a Bloom filter appropriate for the query."),(0,i.kt)("h3",{id:"json-specification-1"},"JSON specification"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-json"},'{\n "type": "bloom",\n "name": <output_field_name>,\n "maxNumEntries": <maximum_number_of_elements_for_BloomKFilter>\n "field": <dimension_spec>\n }\n')),(0,i.kt)("table",null,(0,i.kt)("thead",{parentName:"table"},(0,i.kt)("tr",{parentName:"thead"},(0,i.kt)("th",{parentName:"tr",align:null},"Property"),(0,i.kt)("th",{parentName:"tr",align:null},"Description"),(0,i.kt)("th",{parentName:"tr",align:null},"Required"))),(0,i.kt)("tbody",{parentName:"table"},(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"type")),(0,i.kt)("td",{parentName:"tr",align:null},"Aggregator type. Set to ",(0,i.kt)("inlineCode",{parentName:"td"},"bloom"),"."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"name")),(0,i.kt)("td",{parentName:"tr",align:null},"Output field name."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"field")),(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("a",{parentName:"td",href:"/docs/32.0.1/querying/dimensionspecs"},"DimensionSpec")," to add to ",(0,i.kt)("inlineCode",{parentName:"td"},"org.apache.hive.common.util.BloomKFilter"),"."),(0,i.kt)("td",{parentName:"tr",align:null},"Yes")),(0,i.kt)("tr",{parentName:"tbody"},(0,i.kt)("td",{parentName:"tr",align:null},(0,i.kt)("inlineCode",{parentName:"td"},"maxNumEntries")),(0,i.kt)("td",{parentName:"tr",align:null},"Maximum number of distinct values supported by ",(0,i.kt)("inlineCode",{parentName:"td"},"org.apache.hive.common.util.BloomKFilter"),". Defaults to ",(0,i.kt)("inlineCode",{parentName:"td"},"1500"),"."),(0,i.kt)("td",{parentName:"tr",align:null},"No")))),(0,i.kt)("h3",{id:"example"},"Example"),(0,i.kt)("p",null,"The following example shows a timeseries query object with a ",(0,i.kt)("inlineCode",{parentName:"p"},"bloom")," aggregator:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-json"},'{\n "queryType": "timeseries",\n "dataSource": "wikiticker",\n "intervals": [ "2015-09-12T00:00:00.000/2015-09-13T00:00:00.000" ],\n "granularity": "day",\n "aggregations": [\n {\n "type": "bloom",\n "name": "userBloom",\n "maxNumEntries": 100000,\n "field": {\n "type":"default",\n "dimension":"user",\n "outputType": "STRING"\n }\n }\n ]\n}\n')),(0,i.kt)("p",null,"Example response:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-json"},'[\n {\n "timestamp":"2015-09-12T00:00:00.000Z",\n "result":{"userBloom":"BAAAJhAAAA..."}\n }\n]\n')),(0,i.kt)("p",null,"We recommend ordering by an alternative aggregation method instead of ordering results by a Bloom filter aggregator.\nOrdering results by a Bloom filter aggregator can be resource-intensive because Druid performs an expensive linear scan of the filter to approximate the count of items added to the set by counting the number of set bits. "),(0,i.kt)("h3",{id:"sql-bloom-filter-aggregator"},"SQL Bloom filter aggregator"),(0,i.kt)("p",null,"You can compute Bloom filters in SQL expressions with the BLOOM_FILTER aggregator. For example:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-sql"},"SELECT BLOOM_FILTER(<expression>, <max number of entries>) FROM druid.foo WHERE dim2 = 'abc'\n")),(0,i.kt)("p",null,"Druid serializes Bloom filter results in a SQL response into a Base64 string. You can use the resulting string in subsequent queries as a filter."))}f.isMDXComponent=!0}}]); |