"use strict";(self.webpackChunk=self.webpackChunk||[]).push([[6630],{15680:(e,t,n)=>{n.d(t,{xA:()=>p,yg:()=>h});var o=n(96540);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function r(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);t&&(o=o.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,o)}return n}function i(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?r(Object(n),!0).forEach((function(t){a(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):r(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function s(e,t){if(null==e)return{};var n,o,a=function(e,t){if(null==e)return{};var n,o,a={},r=Object.keys(e);for(o=0;o<r.length;o++)n=r[o],t.indexOf(n)>=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(o=0;o<r.length;o++)n=r[o],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=o.createContext({}),c=function(e){var t=o.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return o.createElement(l.Provider,{value:t},e.children)},d="mdxType",g={inlineCode:"code",wrapper:function(e){var t=e.children;return o.createElement(o.Fragment,{},t)}},u=o.forwardRef((function(e,t){var n=e.components,a=e.mdxType,r=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),u=a,h=d["".concat(l,".").concat(u)]||d[u]||g[u]||r;return n?o.createElement(h,i(i({ref:t},p),{},{components:n})):o.createElement(h,i({ref:t},p))}));function h(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var r=n.length,i=new Array(r);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s[d]="string"==typeof e?e:a,i[1]=s;for(var c=2;c<r;c++)i[c]=n[c];return o.createElement.apply(null,i)}return o.createElement.apply(null,n)}u.displayName="MDXCreateElement"},22866:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>p,contentTitle:()=>l,default:()=>h,frontMatter:()=>s,metadata:()=>c,toc:()=>d});var o=n(58168),a=n(98587),r=(n(96540),n(15680)),i=["components"],s={id:"iceberg",title:"Iceberg extension"},l=void 0,c={unversionedId:"development/extensions-contrib/iceberg",id:"development/extensions-contrib/iceberg",title:"Iceberg extension",description:"\x3c!--",source:"@site/docs/29.0.0/development/extensions-contrib/iceberg.md",sourceDirName:"development/extensions-contrib",slug:"/development/extensions-contrib/iceberg",permalink:"/docs/29.0.0/development/extensions-contrib/iceberg",draft:!1,tags:[],version:"current",frontMatter:{id:"iceberg",title:"Iceberg extension"}},p={},d=[{value:"Iceberg Ingest extension",id:"iceberg-ingest-extension",level:2},{value:"Hive metastore catalog",id:"hive-metastore-catalog",level:2},{value:"Read from HDFS warehouse",id:"read-from-hdfs-warehouse",level:3},{value:"Read from S3 warehouse",id:"read-from-s3-warehouse",level:3},{value:"Local catalog",id:"local-catalog",level:2},{value:"Downloading Iceberg extension",id:"downloading-iceberg-extension",level:2},{value:"Known limitations",id:"known-limitations",level:2}],g={toc:d},u="wrapper";function h(e){var 
## Iceberg Ingest extension

Apache Iceberg is an open table format for huge analytic datasets. [IcebergInputSource](/docs/29.0.0/ingestion/input-sources#iceberg-input-source) lets you ingest data stored in the Iceberg table format into Apache Druid. To use the Iceberg extension, add `druid-iceberg-extensions` to the list of loaded extensions. See [Loading extensions](/docs/29.0.0/configuration/extensions#loading-extensions) for more information.

Iceberg manages most of its metadata in metadata files in object storage, but it still depends on a metastore to manage a certain amount of metadata.
Iceberg refers to these metastores as catalogs. The Iceberg extension lets you connect to the following Iceberg catalog types:

* Hive metastore catalog
* Local catalog

Druid does not yet support AWS Glue or REST-based catalogs.

For a given catalog, the Iceberg input source reads the table name from the catalog, applies any filters, and extracts all the underlying live data files up to the latest snapshot.
The data files can be in Parquet, ORC, or Avro formats, and they typically reside in a warehouse location in HDFS, S3, or the local filesystem.
The `druid-iceberg-extensions` extension relies on the existing input source connectors in Druid to read the data files from the warehouse. The Iceberg input source is therefore an intermediate input source: it provides file paths to other input source implementations.
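To make this flow concrete, the following is a minimal sketch of how the Iceberg input source might sit inside a native batch `ioConfig`, assuming a Hive catalog with an HDFS warehouse. The table name, namespace, metastore URI, and warehouse path are hypothetical placeholders; refer to [Iceberg input source](/docs/29.0.0/ingestion/input-sources#iceberg-input-source) for the authoritative list of properties.

```json
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "iceberg",
    "tableName": "sales_events",
    "namespace": "analytics",
    "icebergCatalog": {
      "type": "hive",
      "warehousePath": "hdfs://namenode:8020/warehouse",
      "catalogUri": "thrift://hive-metastore:9083"
    },
    "warehouseSource": {
      "type": "hdfs"
    }
  },
  "inputFormat": {
    "type": "parquet"
  }
}
```

In this layout, the Iceberg input source only resolves live data file paths from the catalog; the nested `warehouseSource` performs the actual reads, and `inputFormat` describes the format of the data files.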
## Hive metastore catalog

For Druid to talk to the Hive metastore, ensure that the Hive configuration files such as `hive-site.xml` and `core-site.xml` are available on the Druid classpath for peon processes.
You can also specify Hive properties under the `catalogProperties` object in the ingestion spec.

The `druid-iceberg-extensions` extension currently supports only HDFS, S3, and local warehouse directories.

### Read from HDFS warehouse

To read from an HDFS warehouse, load the `druid-hdfs-storage` extension. Druid extracts data file paths from the Hive metastore catalog and uses the [HDFS input source](/docs/29.0.0/ingestion/input-sources#hdfs-input-source) to ingest these files.
Set the `warehouseSource` type in the ingestion spec to `hdfs`.

To authenticate with Kerberized clusters, include `principal` and `keytab` properties in the `catalogProperties` object:

```json
"catalogProperties": {
  "principal": "krb_principal",
  "keytab": "/path/to/keytab"
}
```

Only Kerberos-based authentication is currently supported.

### Read from S3 warehouse

To read from an S3 warehouse, load the `druid-s3-extensions` extension. Druid extracts the data file paths from the Hive metastore catalog and uses `S3InputSource` to ingest these files.
Set the `type` property of the `warehouseSource` object to `s3` in the ingestion spec. If the S3 endpoint for the warehouse differs from the endpoint configured for deep storage, include the following properties in the `warehouseSource` object to define the S3 endpoint settings:

```json
"warehouseSource": {
  "type": "s3",
  "endpointConfig": {
    "url": "S3_ENDPOINT_URL",
    "signingRegion": "us-east-1"
  },
  "clientConfig": {
    "protocol": "http",
    "disableChunkedEncoding": true,
    "enablePathStyleAccess": true,
    "forceGlobalBucketAccessEnabled": false
  },
  "properties": {
    "accessKeyId": {
      "type": "default",
      "password": "<ACCESS_KEY_ID>"
    },
    "secretAccessKey": {
      "type": "default",
      "password": "<SECRET_ACCESS_KEY>"
    }
  }
}
```

This extension uses the [Hadoop AWS module](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/) to connect to S3 and retrieve the metadata and data file paths.
The following properties are required in `catalogProperties`:

```json
"catalogProperties": {
  "fs.s3a.access.key": "S3_ACCESS_KEY",
  "fs.s3a.secret.key": "S3_SECRET_KEY",
  "fs.s3a.endpoint": "S3_API_ENDPOINT"
}
```

Because the Hadoop AWS connector uses the `s3a` filesystem client, specify the warehouse path with the `s3a://` protocol instead of `s3://`.
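As a rough sketch of how the fragments above combine, a Hive catalog pointed at an S3 warehouse could be configured like this inside the Iceberg input source. The bucket name, metastore URI, and credential values are placeholders, and the property names mirror the fragments shown earlier in this section.

```json
"inputSource": {
  "type": "iceberg",
  "tableName": "sales_events",
  "namespace": "analytics",
  "icebergCatalog": {
    "type": "hive",
    "warehousePath": "s3a://my-warehouse-bucket/iceberg",
    "catalogUri": "thrift://hive-metastore:9083",
    "catalogProperties": {
      "fs.s3a.access.key": "S3_ACCESS_KEY",
      "fs.s3a.secret.key": "S3_SECRET_KEY",
      "fs.s3a.endpoint": "S3_API_ENDPOINT"
    }
  },
  "warehouseSource": {
    "type": "s3"
  }
}
```

The `catalogProperties` credentials are used by the Hadoop AWS module to read the Iceberg metadata and resolve data file paths, while the `warehouseSource` settings govern how Druid reads the data files themselves.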
## Local catalog

The local catalog type can be used for catalogs configured on the local filesystem. Set the `icebergCatalog` type to `local`. You can use this catalog for demos or localized tests; it is not recommended for production use cases.
Set `warehouseSource` to `local`, because this catalog only supports reading from a local filesystem.

## Downloading Iceberg extension

To download `druid-iceberg-extensions`, run the following command after replacing `<VERSION>` with the desired
Druid version:

```shell
java \
  -cp "lib/*" \
  -Ddruid.extensions.directory="extensions" \
  -Ddruid.extensions.hadoopDependenciesDir="hadoop-dependencies" \
  org.apache.druid.cli.Main tools pull-deps \
  --no-default-hadoop \
  -c "org.apache.druid.extensions.contrib:druid-iceberg-extensions:<VERSION>"
```

See [Loading community extensions](/docs/29.0.0/configuration/extensions#loading-community-extensions) for more information.

## Known limitations

This section lists the known limitations that apply to the Iceberg extension.

* This extension does not fully utilize Iceberg features such as snapshotting or schema evolution.
* The Iceberg input source reads every live data file in the Iceberg table up to the latest snapshot, which makes the table scan less performant. Use Iceberg filters on partition columns in the ingestion spec to limit the number of data files retrieved, as shown in the sketch after this list. Because Druid does not store the last ingested Iceberg snapshot ID, it cannot identify only the files created between that snapshot and the latest snapshot on Iceberg.
* It does not handle Iceberg [schema evolution](https://iceberg.apache.org/docs/latest/evolution/) yet. If an existing Iceberg table column is deleted and recreated with the same name, ingesting the table into Druid may bring in data for that column from before it was deleted.
* The Hive catalog has not been tested on Hadoop 2.x.x and is not guaranteed to work with Hadoop 2.
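For the filtering recommendation above, a minimal sketch of an `icebergFilter` that restricts ingestion to a single day on a hypothetical partition column `event_time` might look like the following. The `interval` filter type and the column name here are illustrative; see [Iceberg input source](/docs/29.0.0/ingestion/input-sources#iceberg-input-source) for the filter types the extension actually supports.

```json
"icebergFilter": {
  "type": "interval",
  "filterColumn": "event_time",
  "intervals": ["2024-01-01T00:00:00.000Z/2024-01-02T00:00:00.000Z"]
}
```

The intent is that the filter prunes data files during Iceberg's scan planning, so fewer file paths are handed to the warehouse input source.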