blob: ac00a4f38d5f0fde07c5ad79c0eef36c2addef5d [file] [log] [blame]
"use strict";(self.webpackChunk=self.webpackChunk||[]).push([[6603],{3905:(e,t,n)=>{n.d(t,{Zo:()=>c,kt:()=>f});var r=n(67294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?o(Object(n),!0).forEach((function(t){a(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):o(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function l(e,t){if(null==e)return{};var n,r,a=function(e,t){if(null==e)return{};var n,r,a={},o=Object.keys(e);for(r=0;r<o.length;r++)n=o[r],t.indexOf(n)>=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r<o.length;r++)n=o[r],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var p=r.createContext({}),s=function(e){var t=r.useContext(p),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=s(e.components);return r.createElement(p.Provider,{value:t},e.children)},d="mdxType",m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},u=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,p=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=s(n),u=a,f=d["".concat(p,".").concat(u)]||d[u]||m[u]||o;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function f(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=u;var l={};for(var p in t)hasOwnProperty.call(t,p)&&(l[p]=t[p]);l.originalType=e,l[d]="string"==typeof e?e:a,i[1]=l;for(var s=2;s<o;s++)i[s]=n[s];return r.createElement.apply(null,i)}return r.createElement.apply(null,n)}u.displayName="MDXCreateElement"},75837:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>c,contentTitle:()=>p,default:()=>f,frontMatter:()=>l,metadata:()=>s,toc:()=>d});var r=n(87462),a=n(63366),o=(n(67294),n(3905)),i=["components"],l={id:"orc",title:"ORC Extension"},p=void 0,s={unversionedId:"development/extensions-core/orc",id:"development/extensions-core/orc",title:"ORC Extension",description:"\x3c!--",source:"@site/docs/latest/development/extensions-core/orc.md",sourceDirName:"development/extensions-core",slug:"/development/extensions-core/orc",permalink:"/docs/latest/development/extensions-core/orc",draft:!1,tags:[],version:"current",frontMatter:{id:"orc",title:"ORC Extension"}},c={},d=[{value:"ORC extension",id:"orc-extension",level:2},{value:"Migration from &#39;contrib&#39; extension",id:"migration-from-contrib-extension",level:3}],m={toc:d},u="wrapper";function f(e){var t=e.components,n=(0,a.Z)(e,i);return(0,o.kt)(u,(0,r.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"orc-extension"},"ORC extension"),(0,o.kt)("p",null,"This Apache Druid extension enables Druid to ingest and understand the Apache ORC data format."),(0,o.kt)("p",null,"The extension provides the ",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/ingestion/data-formats#orc"},"ORC input format")," and the ",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/ingestion/data-formats#orc-hadoop-parser"},"ORC Hadoop parser"),"\nfor ",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/ingestion/native-batch"},"native batch ingestion")," and ",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/ingestion/hadoop"},"Hadoop batch ingestion"),", respectively.\nPlease see corresponding docs for details."),(0,o.kt)("p",null,"To use this extension, make sure to ",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/configuration/extensions#loading-extensions"},"include")," ",(0,o.kt)("inlineCode",{parentName:"p"},"druid-orc-extensions")," in the extensions load list."),(0,o.kt)("h3",{id:"migration-from-contrib-extension"},"Migration from 'contrib' extension"),(0,o.kt)("p",null,"This extension, first available in version 0.15.0, replaces the previous 'contrib' extension which was available until\n0.14.0-incubating. While this extension can index any data the 'contrib' extension could, the JSON spec for the\ningestion task is ",(0,o.kt)("em",{parentName:"p"},"incompatible"),", and will need modified to work with the newer 'core' extension."),(0,o.kt)("p",null,"To migrate to 0.15.0+:"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"In ",(0,o.kt)("inlineCode",{parentName:"li"},"inputSpec")," of ",(0,o.kt)("inlineCode",{parentName:"li"},"ioConfig"),", ",(0,o.kt)("inlineCode",{parentName:"li"},"inputFormat")," must be changed from ",(0,o.kt)("inlineCode",{parentName:"li"},'"org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat"')," to\n",(0,o.kt)("inlineCode",{parentName:"li"},'"org.apache.orc.mapreduce.OrcInputFormat"')),(0,o.kt)("li",{parentName:"ul"},"The 'contrib' extension supported a ",(0,o.kt)("inlineCode",{parentName:"li"},"typeString")," property, which provided the schema of the\nORC file, of which was essentially required to have the types correct, but notably ",(0,o.kt)("em",{parentName:"li"},"not")," the column names, which\nfacilitated column renaming. In the 'core' extension, column renaming can be achieved with\n",(0,o.kt)("a",{parentName:"li",href:"/docs/latest/ingestion/ingestion-spec#flattenspec"},(0,o.kt)("inlineCode",{parentName:"a"},"flattenSpec")),". For example, ",(0,o.kt)("inlineCode",{parentName:"li"},'"typeString":"struct<time:string,name:string>"'),"\nwith the actual schema ",(0,o.kt)("inlineCode",{parentName:"li"},"struct<_col0:string,_col1:string>"),", to preserve Druid schema would need replaced with:")),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-json"},'"flattenSpec": {\n "fields": [\n {\n "type": "path",\n "name": "time",\n "expr": "$._col0"\n },\n {\n "type": "path",\n "name": "name",\n "expr": "$._col1"\n }\n ]\n ...\n}\n')),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("p",{parentName:"li"},"The 'contrib' extension supported a ",(0,o.kt)("inlineCode",{parentName:"p"},"mapFieldNameFormat")," property, which provided a way to specify a dimension to\nflatten ",(0,o.kt)("inlineCode",{parentName:"p"},"OrcMap")," columns with primitive types. This functionality has also been replaced with\n",(0,o.kt)("a",{parentName:"p",href:"/docs/latest/ingestion/ingestion-spec#flattenspec"},(0,o.kt)("inlineCode",{parentName:"a"},"flattenSpec")),". For example: ",(0,o.kt)("inlineCode",{parentName:"p"},'"mapFieldNameFormat": "<PARENT>_<CHILD>"'),"\nfor a dimension ",(0,o.kt)("inlineCode",{parentName:"p"},"nestedData_dim1"),", to preserve Druid schema could be replaced with"),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-json"},'"flattenSpec": {\n "fields": [\n {\n "type": "path",\n "name": "nestedData_dim1",\n "expr": "$.nestedData.dim1"\n }\n ]\n ...\n}\n')))),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"")))}f.isMDXComponent=!0}}]);