// blob: 55828dfcaaae0b5537b2c91d7d66b0f3786b26ff [file] [log] [blame]
/*
 * Webpack chunk 3685 (generated bundle output, reformatted for readability).
 *
 * Registers two modules on the shared `self.webpackChunk` array:
 *   - 15680: helpers that create elements tagged with an `mdxType` and
 *            resolve them against a components context. Exports `xA`
 *            (the context provider) and `yg` (the element factory).
 *            NOTE(review): looks like the bundled MDX React runtime - confirm.
 *   - 18969: the compiled "ORC Extension" documentation page. Exports
 *            `frontMatter`, `contentTitle`, `metadata`, `assets`, `toc` and
 *            the page component as `default`.
 *
 * Everything is kept inside the module factory functions so that no
 * top-level binding leaks out of this script.
 */
"use strict";
(self.webpackChunk = self.webpackChunk || []).push([
  [3685],
  {
    15680: (unusedModule, exportsObj, webpackRequire) => {
      // Exports are registered as getters, so they resolve lazily to the
      // bindings declared further down in this factory.
      webpackRequire.d(exportsObj, {
        xA: () => MDXProvider,
        yg: () => createMdxElement,
      });

      // Module 96540 supplies createContext / useContext / createElement /
      // forwardRef / Fragment (presumably React - confirm against the bundle).
      const React = webpackRequire(96540);

      /**
       * Sets `target[key] = value`. When the key already exists anywhere on the
       * target (own or inherited) it is (re)defined as a plain enumerable,
       * configurable, writable data property instead of being assigned.
       * Returns `target`.
       */
      function setOwnProperty(target, key, value) {
        if (key in target) {
          Object.defineProperty(target, key, {
            value: value,
            enumerable: true,
            configurable: true,
            writable: true,
          });
        } else {
          target[key] = value;
        }
        return target;
      }

      /**
       * Returns the enumerable string keys of `object` followed by its own
       * symbol keys (only the enumerable ones when `enumerableOnly` is truthy).
       * Symbols are skipped when the runtime lacks getOwnPropertySymbols.
       */
      function listOwnKeys(object, enumerableOnly) {
        const keys = Object.keys(object);
        if (Object.getOwnPropertySymbols) {
          let symbols = Object.getOwnPropertySymbols(object);
          if (enumerableOnly) {
            symbols = symbols.filter(
              (symbol) => Object.getOwnPropertyDescriptor(object, symbol).enumerable
            );
          }
          keys.push.apply(keys, symbols);
        }
        return keys;
      }

      /**
       * Copies the properties of every following argument into `target` and
       * returns `target`. Null/undefined sources are treated as `{}`.
       * Sources at odd argument positions are copied by value; sources at even
       * positions are copied by property descriptor.
       */
      function mergeInto(target) {
        for (let position = 1; position < arguments.length; position++) {
          const source = arguments[position] != null ? arguments[position] : {};
          if (position % 2) {
            listOwnKeys(Object(source), true).forEach((key) => {
              setOwnProperty(target, key, source[key]);
            });
          } else if (Object.getOwnPropertyDescriptors) {
            Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
          } else {
            listOwnKeys(Object(source)).forEach((key) => {
              Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
            });
          }
        }
        return target;
      }

      /**
       * Returns a shallow copy of `source` without the keys listed in
       * `excluded`. Enumerable own symbol keys are copied as well.
       * A null/undefined source yields `{}`.
       */
      function omitKeys(source, excluded) {
        if (source == null) {
          return {};
        }
        const picked = {};
        const stringKeys = Object.keys(source);
        for (let index = 0; index < stringKeys.length; index++) {
          const key = stringKeys[index];
          if (excluded.indexOf(key) < 0) {
            picked[key] = source[key];
          }
        }
        if (Object.getOwnPropertySymbols) {
          const symbolKeys = Object.getOwnPropertySymbols(source);
          for (let index = 0; index < symbolKeys.length; index++) {
            const symbol = symbolKeys[index];
            if (
              excluded.indexOf(symbol) < 0 &&
              Object.prototype.propertyIsEnumerable.call(source, symbol)
            ) {
              picked[symbol] = source[symbol];
            }
          }
        }
        return picked;
      }

      // Context carrying the component map; defaults to an empty map.
      const MDXContext = React.createContext({});

      /**
       * Resolves the effective component map: the context value, optionally
       * transformed by a function or shallow-merged with an override object.
       */
      const useMDXComponents = function (components) {
        const inherited = React.useContext(MDXContext);
        let resolved = inherited;
        if (components) {
          resolved =
            typeof components === "function"
              ? components(inherited)
              : mergeInto(mergeInto({}, inherited), components);
        }
        return resolved;
      };

      /** Provider that publishes the resolved component map to its children. */
      const MDXProvider = function (props) {
        const resolved = useMDXComponents(props.components);
        return React.createElement(MDXContext.Provider, { value: resolved }, props.children);
      };

      const MDX_TYPE_KEY = "mdxType";

      // Fallbacks used when the component map has no entry for an mdxType.
      const DEFAULT_COMPONENTS = {
        inlineCode: "code",
        wrapper: function (props) {
          const children = props.children;
          return React.createElement(React.Fragment, {}, children);
        },
      };

      /**
       * Picks the component to render for an element. Lookup order:
       * "<parentName>.<mdxType>" in the component map, then "<mdxType>" in the
       * map, then the built-in defaults, then the element's original type.
       * The bookkeeping props are stripped before rendering; `components` is
       * passed through only when it was supplied.
       */
      const MDXCreateElement = React.forwardRef(function (props, ref) {
        const components = props.components;
        const mdxType = props.mdxType;
        const originalType = props.originalType;
        const parentName = props.parentName;
        const passthrough = omitKeys(props, [
          "components",
          "mdxType",
          "originalType",
          "parentName",
        ]);
        const available = useMDXComponents(components);
        const Component =
          available[`${parentName}.${mdxType}`] ||
          available[mdxType] ||
          DEFAULT_COMPONENTS[mdxType] ||
          originalType;

        if (components) {
          return React.createElement(
            Component,
            mergeInto(mergeInto({ ref: ref }, passthrough), {}, { components: components })
          );
        }
        return React.createElement(Component, mergeInto({ ref: ref }, passthrough));
      });

      /**
       * createElement replacement. For string types, or when the props carry an
       * `mdxType`, the element is rendered through MDXCreateElement with
       * `originalType` and `mdxType` added to a copy of the props. Anything
       * else is forwarded to createElement untouched.
       *
       * `arguments` is used on purpose so the exact argument count (including
       * the children) is forwarded unchanged.
       */
      function createMdxElement(type, props) {
        const args = arguments;
        const mdxType = props && props.mdxType;

        if (typeof type === "string" || mdxType) {
          const argCount = args.length;
          const forwarded = new Array(argCount);
          forwarded[0] = MDXCreateElement;

          const mdxProps = {};
          for (const key in props) {
            if (Object.prototype.hasOwnProperty.call(props, key)) {
              mdxProps[key] = props[key];
            }
          }
          mdxProps.originalType = type;
          mdxProps[MDX_TYPE_KEY] = typeof type === "string" ? type : mdxType;
          forwarded[1] = mdxProps;

          for (let index = 2; index < argCount; index++) {
            forwarded[index] = args[index];
          }
          return React.createElement.apply(null, forwarded);
        }
        return React.createElement.apply(null, args);
      }

      MDXCreateElement.displayName = "MDXCreateElement";
    },

    18969: (unusedModule, exportsObj, webpackRequire) => {
      webpackRequire.r(exportsObj);
      webpackRequire.d(exportsObj, {
        assets: () => assets,
        contentTitle: () => contentTitle,
        default: () => MDXContent,
        frontMatter: () => frontMatter,
        metadata: () => metadata,
        toc: () => toc,
      });

      // `.A` of 58168 is called as ({}, layoutProps, rest, extra) and `.A` of
      // 98587 as (props, keyList); presumably object-merge and
      // omit-properties helpers - confirm against the bundle.
      const mergeHelper = webpackRequire(58168);
      const omitHelper = webpackRequire(98587);
      webpackRequire(96540); // required for its side effects only; result unused
      const mdx = webpackRequire(15680);

      // Shorthand for the element factory. The export is looked up on every
      // call and invoked without a `this` binding.
      const h = (...args) => (0, mdx.yg)(...args);

      const OMITTED_PROPS = ["components"];

      const frontMatter = { id: "orc", title: "ORC Extension" };
      const contentTitle = undefined;
      const metadata = {
        unversionedId: "development/extensions-core/orc",
        id: "development/extensions-core/orc",
        title: "ORC Extension",
        description: "\x3c!--",
        source: "@site/docs/latest/development/extensions-core/orc.md",
        sourceDirName: "development/extensions-core",
        slug: "/development/extensions-core/orc",
        permalink: "/docs/latest/development/extensions-core/orc",
        draft: false,
        tags: [],
        version: "current",
        frontMatter: { id: "orc", title: "ORC Extension" },
      };
      const assets = {};
      const toc = [
        { value: "ORC extension", id: "orc-extension", level: 2 },
        {
          value: "Migration from &#39;contrib&#39; extension",
          id: "migration-from-contrib-extension",
          level: 3,
        },
      ];
      const layoutProps = { toc: toc };
      const LAYOUT_TYPE = "wrapper";

      /**
       * Page component for the "ORC Extension" document. `props.components` is
       * handed to the layout element; all remaining props are spread onto it.
       */
      function MDXContent(props) {
        const components = props.components;
        const rest = (0, omitHelper.A)(props, OMITTED_PROPS);

        return h(
          LAYOUT_TYPE,
          (0, mergeHelper.A)({}, layoutProps, rest, {
            components: components,
            mdxType: "MDXLayout",
          }),
          h("h2", { id: "orc-extension" }, "ORC extension"),
          h(
            "p",
            null,
            "This Apache Druid extension enables Druid to ingest and understand the Apache ORC data format."
          ),
          h(
            "p",
            null,
            "The extension provides the ",
            h(
              "a",
              { parentName: "p", href: "/docs/latest/ingestion/data-formats#orc" },
              "ORC input format"
            ),
            " and the ",
            h(
              "a",
              { parentName: "p", href: "/docs/latest/ingestion/data-formats#orc-hadoop-parser" },
              "ORC Hadoop parser"
            ),
            "\nfor ",
            h(
              "a",
              { parentName: "p", href: "/docs/latest/ingestion/native-batch" },
              "native batch ingestion"
            ),
            " and ",
            h(
              "a",
              { parentName: "p", href: "/docs/latest/ingestion/hadoop" },
              "Hadoop batch ingestion"
            ),
            ", respectively.\nPlease see corresponding docs for details."
          ),
          h(
            "p",
            null,
            "To use this extension, make sure to ",
            h(
              "a",
              {
                parentName: "p",
                href: "/docs/latest/configuration/extensions#loading-extensions",
              },
              "include"
            ),
            " ",
            h("inlineCode", { parentName: "p" }, "druid-orc-extensions"),
            " in the extensions load list."
          ),
          h(
            "h3",
            { id: "migration-from-contrib-extension" },
            "Migration from 'contrib' extension"
          ),
          h(
            "p",
            null,
            "This extension, first available in version 0.15.0, replaces the previous 'contrib' extension which was available until\n0.14.0-incubating. While this extension can index any data the 'contrib' extension could, the JSON spec for the\ningestion task is ",
            h("em", { parentName: "p" }, "incompatible"),
            ", and will need modified to work with the newer 'core' extension."
          ),
          h("p", null, "To migrate to 0.15.0+:"),
          h(
            "ul",
            null,
            h(
              "li",
              { parentName: "ul" },
              "In ",
              h("inlineCode", { parentName: "li" }, "inputSpec"),
              " of ",
              h("inlineCode", { parentName: "li" }, "ioConfig"),
              ", ",
              h("inlineCode", { parentName: "li" }, "inputFormat"),
              " must be changed from ",
              h(
                "inlineCode",
                { parentName: "li" },
                '"org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat"'
              ),
              " to\n",
              h(
                "inlineCode",
                { parentName: "li" },
                '"org.apache.orc.mapreduce.OrcInputFormat"'
              )
            ),
            h(
              "li",
              { parentName: "ul" },
              "The 'contrib' extension supported a ",
              h("inlineCode", { parentName: "li" }, "typeString"),
              " property, which provided the schema of the\nORC file, of which was essentially required to have the types correct, but notably ",
              h("em", { parentName: "li" }, "not"),
              " the column names, which\nfacilitated column renaming. In the 'core' extension, column renaming can be achieved with\n",
              h(
                "a",
                { parentName: "li", href: "/docs/latest/ingestion/ingestion-spec#flattenspec" },
                h("inlineCode", { parentName: "a" }, "flattenSpec")
              ),
              ". For example, ",
              h(
                "inlineCode",
                { parentName: "li" },
                '"typeString":"struct<time:string,name:string>"'
              ),
              "\nwith the actual schema ",
              h("inlineCode", { parentName: "li" }, "struct<_col0:string,_col1:string>"),
              ", to preserve Druid schema would need replaced with:"
            )
          ),
          h(
            "pre",
            null,
            h(
              "code",
              { parentName: "pre", className: "language-json" },
              '"flattenSpec": {\n "fields": [\n {\n "type": "path",\n "name": "time",\n "expr": "$._col0"\n },\n {\n "type": "path",\n "name": "name",\n "expr": "$._col1"\n }\n ]\n ...\n}\n'
            )
          ),
          h(
            "ul",
            null,
            h(
              "li",
              { parentName: "ul" },
              h(
                "p",
                { parentName: "li" },
                "The 'contrib' extension supported a ",
                h("inlineCode", { parentName: "p" }, "mapFieldNameFormat"),
                " property, which provided a way to specify a dimension to\nflatten ",
                h("inlineCode", { parentName: "p" }, "OrcMap"),
                " columns with primitive types. This functionality has also been replaced with\n",
                h(
                  "a",
                  { parentName: "p", href: "/docs/latest/ingestion/ingestion-spec#flattenspec" },
                  h("inlineCode", { parentName: "a" }, "flattenSpec")
                ),
                ". For example: ",
                h(
                  "inlineCode",
                  { parentName: "p" },
                  '"mapFieldNameFormat": "<PARENT>_<CHILD>"'
                ),
                "\nfor a dimension ",
                h("inlineCode", { parentName: "p" }, "nestedData_dim1"),
                ", to preserve Druid schema could be replaced with"
              ),
              h(
                "pre",
                { parentName: "li" },
                h(
                  "code",
                  { parentName: "pre", className: "language-json" },
                  '"flattenSpec": {\n "fields": [\n {\n "type": "path",\n "name": "nestedData_dim1",\n "expr": "$.nestedData.dim1"\n }\n ]\n ...\n}\n'
                )
              )
            )
          ),
          h("pre", null, h("code", { parentName: "pre" }, ""))
        );
      }

      MDXContent.isMDXComponent = true;
    },
  },
]);