<h1>Native batch input sources</h1>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<p>The input source defines where your index task reads data for Apache Druid native batch ingestion. Only the native parallel task (<code>index_parallel</code>) and the simple task (<code>index</code>) support the input source.</p>
<p>For general information on native batch indexing and parallel task indexing, see <a href="/docs/latest/ingestion/native-batch.html">Native batch ingestion</a>.</p>
<h2 id="s3-input-source">S3 input source</h2>
<blockquote>
<p>You need to include the <a href="/docs/latest/development/extensions-core/s3.html"><code>druid-s3-extensions</code></a> extension to use the S3 input source.</p>
</blockquote>
<p>The S3 input source reads objects directly from S3. You can specify any of:</p>
<ul>
<li>a list of S3 URI strings</li>
<li>a list of S3 location prefixes; Druid lists the contents of each prefix and ingests all objects found within those locations</li>
<li>a list of S3 objects, each given as a bucket and path</li>
</ul>
<p>The S3 input source is splittable. Therefore, you can use it with the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>. Each worker task of <code>index_parallel</code> reads one or multiple objects.</p>
<p>Sample specs:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "prefixes": ["s3://foo/bar/", "s3://bar/foo/"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "objects": [
      { "bucket": "foo", "path": "bar/file1.json" },
      { "bucket": "bar", "path": "foo/file2.json" }
    ]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"],
    "properties": {
      "accessKeyId": "KLJ78979SDFdS2",
      "secretAccessKey": "KLS89s98sKJHKJKJH8721lljkd"
    }
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"],
    "properties": {
      "accessKeyId": "KLJ78979SDFdS2",
      "secretAccessKey": "KLS89s98sKJHKJKJH8721lljkd",
      "assumeRoleArn": "arn:aws:iam::2981002874992:role/role-s3"
    }
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be <code>s3</code>.</td><td>None</td><td>yes</td></tr>
<tr><td>uris</td><td>JSON array of URIs where S3 objects to be ingested are located.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>prefixes</td><td>JSON array of URI prefixes for the locations of S3 objects to be ingested. Empty objects starting with one of the given prefixes will be skipped.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>objects</td><td>JSON array of S3 Objects to be ingested.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>properties</td><td>Properties Object for overriding the default S3 configuration. See below for more information.</td><td>None</td><td>No (defaults will be used if not given)</td></tr>
</tbody>
</table>
<p>Note that the S3 input source will skip all empty objects only when <code>prefixes</code> is specified.</p>
<p>S3 Object:</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>bucket</td><td>Name of the S3 bucket</td><td>None</td><td>yes</td></tr>
<tr><td>path</td><td>The path where data is located.</td><td>None</td><td>yes</td></tr>
</tbody>
</table>
<p>Properties Object:</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>accessKeyId</td><td>The <a href="/docs/latest/operations/password-provider.html">Password Provider</a> or plain text string of this S3 InputSource's access key</td><td>None</td><td>yes if secretAccessKey is given</td></tr>
<tr><td>secretAccessKey</td><td>The <a href="/docs/latest/operations/password-provider.html">Password Provider</a> or plain text string of this S3 InputSource's secret key</td><td>None</td><td>yes if accessKeyId is given</td></tr>
<tr><td>assumeRoleArn</td><td>AWS ARN of the role to assume. See <a href="https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_request.html">Requesting temporary security credentials</a>. <code>assumeRoleArn</code> can be used either with the ingestion spec AWS credentials or with the default S3 credentials.</td><td>None</td><td>no</td></tr>
<tr><td>assumeRoleExternalId</td><td>A unique identifier that might be required when you assume a role in another account. See <a href="https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_request.html">Requesting temporary security credentials</a>.</td><td>None</td><td>no</td></tr>
</tbody>
</table>
<blockquote>
<p><strong>Note:</strong> If <code>accessKeyId</code> and <code>secretAccessKey</code> are not given, the default <a href="/docs/latest/development/extensions-core/s3.html#s3-authentication-methods">S3 credentials provider chain</a> is used.</p>
</blockquote>
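<p>For example, a variant of the earlier credentials sample that assumes a role in another account might look like the following sketch. The key, ARN, and external ID values are illustrative placeholders, not working credentials:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "s3",
    "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"],
    "properties": {
      "accessKeyId": "KLJ78979SDFdS2",
      "secretAccessKey": "KLS89s98sKJHKJKJH8721lljkd",
      "assumeRoleArn": "arn:aws:iam::2981002874992:role/role-s3",
      "assumeRoleExternalId": "my-external-id"
    }
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>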
<h2 id="google-cloud-storage-input-source">Google Cloud Storage input source</h2>
<blockquote>
<p>You need to include the <a href="/docs/latest/development/extensions-core/google.html"><code>druid-google-extensions</code></a> extension to use the Google Cloud Storage input source.</p>
</blockquote>
<p>The Google Cloud Storage input source reads objects directly from Google Cloud Storage. You can specify objects as a list of Google Cloud Storage URI strings. The Google Cloud Storage input source is splittable and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>, where each worker task of <code>index_parallel</code> reads one or multiple objects.</p>
<p>Sample specs:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "google",
    "uris": ["gs://foo/bar/file.json", "gs://bar/foo/file2.json"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "google",
    "prefixes": ["gs://foo/bar/", "gs://bar/foo/"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "google",
    "objects": [
      { "bucket": "foo", "path": "bar/file1.json" },
      { "bucket": "bar", "path": "foo/file2.json" }
    ]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be <code>google</code>.</td><td>None</td><td>yes</td></tr>
<tr><td>uris</td><td>JSON array of URIs where Google Cloud Storage objects to be ingested are located.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>prefixes</td><td>JSON array of URI prefixes for the locations of Google Cloud Storage objects to be ingested. Empty objects starting with one of the given prefixes will be skipped.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>objects</td><td>JSON array of Google Cloud Storage objects to be ingested.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
</tbody>
</table>
<p>Note that the Google Cloud Storage input source will skip all empty objects only when <code>prefixes</code> is specified.</p>
<p>Google Cloud Storage object:</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>bucket</td><td>Name of the Google Cloud Storage bucket</td><td>None</td><td>yes</td></tr>
<tr><td>path</td><td>The path where data is located.</td><td>None</td><td>yes</td></tr>
</tbody>
</table>
<h2 id="azure-input-source">Azure input source</h2>
<blockquote>
<p>You need to include the <a href="/docs/latest/development/extensions-core/azure.html"><code>druid-azure-extensions</code></a> extension to use the Azure input source.</p>
</blockquote>
<p>The Azure input source reads objects directly from Azure Blob Storage or Azure Data Lake sources. You can specify objects as a list of file URI strings or prefixes. You can split the Azure input source for use with <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a> indexing, where each worker task reads one chunk of the split data.</p>
<p>Sample specs:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "azure",
    "uris": ["azure://container/prefix1/file.json", "azure://container/prefix2/file2.json"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "azure",
    "prefixes": ["azure://container/prefix1/", "azure://container/prefix2/"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "azure",
    "objects": [
      { "bucket": "container", "path": "prefix1/file1.json" },
      { "bucket": "container", "path": "prefix2/file2.json" }
    ]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be <code>azure</code>.</td><td>None</td><td>yes</td></tr>
<tr><td>uris</td><td>JSON array of URIs where the Azure objects to be ingested are located, in the form &quot;azure://&lt;container&gt;/&lt;path-to-file&gt;&quot;</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>prefixes</td><td>JSON array of URI prefixes for the locations of Azure objects to ingest, in the form &quot;azure://&lt;container&gt;/&lt;prefix&gt;&quot;. Empty objects starting with one of the given prefixes are skipped.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
<tr><td>objects</td><td>JSON array of Azure objects to ingest.</td><td>None</td><td><code>uris</code> or <code>prefixes</code> or <code>objects</code> must be set</td></tr>
</tbody>
</table>
<p>Note that the Azure input source skips all empty objects only when <code>prefixes</code> is specified.</p>
<p>The <code>objects</code> property is:</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>bucket</td><td>Name of the Azure Blob Storage or Azure Data Lake container</td><td>None</td><td>yes</td></tr>
<tr><td>path</td><td>The path where data is located.</td><td>None</td><td>yes</td></tr>
</tbody>
</table>
<h2 id="hdfs-input-source">HDFS input source</h2>
<blockquote>
<p>You need to include the <a href="/docs/latest/development/extensions-core/hdfs.html"><code>druid-hdfs-storage</code></a> extension to use the HDFS input source.</p>
</blockquote>
<p>The HDFS input source reads files directly from HDFS storage. File paths can be specified as an HDFS URI string or a list of HDFS URI strings. The HDFS input source is splittable and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>, where each worker task of <code>index_parallel</code> reads one or multiple files.</p>
<p>Sample specs:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "hdfs",
    "paths": "hdfs://namenode_host/foo/bar/, hdfs://namenode_host/bar/foo"
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "hdfs",
    "paths": ["hdfs://namenode_host/foo/bar/", "hdfs://namenode_host/bar/foo"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "hdfs",
    "paths": "hdfs://namenode_host/foo/bar/file.json, hdfs://namenode_host/bar/foo/file2.json"
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "hdfs",
    "paths": ["hdfs://namenode_host/foo/bar/file.json", "hdfs://namenode_host/bar/foo/file2.json"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be <code>hdfs</code>.</td><td>None</td><td>yes</td></tr>
<tr><td>paths</td><td>HDFS paths. Can be either a JSON array or comma-separated string of paths. Wildcards like <code>*</code> are supported in these paths (see the sample after this table). Empty files located under one of the given paths will be skipped.</td><td>None</td><td>yes</td></tr>
</tbody>
</table>
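<p>Because <code>paths</code> accepts wildcards, a single path expression can match many files. A minimal sketch, assuming a hypothetical directory layout:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "hdfs",
    "paths": "hdfs://namenode_host/foo/*/bar/*.json"
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>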
<p>You can also ingest from other storage using the HDFS input source if the HDFS client supports that storage.
However, if you want to ingest from cloud storage, consider using the service-specific input source for your data storage.
If you want to use a non-hdfs protocol with the HDFS input source, include the protocol
in <code>druid.ingestion.hdfs.allowedProtocols</code>. See <a href="/docs/latest/configuration/index.html#hdfs-input-source">HDFS input source security configuration</a> for more details.</p>
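<p>For example, to let the HDFS input source read <code>s3a</code> paths through the HDFS client, you would extend the allowed protocol list in your common runtime properties. A sketch, assuming the appropriate Hadoop connector is already on the classpath:</p>
<pre><code class="hljs"># common.runtime.properties: allow both hdfs:// and s3a:// paths for the HDFS input source
druid.ingestion.hdfs.allowedProtocols=["hdfs", "s3a"]
</code></pre>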
<h2 id="http-input-source">HTTP input source</h2>
<p>The HTTP input source reads files directly from remote sites via HTTP.</p>
<blockquote>
<p><strong>NOTE:</strong> Ingestion tasks run under the operating system account that runs the Druid processes, for example the Indexer, Middle Manager, and Peon. This means any user who can submit an ingestion task can specify an <code>HTTPInputSource</code> at any location where the Druid process has permissions. For example, using <code>HTTPInputSource</code>, a console user can access internal network locations where they would otherwise be denied access.</p>
</blockquote>
<blockquote>
<p><strong>WARNING:</strong> <code>HTTPInputSource</code> is not limited to the HTTP or HTTPS protocols. It uses the Java <code>URI</code> class that supports HTTP, HTTPS, FTP, file, and jar protocols by default. This means you should never run Druid under the <code>root</code> account, because a user can use the file protocol to access any files on the local disk.</p>
</blockquote>
<p>For more information about security best practices, see <a href="/docs/latest/operations/security-overview.html#best-practices">Security overview</a>.</p>
<p>The HTTP input source is <em>splittable</em> and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>, where each worker task of <code>index_parallel</code> reads only one file. This input source does not support the split hint spec.</p>
<p>Sample specs:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "http",
    "uris": ["http://example.com/uri1", "http://example2.com/uri2"]
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<p>Example with authentication fields using the DefaultPassword provider (this requires the password to be in the ingestion spec):</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "http",
    "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
    "httpAuthenticationUsername": "username",
    "httpAuthenticationPassword": "password123"
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<p>You can also use the other existing Druid PasswordProviders. Here is an example using the EnvironmentVariablePasswordProvider:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "http",
    "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
    "httpAuthenticationUsername": "username",
    "httpAuthenticationPassword": {
      "type": "environment",
      "variable": "HTTP_INPUT_SOURCE_PW"
    }
  },
  "inputFormat": {
    "type": "json"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>default</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be <code>http</code></td><td>None</td><td>yes</td></tr>
<tr><td>uris</td><td>URIs of the input files. See below for the protocols allowed for URIs.</td><td>None</td><td>yes</td></tr>
<tr><td>httpAuthenticationUsername</td><td>Username to use for authentication with specified URIs. Can be optionally used if the URIs specified in the spec require a Basic Authentication Header.</td><td>None</td><td>no</td></tr>
<tr><td>httpAuthenticationPassword</td><td>PasswordProvider to use with specified URIs. Can be optionally used if the URIs specified in the spec require a Basic Authentication Header.</td><td>None</td><td>no</td></tr>
</tbody>
</table>
<p>The HTTP input source can only use protocols listed in the <code>druid.ingestion.http.allowedProtocols</code> property. The <code>http</code> and <code>https</code> protocols are allowed by default. See <a href="/docs/latest/configuration/index.html#http-input-source">HTTP input source security configuration</a> for more details.</p>
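<p>For example, to additionally allow FTP URIs, subject to the warning above, you could override the property in your common runtime properties. A sketch:</p>
<pre><code class="hljs"># common.runtime.properties: http and https are allowed by default; this adds ftp
druid.ingestion.http.allowedProtocols=["http", "https", "ftp"]
</code></pre>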
<h2 id="inline-input-source">Inline input source</h2>
<p>The Inline input source reads data inlined directly in the ingestion spec. It is useful for demos or for quickly testing out parsing and schema.</p>
<p>Sample spec:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
  "type": "index_parallel",
  "inputSource": {
    "type": "inline",
    "data": "0,values,formatted\n1,as,CSV"
  },
  "inputFormat": {
    "type": "csv"
  },
  ...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be &quot;inline&quot;.</td><td>yes</td></tr>
<tr><td>data</td><td>Inlined data to ingest.</td><td>yes</td></tr>
</tbody>
</table>
<h2><a class="anchor" aria-hidden="true" id="local-input-source"></a><a href="#local-input-source" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Local Input Source</h2>
<p>The Local input source reads files directly from local storage
and is mainly intended for proof-of-concept testing.
The Local input source is <em>splittable</em> and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>,
where each worker task of <code>index_parallel</code> will read one or multiple files.</p>
<p>Sample spec:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "local",
"filter" : "*.csv",
"baseDir": "/data/directory",
"files": ["/bar/foo", "/foo/bar"]
},
"inputFormat": {
"type": "csv"
},
...
},
...
</code></pre>
<table>
<thead>
<tr><th>property</th><th>description</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be &quot;local&quot;.</td><td>yes</td></tr>
<tr><td>filter</td><td>A wildcard filter for files. See <a href="http://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter">here</a> for more information.</td><td>yes if <code>baseDir</code> is specified</td></tr>
<tr><td>baseDir</td><td>Directory to search recursively for files to be ingested. Empty files under the <code>baseDir</code> will be skipped.</td><td>At least one of <code>baseDir</code> or <code>files</code> should be specified</td></tr>
<tr><td>files</td><td>File paths to ingest. Files listed here that are also located under the specified <code>baseDir</code> are read only once, so duplicates are not ingested. Empty files will be skipped.</td><td>At least one of <code>baseDir</code> or <code>files</code> should be specified</td></tr>
</tbody>
</table>
<h2><a class="anchor" aria-hidden="true" id="druid-input-source"></a><a href="#druid-input-source" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Druid Input Source</h2>
<p>The Druid input source reads data directly from existing Druid segments,
potentially using a new schema and changing the name, dimensions, metrics, rollup, etc. of the segment.
The Druid input source is <em>splittable</em> and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>.
This input source has a fixed input format for reading from Druid segments;
no <code>inputFormat</code> field needs to be specified in the ingestion spec when using this input source.</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be &quot;druid&quot;.</td><td>yes</td></tr>
<tr><td>dataSource</td><td>A String defining the Druid datasource to fetch rows from.</td><td>yes</td></tr>
<tr><td>interval</td><td>A String representing an ISO-8601 interval, which defines the time range to fetch the data over.</td><td>yes</td></tr>
<tr><td>filter</td><td>See <a href="/docs/latest/querying/filters.html">Filters</a>. Only rows that match the filter, if specified, will be returned.</td><td>no</td></tr>
</tbody>
</table>
<p>The Druid input source can be used for a variety of purposes, including:</p>
<ul>
<li>Creating new datasources that are rolled-up copies of existing datasources.</li>
<li>Changing the <a href="/docs/latest/ingestion/partitioning.html">partitioning or sorting</a> of a datasource to improve performance.</li>
<li>Updating or removing rows using a <a href="/docs/latest/ingestion/ingestion-spec.html#transformspec"><code>transformSpec</code></a>, as sketched after this list.</li>
</ul>
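<p>As a sketch of the last point, assuming a hypothetical requirement to drop rows whose <code>countryName</code> is &quot;France&quot; while reindexing, a <code>transformSpec</code> inside the <code>dataSchema</code> might look like:</p>
<pre><code class="hljs css language-json">"transformSpec": {
  "filter": {
    "type": "not",
    "field": { "type": "selector", "dimension": "countryName", "value": "France" }
  }
}
</code></pre>
<p>The <code>transformSpec</code> filter keeps only the rows it matches, so wrapping a <code>selector</code> filter in <code>not</code> removes the matching rows from the reindexed output.</p>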
<p>When using the Druid input source, the timestamp column shows up as a numeric field named <code>__time</code> set to the number
of milliseconds since the epoch (January 1, 1970 00:00:00 UTC). It is common to use this in the timestampSpec, if you
want the output timestamp to be equivalent to the input timestamp. In this case, set the timestamp column to <code>__time</code>
and the format to <code>auto</code> or <code>millis</code>.</p>
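<p>For example, the following <code>timestampSpec</code> carries the input timestamp through unchanged:</p>
<pre><code class="hljs css language-json">"timestampSpec": {
  "column": "__time",
  "format": "millis"
}
</code></pre>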
<p>It is OK for the input and output datasources to be the same. In this case, newly generated data will overwrite the
previous data for the intervals specified in the <code>granularitySpec</code>. Generally, if you are going to do this, it is a
good idea to test out your reindexing by writing to a separate datasource before overwriting your main one.
Alternatively, if your goals can be satisfied by <a href="/docs/latest/ingestion/compaction.html">compaction</a>, consider that instead as a simpler
approach.</p>
<p>An example task spec is shown below. It reads from a hypothetical raw datasource <code>wikipedia_raw</code> and creates a new
rolled-up datasource <code>wikipedia_rollup</code> by grouping on hour, &quot;countryName&quot;, and &quot;page&quot;.</p>
<pre><code class="hljs css language-json">{
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"index_parallel"</span>,
<span class="hljs-attr">"spec"</span>: {
<span class="hljs-attr">"dataSchema"</span>: {
<span class="hljs-attr">"dataSource"</span>: <span class="hljs-string">"wikipedia_rollup"</span>,
<span class="hljs-attr">"timestampSpec"</span>: {
<span class="hljs-attr">"column"</span>: <span class="hljs-string">"__time"</span>,
<span class="hljs-attr">"format"</span>: <span class="hljs-string">"millis"</span>
},
<span class="hljs-attr">"dimensionsSpec"</span>: {
<span class="hljs-attr">"dimensions"</span>: [
<span class="hljs-string">"countryName"</span>,
<span class="hljs-string">"page"</span>
]
},
<span class="hljs-attr">"metricsSpec"</span>: [
{
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"count"</span>,
<span class="hljs-attr">"name"</span>: <span class="hljs-string">"cnt"</span>
}
],
<span class="hljs-attr">"granularitySpec"</span>: {
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"uniform"</span>,
<span class="hljs-attr">"queryGranularity"</span>: <span class="hljs-string">"HOUR"</span>,
<span class="hljs-attr">"segmentGranularity"</span>: <span class="hljs-string">"DAY"</span>,
<span class="hljs-attr">"intervals"</span>: [<span class="hljs-string">"2016-06-27/P1D"</span>],
<span class="hljs-attr">"rollup"</span>: <span class="hljs-literal">true</span>
}
},
<span class="hljs-attr">"ioConfig"</span>: {
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"index_parallel"</span>,
<span class="hljs-attr">"inputSource"</span>: {
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"druid"</span>,
<span class="hljs-attr">"dataSource"</span>: <span class="hljs-string">"wikipedia_raw"</span>,
<span class="hljs-attr">"interval"</span>: <span class="hljs-string">"2016-06-27/P1D"</span>
}
},
<span class="hljs-attr">"tuningConfig"</span>: {
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"index_parallel"</span>,
<span class="hljs-attr">"partitionsSpec"</span>: {
<span class="hljs-attr">"type"</span>: <span class="hljs-string">"hashed"</span>
},
<span class="hljs-attr">"forceGuaranteedRollup"</span>: <span class="hljs-literal">true</span>,
<span class="hljs-attr">"maxNumConcurrentSubTasks"</span>: <span class="hljs-number">1</span>
}
}
}
</code></pre>
<blockquote>
<p>Note: Older versions (0.19 and earlier) did not respect the timestampSpec when using the Druid input source. If you
have ingestion specs that rely on this and cannot rewrite them, set
<a href="/docs/latest/configuration/index.html#indexer-general-configuration"><code>druid.indexer.task.ignoreTimestampSpecForDruidInputSource</code></a>
to <code>true</code> to enable a compatibility mode where the timestampSpec is ignored.</p>
</blockquote>
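<p>For example, the compatibility mode can be enabled with a single entry in the runtime properties of the services that run ingestion tasks:</p>
<pre><code class="hljs">druid.indexer.task.ignoreTimestampSpecForDruidInputSource=true
</code></pre>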
<h2><a class="anchor" aria-hidden="true" id="sql-input-source"></a><a href="#sql-input-source" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>SQL Input Source</h2>
<p>The SQL input source reads data directly from an RDBMS.
The SQL input source is <em>splittable</em> and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>, where each worker task reads data from one SQL query in the list of queries.
This input source does not support Split Hint Spec.
Since this input source has a fixed input format for reading events, no <code>inputFormat</code> field needs to be specified in the ingestion spec when using this input source.
Please refer to the Recommended practices section below before using this input source.</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be &quot;sql&quot;.</td><td>Yes</td></tr>
<tr><td>database</td><td>Specifies the database connection details. The database type corresponds to the extension that supplies the <code>connectorConfig</code> support. The specified extension must be loaded into Druid:<br/><br/><ul><li><a href="/docs/latest/development/extensions-core/mysql.html">mysql-metadata-storage</a> for <code>mysql</code></li><li><a href="/docs/latest/development/extensions-core/postgresql.html">postgresql-metadata-storage</a> for <code>postgresql</code></li></ul><br/><br/>You can selectively allow JDBC properties in <code>connectURI</code>. See <a href="/docs/latest/configuration/index.html#jdbc-connections-to-external-databases">JDBC connections security config</a> for more details.</td><td>Yes</td></tr>
<tr><td>foldCase</td><td>Toggle case folding of database column names. This may be enabled in cases where the database returns case insensitive column names in query results.</td><td>No</td></tr>
<tr><td>sqls</td><td>List of SQL queries; each query retrieves a portion of the data to be indexed.</td><td>Yes</td></tr>
</tbody>
</table>
<p>An example SqlInputSource spec is shown below:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "sql",
"database": {
"type": "mysql",
"connectorConfig": {
"connectURI": "jdbc:mysql://host:port/schema",
"user": "user",
"password": "password"
}
},
"sqls": ["SELECT * FROM table1 WHERE timestamp BETWEEN '2013-01-01 00:00:00' AND '2013-01-01 11:59:59'", "SELECT * FROM table2 WHERE timestamp BETWEEN '2013-01-01 00:00:00' AND '2013-01-01 11:59:59'"]
}
},
...
</code></pre>
<p>The spec above reads all events from two separate SQL queries for the interval <code>2013-01-01/2013-01-02</code>.
Each SQL query runs in its own sub-task, so the example above produces two sub-tasks.</p>
<p><strong>Recommended practices</strong></p>
<p>Compared to the other native batch InputSources, SQL InputSource reads input data differently, so consider the following points before using it in a production environment:</p>
<ul>
<li><p>During indexing, each sub-task executes one of the SQL queries and stores the results locally on disk. The sub-tasks then read the data from these local input files and generate segments. There is currently no restriction on the size of the generated files, so the MiddleManagers or Indexers must have sufficient disk capacity for the volume of data being indexed.</p></li>
<li><p>Filtering the SQL queries based on the intervals specified in the <code>granularitySpec</code> avoids retrieving and storing unwanted data locally in the indexing sub-tasks. For example, if the <code>intervals</code> specified in the <code>granularitySpec</code> are <code>[&quot;2013-01-01/2013-01-02&quot;]</code> and the SQL query is <code>SELECT * FROM table1</code>, <code>SqlInputSource</code> will read all the data for <code>table1</code>, even though only data between the specified intervals will be indexed into Druid.</p></li>
<li><p>Pagination may be used on the SQL queries to ensure that each query pulls a similar amount of data, thereby improving the efficiency of the sub-tasks; see the sketch after this list.</p></li>
<li><p>Similar to file-based input sources, any updates to existing data will replace the data in segments specific to the intervals specified in the <code>granularitySpec</code>.</p></li>
</ul>
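<p>To illustrate the interval-filtering and pagination points above, the following sketch splits one day of a hypothetical <code>table1</code> into two similarly sized queries using MySQL-style <code>LIMIT</code>/<code>OFFSET</code> clauses. The page size of 500000 rows is an arbitrary assumption; tune it to your data, and note that the <code>ORDER BY</code> keeps the pages deterministic:</p>
<pre><code class="hljs css language-json">"sqls": [
  "SELECT * FROM table1 WHERE timestamp BETWEEN '2013-01-01 00:00:00' AND '2013-01-01 23:59:59' ORDER BY timestamp LIMIT 500000 OFFSET 0",
  "SELECT * FROM table1 WHERE timestamp BETWEEN '2013-01-01 00:00:00' AND '2013-01-01 23:59:59' ORDER BY timestamp LIMIT 500000 OFFSET 500000"
]
</code></pre>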
<h2><a class="anchor" aria-hidden="true" id="combining-input-sources"></a><a href="#combining-input-sources" aria-hidden="true" class="hash-link"><svg class="hash-link-icon" aria-hidden="true" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"></path></svg></a>Combining input sources</h2>
<p>The Combining input source is used to read data from multiple InputSources. Use this input source only if all the delegate input sources are
<em>splittable</em> and can be used by the <a href="/docs/latest/ingestion/native-batch.html">Parallel task</a>. This input source identifies the splits from its delegates, and each split is processed by a worker task. Like other input sources, the Combining input source supports a single <code>inputFormat</code>. Delegate input sources that require an <code>inputFormat</code> must therefore all use the same format for input data.</p>
<table>
<thead>
<tr><th>property</th><th>description</th><th>required?</th></tr>
</thead>
<tbody>
<tr><td>type</td><td>This should be &quot;combining&quot;.</td><td>Yes</td></tr>
<tr><td>delegates</td><td>List of <em>splittable</em> InputSources to read data from.</td><td>Yes</td></tr>
</tbody>
</table>
<p>Sample spec:</p>
<pre><code class="hljs css language-json">...
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "combining",
"delegates" : [
{
"type": "local",
"filter" : "*.csv",
"baseDir": "/data/directory",
"files": ["/bar/foo", "/foo/bar"]
},
{
"type": "druid",
"dataSource": "wikipedia",
"interval": "2013-01-01/2013-01-02"
}
]
},
"inputFormat": {
"type": "csv"
},
...
},
...
</code></pre>
</span></div></article></div></div></div></body></html>